diff -Nru bl-dspsr-0+git20160405/Benchmark/filterbank_bench.csh bl-dspsr-0.0~git20180312.50ea209/Benchmark/filterbank_bench.csh
--- bl-dspsr-0+git20160405/Benchmark/filterbank_bench.csh	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Benchmark/filterbank_bench.csh	2018-03-12 23:02:35.000000000 +0000
@@ -19,7 +19,7 @@
 
     echo -n "Testing nchan=$nchan nfft=$nfft "
   
-    time filterbank_speed -c$nchan -n$nfft -cuda >> filterbank_bench.out
+    time ../Signal/General/filterbank_speed -c$nchan -n$nfft -cuda >> filterbank_bench.out
 
     nfft=`expr $nfft '*' 2`
 
diff -Nru bl-dspsr-0+git20160405/config/ax_hdf5.m4 bl-dspsr-0.0~git20180312.50ea209/config/ax_hdf5.m4
--- bl-dspsr-0+git20160405/config/ax_hdf5.m4	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/config/ax_hdf5.m4	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,183 @@
+dnl
+dnl NOTE: this file has been modified from its original form on 9/22/2015.
+dnl
+dnl ######################################################################
+dnl
+dnl File:	hdf5.m4
+dnl
+dnl Purpose:	Determine the locations of hdf5 includes and libraries
+dnl
+dnl Version: $Id: hdf5.m4,v 1.26 2003/09/15 20:36:26 cary Exp $
+dnl
+dnl Tech-X configure system
+dnl
+dnl Copyright Tech-X Corporation
+dnl
+dnl ######################################################################
+dnl
+
+dnl
+dnl NOTE: this file was retrieved from:
+dnl
+dnl   https://www.hdfgroup.org/ftp/HDF5/contrib/autoconf-macros/hdf5.m4
+dnl
+
+dnl
+dnl Copyright Notice and License Terms for 
+dnl HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+dnl -----------------------------------------------------------------------------
+dnl 
+dnl HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+dnl Copyright 2006-2015 by The HDF Group.
+dnl 
+dnl NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+dnl Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+dnl 
+dnl All rights reserved.
+dnl 
+dnl Redistribution and use in source and binary forms, with or without 
+dnl modification, are permitted for any purpose (including commercial purposes) 
+dnl provided that the following conditions are met:
+dnl 
+dnl 1. Redistributions of source code must retain the above copyright notice, 
+dnl    this list of conditions, and the following disclaimer.
+dnl 
+dnl 2. Redistributions in binary form must reproduce the above copyright notice, 
+dnl    this list of conditions, and the following disclaimer in the documentation 
+dnl    and/or materials provided with the distribution.
+dnl 
+dnl 3. In addition, redistributions of modified forms of the source or binary 
+dnl    code must carry prominent notices stating that the original code was 
+dnl    changed and the date of the change.
+dnl 
+dnl 4. All publications or advertising materials mentioning features or use of 
+dnl    this software are asked, but not required, to acknowledge that it was 
+dnl    developed by The HDF Group and by the National Center for Supercomputing 
+dnl    Applications at the University of Illinois at Urbana-Champaign and 
+dnl    credit the contributors.
+dnl 
+dnl 5. Neither the name of The HDF Group, the name of the University, nor the 
+dnl    name of any Contributor may be used to endorse or promote products derived 
+dnl    from this software without specific prior written permission from 
+dnl    The HDF Group, the University, or the Contributor, respectively.
+dnl
+dnl DISCLAIMER: 
+dnl THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS 
+dnl "AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.  In no 
+dnl event shall The HDF Group or the Contributors be liable for any damages 
+dnl suffered by the users arising out of the use of this software, even if 
+dnl advised of the possibility of such damage. 
+
+AC_DEFUN([AX_HDF5], [
+
+dnl ######################################################################
+dnl
+dnl Allow the user to specify an overall hdf5 directory.  If specified,
+dnl we look for include and lib under this.
+dnl
+dnl ######################################################################
+
+AC_ARG_WITH(hdf5,[  --with-hdf5=<location of hdf5 installation> ],HDF5_DIR="$withval",HDF5_DIR="")
+
+dnl ######################################################################
+dnl
+dnl Find hdf5 includes - looking in include location if present,
+dnl otherwise in dir/include if present, otherwise in default locations,
+dnl first parallel, then serial.
+dnl
+dnl ######################################################################
+
+AC_ARG_WITH(hdf5-incdir,[  --with-hdf5-incdir=<location of hdf5 includes> ],
+HDF5_INCDIR="$withval",HDF5_INCDIR="")
+if test "x$HDF5_DIR" != xno; then
+if test -n "$HDF5_INCDIR"; then
+  HDF5_INCPATH=$HDF5_INCDIR
+elif test -n "$HDF5_DIR"; then
+  HDF5_INCPATH=$HDF5_DIR/include
+elif test "$MPI" = yes; then
+  HDF5_INCPATH=$HOME/hdf5mpi/include:/usr/local/hdf5mpi/include:/loc/hdf5mpi/include:$HOME/hdf5/include:/usr/local/hdf5/include:/loc/hdf5/include:/usr/common/usg/hdf5/default/parallel/include:/usr/local/include
+else
+  HDF5_INCPATH=$HOME/hdf5/include:/usr/local/hdf5/include:/loc/hdf5/include:$HOME/hdf5mpi/include:/usr/local/hdf5mpi/include:/loc/hdf5mpi/include:/usr/common/usg/hdf5/default/serial/include
+fi
+saveCPPFLAGS=$CPPFLAGS
+CPPFLAGS="-I$HDF5_INCPATH $CPPFLAGS"
+AC_CHECK_HEADER(hdf5.h, [HDF5_H=y], [HDF5_H=""])
+CPPFLAGS=$saveCPPFLAGS
+if test -z "$HDF5_H"; then
+  AC_MSG_WARN(hdf5.h not found in $HDF5_INCPATH.  Set with --with-hdf5-incdir=)
+  HDF5_INC=" "
+  ac_cv_have_hdf5=no
+else
+  HDF5_INCDIR=$HDF5_INCPATH
+  AC_SUBST(HDF5_INCDIR)
+  HDF5_INC=-I$HDF5_INCDIR
+  HDF5_CPPFLAGS=$HDF5_INC
+  AC_SUBST(HDF5_INC)
+  AC_SUBST(HDF5_CPPFLAGS)
+  HDF5_DIR=`dirname $HDF5_INCDIR`
+  ac_cv_have_hdf5=yes
+fi
+fi
+dnl ######################################################################
+dnl See if built parallel: hdf5par receives the HAVE_PARALLEL line, if any,
+dnl from the HDF5 configuration header.  ac_cv_have_hdf5 is quoted because
+dnl it is never assigned when configure is run with --without-hdf5.
+dnl ######################################################################
+
+if test "x$ac_cv_have_hdf5" = xyes; then
+  if test -f "$HDF5_INCDIR/H5config.h"; then
+    hdf5par=`grep "HAVE_PARALLEL 1" "$HDF5_INCDIR/H5config.h"`
+  elif test -f "$HDF5_INCDIR/H5pubconf.h"; then
+    hdf5par=`grep "HAVE_PARALLEL 1" "$HDF5_INCDIR/H5pubconf.h"`
+  fi
+fi
+
+dnl ######################################################################
+dnl Find hdf5 libraries: use --with-hdf5-libdir when given, otherwise the
+dnl lib directory next to the include directory, and confirm that H5open
+dnl can be linked from libhdf5.  ac_cv_have_hdf5 may be unset here.
+dnl ######################################################################
+
+AC_ARG_WITH(hdf5-libdir,[  --with-hdf5-libdir=<location of hdf5 library> ],
+HDF5_LIBDIR="$withval",HDF5_LIBDIR="")
+if test "x$ac_cv_have_hdf5" = xyes; then
+  if test -n "$HDF5_LIBDIR"; then
+    HDF5_LIBPATH=$HDF5_LIBDIR
+  else
+    HDF5_LIBPATH=$HDF5_DIR/lib
+  fi
+  
+  saveLDFLAGS=$LDFLAGS
+  LDFLAGS="-L$HDF5_LIBPATH $LDFLAGS"
+  AC_CHECK_LIB([hdf5],[H5open],[LIBHDF5_A=y],[LIBHDF5_A=""])
+  LDFLAGS=$saveLDFLAGS
+  
+  if test -z "$LIBHDF5_A"; then
+    AC_MSG_WARN(libhdf5.a not found.  Set with --with-hdf5-libdir=)
+    ac_cv_have_hdf5=no
+    HDF5_LDFLAGS=" "
+    HDF5_LIBS=" "
+  else
+    HDF5_LIBDIR=$HDF5_LIBPATH
+    AC_SUBST(HDF5_LIBDIR)
+    HDF5_LDFLAGS="-L$HDF5_LIBDIR"
+    HDF5_LIBS="-lhdf5"
+    AC_SUBST(HDF5_LDFLAGS)
+    AC_SUBST(HDF5_LIBS)
+  fi
+fi
+
+dnl ######################################################################
+dnl Define for whether hdf5 found: HAVE_HDF5 is defined and the Automake
+dnl conditional enabled only when both header and library checks passed.
+dnl The quoted test copes with ac_cv_have_hdf5 being unset.
+dnl ######################################################################
+
+if test "x$ac_cv_have_hdf5" = xyes; then
+  AC_DEFINE(HAVE_HDF5, [1], [Define if we have libhdf5])
+  AM_CONDITIONAL(HAVE_HDF5, true)
+else
+  AM_CONDITIONAL(HAVE_HDF5, false)
+fi
+
+]) dnl End of DEFUN
diff -Nru bl-dspsr-0+git20160405/config/cuda.m4 bl-dspsr-0.0~git20180312.50ea209/config/cuda.m4
--- bl-dspsr-0+git20160405/config/cuda.m4	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/config/cuda.m4	2018-03-12 23:02:35.000000000 +0000
@@ -6,6 +6,9 @@
 
   SWIN_PACKAGE_OPTIONS([cuda])
 
+  AC_ARG_ENABLE([cufft_callbacks],
+     AC_HELP_STRING([--enable-cufft-callbacks],[Use CUFFT callbacks if CUDA enabled, EXPERIMENTAL]))
+
   CUDA_CFLAGS=""
   CUDA_LIBS=""
 
@@ -87,6 +90,35 @@
 
   fi
 
+  have_cufft_callbacks="no"
+
+  if test x"$enable_cufft_callbacks" = xyes; then
+
+    if test "$have_cufft" = "yes" ; then
+
+      AC_MSG_CHECKING([for CUDA FFT Callbacks])
+
+      SWIN_PACKAGE_FIND([cufft_callbacks],[cufftXt.h])
+      SWIN_PACKAGE_TRY_COMPILE([cufft_callbacks],[#include <cufft.h>
+                                                  #include <cufftXt.h> ],[],[$swin_cuda_include_dir])
+
+      SWIN_PACKAGE_FIND([cufft_callbacks],[libcufft_static.*])
+      SWIN_PACKAGE_TRY_LINK([cufft_callbacks],[#include <cufft.h>
+                                             #include <cufftXt.h> ],
+                          [cufftPlan1d (0, 1024, CUFFT_C2C, 1);],[-lcudart -lcufft])
+
+      AC_MSG_RESULT([$have_cufft_callbacks])
+
+      if test "$have_cufft_callbacks" = "yes"; then
+        AC_DEFINE([HAVE_CUFFT_CALLBACKS],[1],[Define if the CUFFT Callbacks library is present])
+        [$1]
+      else
+        AC_MSG_WARN([CUFFT Callbacks will not be compiled])
+        [$2]
+      fi
+    fi
+  fi
+
   AC_SUBST(CUDA_NVCC)
 
   CUDA_LIBS="$cuda_LIBS"
@@ -103,4 +135,10 @@
   AC_SUBST(CUFFT_CFLAGS)
   AM_CONDITIONAL(HAVE_CUFFT,[test "$have_cufft" = "yes"])
 
+  CUFFT_CALLBACKS_LIBS="${cufft_callbacks_LIBS}_static -lculibos"
+  CUFFT_CALLBACKS_CFLAGS="$cufft_callbacks_CFLAGS"
+
+  AC_SUBST(CUFFT_CALLBACKS_LIBS)
+  AC_SUBST(CUFFT_CALLBACKS_CFLAGS)
+  AM_CONDITIONAL(HAVE_CUFFT_CALLBACKS,[test "$have_cufft_callbacks" = "yes"])
 ])
diff -Nru bl-dspsr-0+git20160405/config/guppi_daq.m4 bl-dspsr-0.0~git20180312.50ea209/config/guppi_daq.m4
--- bl-dspsr-0+git20160405/config/guppi_daq.m4	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/config/guppi_daq.m4	2018-03-12 23:02:35.000000000 +0000
@@ -17,7 +17,7 @@
 
   if test x"$GUPPI_DIR" != x; then
     GUPPI_DAQ_CFLAGS="-I$GUPPI_DIR/src"
-    GUPPI_DAQ_LIBS="-L$GUPPI_DIR/src -lguppi_daq -lsla -lvdifio -lm"
+    GUPPI_DAQ_LIBS="-L$GUPPI_DIR/src -lsla -lm"
     have_guppi_daq="yes"
   fi
 
diff -Nru bl-dspsr-0+git20160405/config/mark5access.m4 bl-dspsr-0.0~git20180312.50ea209/config/mark5access.m4
--- bl-dspsr-0+git20160405/config/mark5access.m4	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/config/mark5access.m4	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,83 @@
+# SWIN_LIB_MARK5ACCESS([ACTION-IF-FOUND [,ACTION-IF-NOT-FOUND]])
+# ----------------------------------------------------------
+# Locate the mark5access library with pkg-config and confirm that a call to
+# new_mark5_stream links.  --with-mark5access-dir=DIR makes pkg-config search
+# DIR/lib/pkgconfig first; --without-mark5access-dir skips the check.
+# Defines HAVE_MARK5ACCESS when found, and always substitutes
+# MARK5ACCESS_CFLAGS and MARK5ACCESS_LIBS (left empty when not found).
+AC_DEFUN([SWIN_LIB_MARK5ACCESS],
+[
+  AC_PROVIDE([SWIN_LIB_MARK5ACCESS])
+
+  AC_ARG_WITH([mark5access-dir],
+              AC_HELP_STRING([--with-mark5access-dir=DIR],
+                             [MARK5ACCESS is installed in DIR]))
+
+  MARK5ACCESS_CFLAGS=""
+  MARK5ACCESS_LIBS=""
+
+  # announced before the branch so the result message below is always paired
+  AC_MSG_CHECKING([for mark5access installation])
+
+  if test x"$with_mark5access_dir" = xno; then
+    # user disabled mark5access. Leave cache alone.
+    have_mark5access="User disabled mark5access."
+  else
+
+    # "yes" is not a specification
+    if test x"$with_mark5access_dir" = xyes; then
+      with_mark5access_dir=
+    fi
+
+    have_mark5access="not found"
+
+    # search DIR/lib/pkgconfig first when --with-mark5access-dir=DIR was given
+    mark5access_pc_path="$PKG_CONFIG_PATH"
+    if test x"$with_mark5access_dir" != x; then
+      mark5access_pc_path="$with_mark5access_dir/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
+    fi
+
+    # query pkg-config once and reuse the answers; its stderr would garble the check line
+    mark5access_try_cflags=`PKG_CONFIG_PATH="$mark5access_pc_path" pkg-config --cflags mark5access 2>/dev/null`
+    mark5access_try_libs=`PKG_CONFIG_PATH="$mark5access_pc_path" pkg-config --libs mark5access 2>/dev/null`
+
+    ac_save_CPPFLAGS="$CPPFLAGS"
+    ac_save_LIBS="$LIBS"
+
+    CPPFLAGS="$mark5access_try_cflags $CPPFLAGS"
+    LIBS="$mark5access_try_libs $LIBS"
+
+    AC_TRY_LINK([#include <mark5access.h>], [new_mark5_stream(0,0);],
+                have_mark5access=yes, have_mark5access=no)
+
+    if test "$have_mark5access" = yes; then
+      MARK5ACCESS_CFLAGS="$mark5access_try_cflags"
+      MARK5ACCESS_LIBS="$mark5access_try_libs"
+    fi
+
+    LIBS="$ac_save_LIBS"
+    CPPFLAGS="$ac_save_CPPFLAGS"
+
+  fi
+
+  AC_MSG_RESULT([$have_mark5access])
+
+  if test "$have_mark5access" = "yes"; then
+    AC_DEFINE([HAVE_MARK5ACCESS], [1], [Define if the mark5access library is present])
+    [$1]
+  else
+    if test x"$with_mark5access_dir" != xno; then
+      AC_MSG_NOTICE([Ensure that the PKG_CONFIG_PATH environment variable points to])
+      AC_MSG_NOTICE([the lib/pkgconfig sub-directory of the root directory where])
+      AC_MSG_NOTICE([the mark5access library was installed.])
+      AC_MSG_NOTICE([Alternatively, use the --with-mark5access-dir option.])
+    fi
+    [$2]
+  fi
+
+  AC_SUBST(MARK5ACCESS_LIBS)
+  AC_SUBST(MARK5ACCESS_CFLAGS)
+  AM_CONDITIONAL(HAVE_MARK5ACCESS,[test "$have_mark5access" = "yes"])
+
+])
+
diff -Nru bl-dspsr-0+git20160405/configure.ac bl-dspsr-0.0~git20180312.50ea209/configure.ac
--- bl-dspsr-0+git20160405/configure.ac	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/configure.ac	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,223 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.57)
+AC_INIT([DSPSR],[2016-06+],[dspsr-developers@lists.sourceforge.net])
+
+AC_CONFIG_AUX_DIR([config])
+AC_CONFIG_SRCDIR([Kernel/Makefile.am])
+
+AC_ARG_VAR([PSRHOME],    [Standard pulsar home directory])
+AC_ARG_VAR([LOGIN_ARCH], [Architecture-dependent sub-directory of PSRHOME])
+AC_ARG_VAR([PACKAGES],   [Root directories of third-party packages])
+AC_ARG_VAR([CUDA_NVCC_FLAGS], [CUDA nvcc flags (e.g. -arch, etc)])
+
+AC_PREFIX_DEFAULT([${PSRHOME:-"/usr/local"}${PSRHOME:+"/$LOGIN_ARCH"}])
+
+if test x"$PSRHOME" = x; then
+  AC_MSG_WARN([The PSRHOME environment variable is not set])
+else
+  if test x"$LOGIN_ARCH" = x; then
+    AC_MSG_WARN([The LOGIN_ARCH environment variable is not set])
+  fi
+fi
+
+#
+#
+#
+
+# Check if the user has set compiler options
+SWIN_OPTIONS_SET
+
+# Enable convenience compiler selection
+SWIN_COMPILER
+
+AM_INIT_AUTOMAKE([foreign subdir-objects])
+
+#
+# Check for selected formats in backends.list
+#
+
+DSPSR_FORMATS
+
+# Create static libraries by default
+AC_DISABLE_SHARED
+
+# Enable large-file support
+AC_SYS_LARGEFILE
+
+# Determine the machine endian
+AC_C_BIGENDIAN
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CXXCPP
+AC_PROG_CC
+AC_PROG_CPP
+AM_PROG_CC_C_O
+
+AC_PROG_F77
+
+AC_PROG_INSTALL
+AC_PROG_LIBTOOL
+
+#
+# Disable the debugging information option, -g
+#
+SWIN_DEBUG
+SWIN_LOCAL
+
+#
+# Checks for essential libraries
+#
+AC_CHECK_LIB([m], [sin])
+SWIN_LIB_PSRCHIVE([],AC_MSG_ERROR([PSRCHIVE Library not found.
+
+Please see http://psrchive.sourceforge.net
+
+]))
+
+PSRCHIVE_ACLOCAL=`$psrchive_config --aclocal`
+AC_SUBST(PSRCHIVE_ACLOCAL)
+
+PSRCHIVE_INCLUDE=`$psrchive_config --cflags | sed s/-pthread//`
+AC_SUBST(PSRCHIVE_INCLUDE)
+
+#
+# Checks for optional libraries
+#
+SWIN_LIB_PSRDADA
+SWIN_LIB_CFITSIO
+SWIN_LIB_MPI
+SWIN_LIB_CUDA
+SWIN_LIB_GUPPI_DAQ
+SWIN_LIB_MARK5ACCESS
+
+AX_OPENMP
+AC_SUBST(OPENMP_CFLAGS)
+
+#
+# Checks for graphics libraries
+#
+SWIN_LIB_PGPLOT
+
+#
+# For developers: preserve file modification times
+#
+INSTALL_DATA="${INSTALL} -m 644 -p"
+install_sh="CPPROG='cp -p' ${install_sh}"
+
+#
+# Checks for header files.
+#
+AC_CHECK_HEADERS([malloc.h])
+
+# Check for openssl
+MJK_LIB_CRYPTO
+
+# Check for psrxml io library
+MJK_LIB_PSRXML
+
+# find HDF5 (needed for LOFAR)
+AX_HDF5
+
+#
+# Checks for library functions.
+#
+SWIN_FUNC_GETOPT_LONG
+SWIN_FUNC_AFFINITY
+
+#
+# Generate python module if --enable-shared is used.  The test uses the
+# POSIX "=" operator; "==" is a bash extension that dash rejects.
+if test x"$enable_shared" = xyes; then
+  AM_PATH_PYTHON(,, [:])
+  AC_PROG_SWIG
+  SWIG_ENABLE_CXX
+  SWIG_PYTHON
+else
+  PYTHON=":"
+fi
+
+AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
+
+#
+# Initialize variables used by Makefile.include 
+#
+INCLUDE_CPPFLAGS="$PSRCHIVE_CPPFLAGS"
+AC_SUBST(INCLUDE_CPPFLAGS)
+
+INCLUDE_LDFLAGS="$PSRCHIVE_LIBS"
+AC_SUBST(INCLUDE_LDFLAGS)
+
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([Makefile
+		config/Makefile
+		Kernel/Makefile
+		Kernel/Classes/Makefile
+		Kernel/Applications/Makefile
+		Kernel/Formats/Makefile
+		Kernel/Formats/apsr/Makefile
+		Kernel/Formats/asp/Makefile
+		Kernel/Formats/bcpm/Makefile
+		Kernel/Formats/bpsr/Makefile
+		Kernel/Formats/caspsr/Makefile
+		Kernel/Formats/cpsr/Makefile
+		Kernel/Formats/cpsr2/Makefile
+		Kernel/Formats/dada/Makefile
+		Kernel/Formats/dummy/Makefile
+		Kernel/Formats/emerlin/Makefile
+		Kernel/Formats/fadc/Makefile
+        	Kernel/Formats/fits/Makefile
+		Kernel/Formats/gmrt/Makefile
+		Kernel/Formats/guppi/Makefile
+		Kernel/Formats/kat/Makefile
+		Kernel/Formats/lbadr/Makefile
+		Kernel/Formats/lbadr64/Makefile
+		Kernel/Formats/lofar_dal/Makefile
+		Kernel/Formats/lump/Makefile
+		Kernel/Formats/lwa/Makefile
+		Kernel/Formats/spda1k/Makefile
+		Kernel/Formats/mark4/Makefile
+		Kernel/Formats/mark5/Makefile
+		Kernel/Formats/mark5b/Makefile
+		Kernel/Formats/maxim/Makefile
+		Kernel/Formats/mopsr/Makefile
+		Kernel/Formats/mwa/Makefile
+		Kernel/Formats/pmdaq/Makefile
+		Kernel/Formats/pdev/Makefile
+		Kernel/Formats/puma/Makefile
+		Kernel/Formats/puma2/Makefile
+		Kernel/Formats/s2/Makefile
+		Kernel/Formats/sigproc/Makefile
+		Kernel/Formats/ska1/Makefile
+		Kernel/Formats/spigot/Makefile
+		Kernel/Formats/vdif/Makefile
+		Kernel/Formats/wapp/Makefile
+		Signal/Makefile
+		Signal/Statistics/Makefile
+		Signal/General/Makefile
+		Signal/Pulsar/Makefile
+		Management/Makefile
+		Management/dspsr_ldflags
+		Management/dspsr_cflags
+		Management/release.csh
+    More/Makefile
+    More/Plotting/Makefile
+    More/Applications/Makefile
+		python/Makefile])
+
+AC_OUTPUT
+
+echo
+
+if test x"$selected_formats" = x; then
+  echo "WARNING: no file formats have been selected"
+  echo "Please see http://dspsr.sourceforge.net/formats for details"
+else
+  echo "DSPSR will support these formats: $selected_formats" 
+fi
+
+echo
+echo "DSPSR is now ready to be compiled.  Please run 'make'"
+echo
diff -Nru bl-dspsr-0+git20160405/configure.in bl-dspsr-0.0~git20180312.50ea209/configure.in
--- bl-dspsr-0+git20160405/configure.in	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/configure.in	1970-01-01 00:00:00.000000000 +0000
@@ -1,213 +0,0 @@
-#                                               -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-
-AC_PREREQ(2.57)
-AC_INIT([DSPSR],[2.0],[dspsr-developers@lists.sourceforge.net])
-
-AC_CONFIG_AUX_DIR([config])
-AC_CONFIG_SRCDIR([Kernel/Makefile.am])
-
-AC_ARG_VAR([PSRHOME],    [Standard pulsar home directory])
-AC_ARG_VAR([LOGIN_ARCH], [Architecture-dependent sub-directory of PSRHOME])
-AC_ARG_VAR([PACKAGES],   [Root directories of third-party packages])
-AC_ARG_VAR([CUDA_NVCC_FLAGS], [CUDA nvcc flags (e.g. -arch, etc)])
-
-AC_PREFIX_DEFAULT([${PSRHOME:-"/usr/local"}${PSRHOME:+"/$LOGIN_ARCH"}])
-
-if test x"$PSRHOME" = x; then
-  AC_MSG_WARN([The PSRHOME environment variable is not set])
-else
-  if test x"$LOGIN_ARCH" = x; then
-    AC_MSG_WARN([The LOGIN_ARCH environment variable is not set])
-  fi
-fi
-
-#
-#
-#
-
-# Check if the user has set compiler options
-SWIN_OPTIONS_SET
-
-# Enable convenience compiler selection
-SWIN_COMPILER
-
-AM_INIT_AUTOMAKE([foreign subdir-objects])
-
-#
-# Check for selected formats in backends.list
-#
-
-DSPSR_FORMATS
-
-# Create static libraries by default
-AC_DISABLE_SHARED
-
-# Enable large-file support
-AC_SYS_LARGEFILE
-
-# Determine the machine endian
-AC_C_BIGENDIAN
-
-# Checks for programs.
-AC_PROG_CXX
-AC_PROG_CXXCPP
-AC_PROG_CC
-AC_PROG_CPP
-AM_PROG_CC_C_O
-
-AC_PROG_INSTALL
-AC_PROG_LIBTOOL
-
-#
-# Disable the debugging information option, -g
-#
-SWIN_DEBUG
-SWIN_LOCAL
-
-#
-# Checks for essential libraries
-#
-AC_CHECK_LIB([m], [sin])
-SWIN_LIB_PSRCHIVE([],AC_MSG_ERROR([PSRCHIVE Library not found.
-
-Please see http://psrchive.sourceforge.net
-
-]))
-
-PSRCHIVE_ACLOCAL=`$psrchive_config --aclocal`
-AC_SUBST(PSRCHIVE_ACLOCAL)
-
-PSRCHIVE_INCLUDE=`$psrchive_config --cflags | sed s/-pthread//`
-AC_SUBST(PSRCHIVE_INCLUDE)
-
-#
-# Checks for optional libraries
-#
-SWIN_LIB_PSRDADA
-SWIN_LIB_CFITSIO
-SWIN_LIB_MPI
-SWIN_LIB_CUDA
-SWIN_LIB_GUPPI_DAQ
-
-AX_OPENMP
-AC_SUBST(OPENMP_CFLAGS)
-
-#
-# Checks for graphics libraries
-#
-SWIN_LIB_PGPLOT
-
-#
-# For developers: preserve file modification times
-#
-INSTALL_DATA="${INSTALL} -m 644 -p"
-install_sh="CPPROG='cp -p' ${install_sh}"
-
-#
-# Checks for header files.
-#
-AC_CHECK_HEADERS([malloc.h])
-
-# Check for openssl
-MJK_LIB_CRYPTO
-
-# Check for psrxml io library
-MJK_LIB_PSRXML
-
-#
-# Checks for library functions.
-#
-SWIN_FUNC_GETOPT_LONG
-SWIN_FUNC_AFFINITY
-
-#
-# Generate python module if --enable-shared is used
-#
-if test x"$enable_shared" == xyes; then
-  AM_PATH_PYTHON(,, [:])
-  AC_PROG_SWIG
-  SWIG_ENABLE_CXX
-  SWIG_PYTHON
-else
-  PYTHON=":"
-fi
-
-AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
-
-#
-# Initialize variables used by Makefile.include 
-#
-INCLUDE_CPPFLAGS="$PSRCHIVE_CPPFLAGS"
-AC_SUBST(INCLUDE_CPPFLAGS)
-
-INCLUDE_LDFLAGS="$PSRCHIVE_LIBS"
-AC_SUBST(INCLUDE_LDFLAGS)
-
-AM_CONFIG_HEADER([config.h])
-AC_CONFIG_FILES([Makefile
-		config/Makefile
-		Kernel/Makefile
-		Kernel/Classes/Makefile
-		Kernel/Applications/Makefile
-		Kernel/Formats/Makefile
-		Kernel/Formats/apsr/Makefile
-		Kernel/Formats/asp/Makefile
-		Kernel/Formats/bcpm/Makefile
-		Kernel/Formats/bpsr/Makefile
-		Kernel/Formats/caspsr/Makefile
-		Kernel/Formats/cpsr/Makefile
-		Kernel/Formats/cpsr2/Makefile
-		Kernel/Formats/dada/Makefile
-		Kernel/Formats/dummy/Makefile
-		Kernel/Formats/fadc/Makefile
-        	Kernel/Formats/fits/Makefile
-		Kernel/Formats/gmrt/Makefile
-		Kernel/Formats/guppi/Makefile
-		Kernel/Formats/kat/Makefile
-		Kernel/Formats/lbadr/Makefile
-		Kernel/Formats/lbadr64/Makefile
-		Kernel/Formats/lofar_dal/Makefile
-		Kernel/Formats/lump/Makefile
-		Kernel/Formats/lwa/Makefile
-		Kernel/Formats/spda1k/Makefile
-		Kernel/Formats/mark4/Makefile
-		Kernel/Formats/mark5/Makefile
-		Kernel/Formats/maxim/Makefile
-		Kernel/Formats/mwa/Makefile
-		Kernel/Formats/pmdaq/Makefile
-		Kernel/Formats/pdev/Makefile
-		Kernel/Formats/puma/Makefile
-		Kernel/Formats/puma2/Makefile
-		Kernel/Formats/s2/Makefile
-		Kernel/Formats/sigproc/Makefile
-		Kernel/Formats/spigot/Makefile
-		Kernel/Formats/vdif/Makefile
-		Kernel/Formats/wapp/Makefile
-		Signal/Makefile
-		Signal/Statistics/Makefile
-		Signal/General/Makefile
-		Signal/Pulsar/Makefile
-		Management/Makefile
-		Management/dspsr_ldflags
-		Management/dspsr_cflags
-		Management/release.csh
-    More/Makefile
-    More/Plotting/Makefile
-    More/Applications/Makefile
-		python/Makefile])
-
-AC_OUTPUT
-
-echo
-
-if test x"$selected_formats" = x; then
-  echo "WARNING: no file formats have been selected"
-  echo "Please see http://dspsr.sourceforge.net/formats for details"
-else
-  echo "DSPSR will support these formats: $selected_formats" 
-fi
-
-echo
-echo "DSPSR is now ready to be compiled.  Please run 'make'"
-echo
diff -Nru bl-dspsr-0+git20160405/debian/changelog bl-dspsr-0.0~git20180312.50ea209/debian/changelog
--- bl-dspsr-0+git20160405/debian/changelog	2018-03-13 10:20:03.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/changelog	2018-08-20 13:41:36.000000000 +0000
@@ -1,3 +1,13 @@
+bl-dspsr (0.0~git20180312.50ea209-1) bionic; urgency=medium
+
+  [ Gijs Molenaar ]
+  * add watch file
+  * New upstream version 0.0~git20180312.50ea209
+  * remove old patch
+  * disable tests since broken
+
+ -- KERN packaging <packaging@kernsuite.info>  Mon, 20 Aug 2018 15:41:36 +0200
+
 bl-dspsr (0+git20160405-1) bionic; urgency=medium
 
   * Initial release
diff -Nru bl-dspsr-0+git20160405/debian/clean bl-dspsr-0.0~git20180312.50ea209/debian/clean
--- bl-dspsr-0+git20160405/debian/clean	2018-03-13 10:19:45.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/clean	2018-08-20 13:41:36.000000000 +0000
@@ -89,3 +89,57 @@
 Kernel/Formats/fits/FITSOutputFile.o
 Kernel/Formats/fits/FITSUnpacker.o
 Kernel/Formats/fits/GUPPIFITSUnpacker.o
+Kernel/Classes/Makefile.in
+Kernel/Formats/Makefile.in
+Kernel/Formats/apsr/Makefile.in
+Kernel/Formats/asp/Makefile.in
+Kernel/Formats/bcpm/Makefile.in
+Kernel/Formats/bpsr/Makefile.in
+Kernel/Formats/caspsr/Makefile.in
+Kernel/Formats/cpsr/Makefile.in
+Kernel/Formats/cpsr2/Makefile.in
+Kernel/Formats/dada/Makefile.in
+Kernel/Formats/dummy/Makefile.in
+Kernel/Formats/emerlin/Makefile.in
+Kernel/Formats/fadc/Makefile.in
+Kernel/Formats/fits/Makefile.in
+Kernel/Formats/gmrt/Makefile.in
+Kernel/Formats/guppi/Makefile.in
+Kernel/Formats/kat/Makefile.in
+Kernel/Formats/lbadr/Makefile.in
+Kernel/Formats/lbadr64/Makefile.in
+Kernel/Formats/lofar_dal/Makefile.in
+Kernel/Formats/lump/Makefile.in
+Kernel/Formats/lwa/Makefile.in
+Kernel/Formats/mark4/Makefile.in
+Kernel/Formats/mark5/Makefile.in
+Kernel/Formats/mark5b/Makefile.in
+Kernel/Formats/maxim/Makefile.in
+Kernel/Formats/mopsr/Makefile.in
+Kernel/Formats/mwa/Makefile.in
+Kernel/Formats/pdev/Makefile.in
+Kernel/Formats/pmdaq/Makefile.in
+Kernel/Formats/puma/Makefile.in
+Kernel/Formats/puma2/Makefile.in
+Kernel/Formats/s2/Makefile.in
+Kernel/Formats/sigproc/Makefile.in
+Kernel/Formats/ska1/Makefile.in
+Kernel/Formats/spda1k/Makefile.in
+Kernel/Formats/spigot/Makefile.in
+Kernel/Formats/vdif/Makefile.in
+Kernel/Formats/wapp/Makefile.in
+Kernel/Makefile.in
+Makefile.in
+Management/Makefile.in
+More/Applications/Makefile.in
+More/Makefile.in
+More/Plotting/Makefile.in
+Signal/General/Makefile.in
+Signal/Makefile.in
+Signal/Pulsar/Makefile.in
+Signal/Statistics/Makefile.in
+config/Makefile.in
+python/Makefile.in
+config/fontutil.m4
+config/mysql.m4
+
diff -Nru bl-dspsr-0+git20160405/debian/control bl-dspsr-0.0~git20180312.50ea209/debian/control
--- bl-dspsr-0+git20160405/debian/control	2018-03-13 10:20:03.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/control	2018-08-20 13:41:36.000000000 +0000
@@ -2,8 +2,16 @@
 Section: science
 Priority: optional
 Maintainer: KERN packaging <packaging@kernsuite.info>
-Build-Depends: debhelper (>=9), psrchive, liblapack-dev, libfftw3-dev,
- libcfitsio-dev, dh-autoreconf, gfortran
+Build-Depends:
+ debhelper (>=9),
+ psrchive,
+ psrchive-dev,
+ liblapack-dev,
+ libfftw3-dev,
+ libcfitsio3-dev|libcfitsio-dev,
+ libgsl-dev,
+ dh-autoreconf,
+ gfortran
 Standards-Version: 3.9.6
 Homepage: https://github.com/UCBerkeleySETI/dspsr
 
diff -Nru bl-dspsr-0+git20160405/debian/patches/add_missing_dep bl-dspsr-0.0~git20180312.50ea209/debian/patches/add_missing_dep
--- bl-dspsr-0+git20160405/debian/patches/add_missing_dep	2018-03-13 10:19:45.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/patches/add_missing_dep	1970-01-01 00:00:00.000000000 +0000
@@ -1,13 +0,0 @@
-Description: lib dependencies missing
-
---- bl-dspsr-0+git20160405.orig/Signal/Pulsar/Makefile.am
-+++ bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am
-@@ -43,6 +43,8 @@ LDADD = libdspsr.la \
- 	$(top_builddir)/Signal/General/libdspdsp.la \
- 	$(top_builddir)/Kernel/libdspbase.la \
- 	$(top_builddir)/Signal/Statistics/libdspstats.la \
-+	$(top_builddir)/Kernel/Formats/fits/libfits.la \
-+	$(top_builddir)/Kernel/Classes/libClasses.la \
- 	@CUFFT_LIBS@ @CUDA_LIBS@
- 
- AM_CPPFLAGS += @CUFFT_CFLAGS@
diff -Nru bl-dspsr-0+git20160405/debian/patches/series bl-dspsr-0.0~git20180312.50ea209/debian/patches/series
--- bl-dspsr-0+git20160405/debian/patches/series	2018-03-13 10:19:45.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/patches/series	1970-01-01 00:00:00.000000000 +0000
@@ -1 +0,0 @@
-add_missing_dep
diff -Nru bl-dspsr-0+git20160405/debian/rules bl-dspsr-0.0~git20180312.50ea209/debian/rules
--- bl-dspsr-0+git20160405/debian/rules	2018-03-13 10:19:45.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/rules	2018-08-20 13:40:06.000000000 +0000
@@ -11,3 +11,4 @@
 	dh_auto_build -- -C Kernel/Formats/fits
 	dh_auto_build
 
+override_dh_auto_test:
diff -Nru bl-dspsr-0+git20160405/debian/watch bl-dspsr-0.0~git20180312.50ea209/debian/watch
--- bl-dspsr-0+git20160405/debian/watch	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/debian/watch	2018-08-17 14:32:40.000000000 +0000
@@ -0,0 +1,5 @@
+version=4
+opts="mode=git, pgpmode=none, pretty=0.0~git%cd.%h" \
+https://github.com/UCBerkeleySETI/dspsr \
+heads/directio-branch debian uupdate
+
diff -Nru bl-dspsr-0+git20160405/.gitignore bl-dspsr-0.0~git20180312.50ea209/.gitignore
--- bl-dspsr-0+git20160405/.gitignore	2018-03-12 08:32:59.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/.gitignore	2018-03-12 23:02:35.000000000 +0000
@@ -52,6 +52,7 @@
 Management/release.csh
 More/Applications/searchplot
 Signal/General/Makefile
+Signal/General/cufft_callback_bench
 Signal/General/digifil
 Signal/General/digifits
 Signal/General/digihist
@@ -59,9 +60,11 @@
 Signal/General/digistat
 Signal/General/digitxt
 Signal/General/dmsmear
+Signal/General/fftbatch_speed
 Signal/General/filterbank_speed
 Signal/General/passband
 Signal/General/the_decimator
+Signal/General/undersampling_speed
 Signal/Makefile
 Signal/Pulsar/Makefile
 Signal/Pulsar/dspsr
diff -Nru bl-dspsr-0+git20160405/Kernel/Applications/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Applications/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Applications/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Applications/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -15,9 +15,11 @@
 
 #############################################################################
 
-INCLUDES = -I$(top_builddir)/local_include 
 
-LDADD = $(top_builddir)/Kernel/libdspbase.la
+LDADD = $(top_builddir)/Kernel/libdspbase.la @CUDA_LIBS@ @CUFFT_LIBS@
 
 include $(top_srcdir)/config/Makefile.include
 
+AM_CPPFLAGS += -I$(top_builddir)/local_include 
+AM_CPPFLAGS += @CUDA_CFLAGS@ @CUFFT_CFLAGS@
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/ASCIIObservation.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/ASCIIObservation.C
--- bl-dspsr-0+git20160405/Kernel/Classes/ASCIIObservation.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/ASCIIObservation.C	2018-03-12 23:02:35.000000000 +0000
@@ -340,6 +340,18 @@
 
   // //////////////////////////////////////////////////////////////////////
   //
+  // PICOSECONDS offset from UTC_START second 
+  //
+  uint64_t offset_picoseconds = 0;
+  if (ascii_header_check (header, "PICOSECONDS", UI64, &offset_picoseconds) >= 0)
+  {
+    double offset_seconds = double(offset_picoseconds) / 1e12;
+    recording_start_time += offset_seconds;
+  }
+
+
+  // //////////////////////////////////////////////////////////////////////
+  //
   // OBS_OFFSET
   //
   offset_bytes = 0;
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/BitSeries.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/BitSeries.C
--- bl-dspsr-0+git20160405/Kernel/Classes/BitSeries.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/BitSeries.C	2018-03-12 23:02:35.000000000 +0000
@@ -102,7 +102,7 @@
   const unsigned char* from = bitseries.get_rawptr();
   unsigned char* to = get_rawptr();
 
-  memcpy(to,from,size_t(bitseries.get_nbytes()));
+  memory->do_copy (to, from, size_t(bitseries.get_nbytes()));
 
   return *this;
 }
@@ -154,7 +154,7 @@
     unsigned char *into = get_rawptr();
     const unsigned char *from = copy->get_rawptr() + offset;
 
-    memcpy (into, from, size_t(bytes));
+    memory->do_copy (into, from, size_t(bytes));
   }
 
   input_sample = copy->input_sample + idat_start;
@@ -189,7 +189,7 @@
   const unsigned char* from = little->get_datptr(0);
   unsigned char* to = get_datptr(get_ndat());
 
-  memcpy(to,from,size_t(get_nbytes()));
+  memory->do_copy (to, from, size_t(get_nbytes()));
 
   set_ndat( get_ndat() + little->get_ndat() );
 }
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/DADAFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DADAFile.C
--- bl-dspsr-0+git20160405/Kernel/Classes/DADAFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DADAFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -15,6 +15,7 @@
 
 #include "FilePtr.h"
 #include "Error.h"
+#include "strutil.h"
 
 #include <fstream>
 #include <fcntl.h>
@@ -23,11 +24,13 @@
 
 dsp::DADAFile::DADAFile (const char* filename) : File ("DADA")
 {
+  separate_header_file = false;
+  
   if (filename) 
     open (filename);
 }
 
-string dsp::DADAFile::get_header (const char* filename)
+string dsp::DADAFile::get_header (const char* filename) const
 {
   FilePtr fptr = fopen (filename, "r");
   if (!fptr)
@@ -35,7 +38,7 @@
 		 "fopen (%s)", filename);
 
   // default DADA header size
-  unsigned hdr_size = 4096;
+  long hdr_size = 4096;
   vector<char> buffer;
   char* header = 0;
 
@@ -55,13 +58,51 @@
 
     /* Get the header size */
-    if (ascii_header_get (header, "HDR_SIZE", "%u", &hdr_size) != 1)
-      throw Error (InvalidState, "dsp::DADAFile::get_header",
-		   "could not parse HDR_SIZE");
+    if (ascii_header_get (header, "HDR_SIZE", "%ld", &hdr_size) != 1)
+      hdr_size = 0;
 
     /* Ensure that the incoming header fits in the client header buffer */
   }
   while (hdr_size > buffer.size());
 
+  if (hdr_size == 0)
+  {
+    // no HDR_SIZE in the data file: look for <basename>.hdr, then <filename>.hdr
+    string hdr_ext = ".hdr";
+    string hdr_fname = replace_extension (filename, hdr_ext);
+    FilePtr hdr_ptr = fopen (hdr_fname.c_str(), "r");
+    if (!hdr_ptr)
+    {
+      hdr_fname = filename + hdr_ext;
+      hdr_ptr = fopen (hdr_fname.c_str(), "r");
+    }
+    
+    if (!hdr_ptr)
+      throw Error (InvalidState, "dsp::DADAFile::get_header",
+		   "file has no header and no matching header file found");
+
+    if (fseek (hdr_ptr, 0, SEEK_END) < 0)
+      throw Error (FailedSys, "dsp::DADAFile::get_header",
+		   "could not fseek to end of header file");
+
+    hdr_size = ftell (hdr_ptr);
+    if (hdr_size < 0)
+      throw Error (FailedSys, "dsp::DADAFile::get_header",
+		   "ftell fails at end of header file");
+
+    ::rewind (hdr_ptr);
+
+    buffer.resize (hdr_size + 1);
+    header = &(buffer[0]);
+
+    if (fread (header, 1, hdr_size, hdr_ptr) != size_t(hdr_size))
+      throw Error (FailedSys, "dsp::DADAFile::get_header",
+		   "fread (nbyte=%ld) from header file", hdr_size);
+
+    // null-terminate in the extra byte so that no header text is overwritten
+    header[ hdr_size ] = '\0';
+    separate_header_file = true;
+  }
+  
   if (!header)
     return string();
 
@@ -116,11 +157,15 @@
   
   info = new ASCIIObservation (header.c_str());
 
-  if (ascii_header_get (header.c_str(), "HDR_SIZE", "%u", &header_bytes) < 0)
+  const char* hdr = header.c_str();
+  
+  if (separate_header_file)
+    header_bytes = 0;
+  else if (ascii_header_get (hdr, "HDR_SIZE", "%u", &header_bytes) < 0)
     throw Error (FailedCall, "dsp::DADAFile::open_file",
 		 "ascii_header_get(HDR_SIZE) failed");
 
-  if (ascii_header_get (header.c_str(), "RESOLUTION", "%u", &resolution) < 0)
+  if (ascii_header_get (hdr, "RESOLUTION", "%u", &resolution) < 0)
     resolution = 1;
 
   // the resolution is the _byte_ resolution; convert to _sample_ resolution
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/debug.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/debug.h
--- bl-dspsr-0+git20160405/Kernel/Classes/debug.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/debug.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/debug.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/11/15 00:47:21 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/debug.h
 
 #ifndef __debug_h
 #define __debug_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ASCIIObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ASCIIObservation.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ASCIIObservation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ASCIIObservation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ASCIIObservation.h,v $
-   $Revision: 1.8 $
-   $Date: 2011/08/01 10:05:37 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/ASCIIObservation.h
 
 #ifndef __ASCIIObservation_h
 #define __ASCIIObservation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BitUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BitUnpacker.h,v $
-   $Revision: 1.6 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/BitUnpacker.h
 
 #ifndef __BitUnpacker_h
 #define __BitUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockFile.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BlockFile.h,v $
-   $Revision: 1.8 $
-   $Date: 2010/10/22 19:17:56 $
-   $Author: demorest $ */
+// dspsr/Kernel/Classes/dsp/BlockFile.h
 
 
 #ifndef __dsp_BlockFile_h
@@ -59,7 +56,7 @@
       than the sampled data, this method should be overloaded and the
       additional information should be filtered out. */
     virtual int64_t load_bytes (unsigned char* buffer, uint64_t nbytes);
-    
+
     //! Set the file pointer to the absolute number of sampled data bytes
     /*! If the header_bytes attribute is set, this number of bytes
       will be subtracted by File::seek_bytes before seeking.  If the
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockIterator.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockIterator.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockIterator.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BlockIterator.h,v $
-   $Revision: 1.3 $
-   $Date: 2008/09/09 06:34:14 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/BlockIterator.h
 
 #ifndef __BlockIterator_h
 #define __BlockIterator_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BufferingPolicy.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BufferingPolicy.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BufferingPolicy.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BufferingPolicy.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BufferingPolicy.h,v $
-   $Revision: 1.7 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/BufferingPolicy.h
 
 #ifndef __baseband_dsp_BufferingPolicy_h
 #define __baseband_dsp_BufferingPolicy_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/DADAFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DADAFile.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/DADAFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DADAFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/DADAFile.h,v $
-   $Revision: 1.2 $
-   $Date: 2008/05/28 21:12:42 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/DADAFile.h
 
 #ifndef __DADAFile_h
 #define __DADAFile_h
@@ -36,8 +33,10 @@
     virtual void open_file (const char* filename);
 
     //! Read the DADA ascii header from filename
-    static std::string get_header (const char* filename);
+    std::string get_header (const char* filename) const;
 
+    //! Flag set true when the header information is in a separate text file
+    mutable bool separate_header_file;
   };
 
 }
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Digitizer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Digitizer.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Digitizer.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Digitizer.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Digitizer.h,v $
-   $Revision: 1.3 $
-   $Date: 2010/04/02 21:27:32 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Digitizer.h
 
 
 #ifndef __Digitizer_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/dsp.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/dsp.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/dsp.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/dsp.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/dsp.h,v $
-   $Revision: 1.6 $
-   $Date: 2008/04/14 21:23:59 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/dsp.h
 
 #ifndef __baseband_dsp_h
 #define __baseband_dsp_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/DummyFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DummyFile.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/DummyFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DummyFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -36,7 +36,7 @@
     void close();
 
     //! load bytes
-    int64_t load_bytes(unsigned char *buffer, uint64_t bytes);
+    int64_t load_bytes (unsigned char *buffer, uint64_t bytes);
 
     //! seek bytes
     int64_t seek_bytes(uint64_t bytes);
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitOne.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitOne.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitOne.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitOne.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/EightBitOne.h,v $
-   $Revision: 1.3 $
-   $Date: 2009/10/30 00:15:03 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/EightBitOne.h
 
 #ifndef __EightBitOne_h
 #define __EightBitOne_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/EightBitUnpacker.h,v $
-   $Revision: 1.5 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/EightBitUnpacker.h
 
 #ifndef __EightBitUnpacker_h
 #define __EightBitUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ExcisionUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ExcisionUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ExcisionUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ExcisionUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ExcisionUnpacker.h,v $
-   $Revision: 1.6 $
-   $Date: 2009/08/27 06:53:58 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/ExcisionUnpacker.h
 
 #ifndef __ExcisionUnpacker_h
 #define __ExcisionUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/excision_unpack.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/excision_unpack.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/excision_unpack.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/excision_unpack.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/excision_unpack.h,v $
-   $Revision: 1.5 $
-   $Date: 2009/10/30 00:15:08 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/excision_unpack.h
 
 #ifndef __ExcisionUnpacker_excision_unpack_h
 #define __ExcisionUnpacker_excision_unpack_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/File.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/File.h,v $
-   $Revision: 1.34 $
-   $Date: 2012/02/24 20:47:06 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/File.h
 
 
 #ifndef __File_h
@@ -117,11 +114,23 @@
     //! The name of the currently opened file, set by open()
     std::string current_filename;
 
+#if HAVE_CUDA
+    //! staging buffer for Host to Device transfers
+    void * host_buffer;
+
+    //! The size of the host_buffer in bytes
+    uint64_t host_buffer_size;
+#endif
+
     //! Load nbyte bytes of sampled data from the device into buffer
     /*! If the data stored on the device contains information other
       than the sampled data, this method should be overloaded and the
       additional information should be filtered out. */
     virtual int64_t load_bytes (unsigned char* buffer, uint64_t nbytes);
+
+#if HAVE_CUDA
+    virtual int64_t load_bytes_device (unsigned char* buffer, uint64_t bytes, void * device_handle);
+#endif
     
     //! Set the file pointer to the absolute number of sampled data bytes
     /*! If the header_bytes attribute is set, this number of bytes
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FloatUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FloatUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FloatUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FloatUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FloatUnpacker.h,v $
-   $Revision: 1.1 $
-   $Date: 2011/08/01 10:07:00 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/FloatUnpacker.h
 
 #ifndef __FloatUnpacker_h
 #define __FloatUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitTwo.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitTwo.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitTwo.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitTwo.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FourBitTwo.h,v $
-   $Revision: 1.3 $
-   $Date: 2009/10/30 00:15:03 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/FourBitTwo.h
 
 #ifndef __FourBitTwo_h
 #define __FourBitTwo_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FourBitUnpacker.h,v $
-   $Revision: 1.9 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/FourBitUnpacker.h
 
 #ifndef __FourBitUnpacker_h
 #define __FourBitUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericEightBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericEightBitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericEightBitUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericEightBitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/GenericEightBitUnpacker.h,v $
-   $Revision: 1.1 $
-   $Date: 2012/03/21 09:19:09 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/GenericEightBitUnpacker.h
 
 #ifndef __GenericEightBitUnpacker_h
 #define __GenericEightBitUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericFourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericFourBitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericFourBitUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericFourBitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,30 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2017 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __GenericFourBitUnpacker_h
+#define __GenericFourBitUnpacker_h
+
+#include "dsp/FourBitUnpacker.h"
+
+namespace dsp
+{
+  //! Converts single-dish GMRT data from 4-bit to floating point values
+  class GenericFourBitUnpacker: public FourBitUnpacker
+  {
+  public:
+
+    //! Constructor initializes bit table
+    GenericFourBitUnpacker ();
+
+    //! Return true if this unpacker can handle the observation
+    bool matches (const Observation*);
+
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasInput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasInput.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasInput.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasInput.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HasInput.h,v $
-   $Revision: 1.3 $
-   $Date: 2011/07/26 12:40:30 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/HasInput.h
 
 #ifndef __dsp_HasInput_h
 #define __dsp_HasInput_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasOutput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasOutput.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasOutput.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasOutput.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HasOutput.h,v $
-   $Revision: 1.3 $
-   $Date: 2010/09/16 17:45:34 $
-   $Author: demorest $ */
+// dspsr/Kernel/Classes/dsp/HasOutput.h
 
 #ifndef __dsp_HasOutput_h
 #define __dsp_HasOutput_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HistUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HistUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HistUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HistUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HistUnpacker.h,v $
-   $Revision: 1.17 $
-   $Date: 2010/05/28 14:13:15 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/HistUnpacker.h
 
 #ifndef __HistUnpacker_h
 #define __HistUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Input.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Input.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Input.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Input.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Input.h,v $
-   $Revision: 1.47 $
-   $Date: 2011/09/20 20:20:26 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Input.h
 
 #ifndef __Input_h
 #define __Input_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/IOManager.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/IOManager.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/IOManager.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/IOManager.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/IOManager.h,v $
-   $Revision: 1.33 $
-   $Date: 2011/08/04 21:04:38 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/IOManager.h
 
 
 #ifndef __IOManager_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/MemoryCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MemoryCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/MemoryCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MemoryCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -27,7 +27,8 @@
   class DeviceMemory : public dsp::Memory
   {
   public:
-    DeviceMemory (cudaStream_t _stream = 0) { stream = _stream; }
+
+    DeviceMemory (cudaStream_t _stream = 0, int _device = 0);
 
     void* do_allocate (size_t nbytes);
     void do_free (void*);
@@ -35,10 +36,17 @@
     void do_zero (void*, size_t);
     bool on_host () const { return false; }
 
+    void set_stream (cudaStream_t _stream) { stream = _stream; }
     cudaStream_t get_stream () { return stream; }
+    cudaStream_t get_stream () const { return stream; }
+
+    int get_device () { return device; };
+    int get_device () const { return device; };
+
 
   protected:
     cudaStream_t stream;
+    int device;
   };
 
   class SharedPinnedMemory : public dsp::Memory
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Memory.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Memory.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Memory.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Memory.h	2018-03-12 23:02:35.000000000 +0000
@@ -10,7 +10,7 @@
 #define __dsp_Memory_h_
 
 #include "Reference.h"
-#include <inttypes.h>
+#include "environ.h"
 
 namespace dsp {
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/MultiFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MultiFile.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/MultiFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MultiFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/MultiFile.h,v $
-   $Revision: 1.29 $
-   $Date: 2011/09/20 20:20:31 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/MultiFile.h
 
 
 #ifndef __MultiFile_h
@@ -51,6 +48,9 @@
     File* get_loader ();
     const File* get_loader () const;
 
+    //! Access to current file objects
+    std::vector< Reference::To<File> >& get_files () {return files;}
+
     //! Return true if the loader File instance is set
     bool has_loader ();
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/NLowLookup.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/NLowLookup.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/NLowLookup.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/NLowLookup.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/NLowLookup.h,v $
-   $Revision: 1.2 $
-   $Date: 2008/07/13 00:38:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/NLowLookup.h
 
 #ifndef __NLowLookup_h
 #define __NLowLookup_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ObservationInterface.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ObservationInterface.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ObservationInterface.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ObservationInterface.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ObservationInterface.h,v $
-   $Revision: 1.1 $
-   $Date: 2012/01/19 21:46:11 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/ObservationInterface.h
 
 #ifndef __dsp_ObservationTI_h
 #define __dsp_ObservationTI_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Operation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Operation.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Operation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Operation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Operation.h,v $
-   $Revision: 1.49 $
-   $Date: 2010/02/04 09:15:10 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Operation.h
 
 #ifndef __Operation_h
 #define __Operation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/OutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/OutputFile.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/OutputFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/OutputFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/OutputFile.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/09/19 01:56:42 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/OutputFile.h
 
 #ifndef __OutputFile_h
 #define __OutputFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/PrestoObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/PrestoObservation.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/PrestoObservation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/PrestoObservation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/PrestoObservation.h,v $
-   $Revision: 1.2 $
-   $Date: 2009/03/03 05:29:30 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/PrestoObservation.h
 
 #ifndef __PrestoObservation_h
 #define __PrestoObservation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Scratch.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Scratch.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Scratch.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Scratch.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Scratch.h,v $
-   $Revision: 1.5 $
-   $Date: 2010/01/21 23:36:23 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Scratch.h
 
 #ifndef __dsp_Scratch_h
 #define __dsp_Scratch_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Seekable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Seekable.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Seekable.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Seekable.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,15 +6,16 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Seekable.h,v $
-   $Revision: 1.15 $
-   $Date: 2010/06/04 03:36:31 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Seekable.h
 
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
 
 #ifndef __Seekable_h
 #define __Seekable_h
 
+#include "dsp/Memory.h"
 #include "dsp/Input.h"
 
 namespace dsp {
@@ -44,9 +45,15 @@
     //! Inquire current time sample
     virtual uint64_t get_current_sample() { return current_sample; }
 
+    //! Set the bits series into which data will be loaded
+    void set_output (BitSeries* data);
+
     //! Buffer used to store overlap (useful in multi-threaded applications)
     void set_overlap_buffer (BitSeries*);
 
+    //! Set the memory type used in the overlap buffer 
+    void set_overlap_buffer_memory (Memory * memory);
+
   protected:
     
     //! set end_of_data
@@ -57,6 +64,11 @@
  
     //! Load data from device and return the number of bytes read.
     virtual int64_t load_bytes (unsigned char* buffer, uint64_t bytes) = 0;
+
+#ifdef HAVE_CUDA
+    //! Load data from device to device memory and return the number of bytes read.
+    virtual int64_t load_bytes_device (unsigned char* buffer, uint64_t bytes, void * dev_handle) = 0;
+#endif
     
     //! Seek to absolute position and return absolute position in bytes
     virtual int64_t seek_bytes (uint64_t bytes) = 0;
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Sink.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Sink.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Sink.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Sink.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Sink.h,v $
-   $Revision: 1.2 $
-   $Date: 2009/06/07 01:22:34 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Sink.h
 
 #ifndef __dsp_Sink_h
 #define __dsp_Sink_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/StepIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/StepIterator.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/StepIterator.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/StepIterator.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/StepIterator.h,v $
-   $Revision: 1.2 $
-   $Date: 2008/07/13 00:38:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/StepIterator.h
 
 #ifndef __dsp_StepIterator_h
 #define __dsp_StepIterator_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/SubByteTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/SubByteTwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/SubByteTwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/SubByteTwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/SubByteTwoBitCorrection.h,v $
-   $Revision: 1.9 $
-   $Date: 2010/05/11 06:21:12 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/SubByteTwoBitCorrection.h
 
 #ifndef __SubByteTwoBitCorrection_h
 #define __SubByteTwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TestInput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TestInput.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TestInput.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TestInput.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TestInput.h,v $
-   $Revision: 1.3 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TestInput.h
 
 #ifndef __TestInput_h
 #define __TestInput_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeriesCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeriesCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeriesCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeriesCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,100 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __TimeSeriesEngine_h
+#define __TimeSeriesEngine_h
+
+#include "dsp/TimeSeries.h"
+#include "dsp/MemoryCUDA.h"
+
+#include <cuda_runtime.h>
+// CUDA::TimeSeriesEngine derives from the dsp::TimeSeries::Engine interface
+namespace CUDA
+{
+  class TimeSeriesEngine : public dsp::TimeSeries::Engine
+  {
+  public:
+
+    //! Construct from a dsp::Memory instance
+    TimeSeriesEngine (dsp::Memory * _memory);
+
+    //! Copy constructor (declaration disabled)
+    //TimeSeriesEngine (const TimeSeriesEngine& tse);
+
+    ~TimeSeriesEngine ();
+
+    //TimeSeriesEngine& operator = (const TimeSeriesEngine& copy);
+    //! Implements the pure virtual dsp::TimeSeries::Engine::prepare
+    void prepare (dsp::TimeSeries * parent);
+    //! Implements the pure virtual dsp::TimeSeries::Engine::prepare_buffer
+    void prepare_buffer (unsigned nbytes);
+    //! Implements the pure virtual dsp::TimeSeries::Engine::copy_data_fpt
+    void copy_data_fpt (const dsp::TimeSeries * copy,
+                        uint64_t idat_start = 0,
+                        uint64_t ndat = 0);
+    //! NOTE(review): the three variants below presumably select on the stream/device of source and destination - confirm in TimeSeriesCUDA.cu
+    void copy_data_fpt_same_stream (const dsp::TimeSeries * from,
+            uint64_t idat_start, uint64_t ndat);
+
+    void copy_data_fpt_same_device (const dsp::TimeSeries * from,
+            uint64_t idat_start, uint64_t ndat);
+
+    void copy_data_fpt_diff_device (const dsp::TimeSeries * from,
+            uint64_t idat_start, uint64_t ndat);
+    //! NOTE(review): presumably performs the copy on the given stream - confirm in TimeSeriesCUDA.cu
+    void copy_data_fpt_kernel_multidim (float * to, const float * from,
+            uint64_t to_stride, uint64_t from_stride, 
+            uint64_t idat_start, uint64_t ndat, cudaStream_t stream);
+    //! Publicly accessible buffer pointer; its allocation is not visible in this header
+    void * buffer;
+
+  protected:
+
+    dsp::TimeSeries * to;
+
+    CUDA::DeviceMemory * memory;
+
+    CUDA::PinnedMemory * pinned_memory;
+
+    void * host_buffer;
+
+    size_t host_buffer_size;
+
+    size_t buffer_size;
+
+    unsigned nchan;
+
+    unsigned npol;
+
+    unsigned ndim;
+
+    uint64_t ichanpol_stride;
+
+    uint64_t ochanpol_stride;
+
+    uint64_t bchanpol_stride;
+
+    unsigned nthread;
+
+    dim3 blocks;
+
+    int device;
+
+    cudaStream_t to_stream;
+      
+    cudaStream_t from_stream;
+
+    int to_device;
+
+    int from_device;
+
+  };
+}
+
+#endif // !defined(__TimeSeriesEngine_h)
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeries.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeries.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeries.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeries.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TimeSeries.h,v $
-   $Revision: 1.55 $
-   $Date: 2011/08/04 21:05:19 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TimeSeries.h
 
 #ifndef __TimeSeries_h
 #define __TimeSeries_h
@@ -152,6 +149,12 @@
 
     void set_match (TimeSeries*);
 
+    class Engine;
+
+    void set_engine (Engine*);
+
+    Engine * get_engine () const { return engine; };
+
   protected:
 
     //! Returns a uchar pointer to the first piece of data
@@ -179,6 +182,8 @@
     // do the work of the null_clone: copy necessary attributes from the given TimeSeries
     void null_work (const TimeSeries* from);
 
+    Reference::To<Engine> engine;
+
   private:
 
     //! Order of the dimensions
@@ -202,7 +207,21 @@
 
 
   };
-  
+  //! Abstract interface attached to a TimeSeries through set_engine
+  class TimeSeries::Engine : public OwnStream
+  {
+  public:
+    //! NOTE(review): presumably binds the engine to the given TimeSeries - confirm
+    virtual void prepare (dsp::TimeSeries * to) = 0;
+    //! NOTE(review): presumably ensures a work buffer of nbytes - confirm
+    virtual void prepare_buffer (unsigned nbytes) = 0;
+    //! NOTE(review): presumably copies ndat samples starting at idat_start from copy - confirm; both default to 0
+    virtual void copy_data_fpt (const dsp::TimeSeries * copy, 
+                                uint64_t idat_start = 0, 
+                                uint64_t ndat = 0) = 0;
+
+  };
+
 }
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Transformation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Transformation.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Transformation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Transformation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Transformation.h,v $
-   $Revision: 1.54 $
-   $Date: 2011/08/26 22:02:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Transformation.h
 
 #ifndef __dsp_Transformation_h
 #define __dsp_Transformation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBit1or2.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBit1or2.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBit1or2.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBit1or2.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBit1or2.h,v $
-   $Revision: 1.6 $
-   $Date: 2010/05/28 14:13:32 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBit1or2.h
 
 #ifndef __TwoBit1or2_h
 #define __TwoBit1or2_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitCorrection.h,v $
-   $Revision: 1.44 $
-   $Date: 2010/05/11 06:21:17 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBitCorrection.h
 
 #ifndef __TwoBitCorrection_h
 #define __TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitFour.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitFour.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitFour.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitFour.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitFour.h,v $
-   $Revision: 1.7 $
-   $Date: 2010/05/11 06:22:25 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBitFour.h
 
 #ifndef __TwoBitFour_h
 #define __TwoBitFour_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitLookup.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitLookup.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitLookup.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitLookup.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitLookup.h,v $
-   $Revision: 1.1 $
-   $Date: 2008/07/17 01:17:33 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBitLookup.h
 
 #ifndef __TwoBitLookup_h
 #define __TwoBitLookup_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitMask.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitMask.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitMask.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitMask.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitMask.h,v $
-   $Revision: 1.3 $
-   $Date: 2008/07/13 00:38:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBitMask.h
 
 #ifndef __TwoBitMask_h
 #define __TwoBitMask_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitTable.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitTable.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitTable.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitTable.h,v $
-   $Revision: 1.15 $
-   $Date: 2009/07/31 12:23:13 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/TwoBitTable.h
 
 
 #ifndef __TwoBitTable_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Unpacker.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Unpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Unpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Unpacker.h,v $
-   $Revision: 1.32 $
-   $Date: 2012/02/24 20:47:06 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/Unpacker.h
 
 
 #ifndef __Unpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/UnpackerIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/UnpackerIterator.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/UnpackerIterator.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/UnpackerIterator.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/UnpackerIterator.h,v $
-   $Revision: 1.1 $
-   $Date: 2008/09/09 06:34:07 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/UnpackerIterator.h
 
 #ifndef __UnpackerIterator_h
 #define __UnpackerIterator_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/WeightedTimeSeries.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/WeightedTimeSeries.h
--- bl-dspsr-0+git20160405/Kernel/Classes/dsp/WeightedTimeSeries.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/WeightedTimeSeries.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/WeightedTimeSeries.h,v $
-   $Revision: 1.15 $
-   $Date: 2011/08/04 21:05:36 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/WeightedTimeSeries.h
 
 #ifndef __WeightedTimeSeries_h
 #define __WeightedTimeSeries_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/DummyFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DummyFile.C
--- bl-dspsr-0+git20160405/Kernel/Classes/DummyFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DummyFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -56,6 +56,19 @@
   
   // Read obs info from ASCII file
   info = new ASCIIObservation(header);
+
+  if (ascii_header_get (header, "RESOLUTION", "%u", &resolution) < 0)
+    resolution = 1;
+
+  // the resolution is the _byte_ resolution; convert to _sample_ resolution
+  if (verbose)
+    cerr << "dsp::DummyFile::open_file byte_resolution=" << resolution << endl;
+  resolution = info->get_nsamples (resolution);
+  if (verbose)
+    cerr << "dsp::DummyFile::open_file sample_resolution=" << resolution << endl;
+  if (resolution == 0)
+    resolution = 1;
+
 }
 
 void dsp::DummyFile::close ()
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/environ.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/environ.h
--- bl-dspsr-0+git20160405/Kernel/Classes/environ.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/environ.h	2018-03-12 23:02:35.000000000 +0000
@@ -4,10 +4,7 @@
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/environ.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/06/18 00:05:05 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/environ.h
 
 /*
  * Use the standard C integer types
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/File.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/File.C
--- bl-dspsr-0+git20160405/Kernel/Classes/File.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/File.C	2018-03-12 23:02:35.000000000 +0000
@@ -23,6 +23,10 @@
 #include <unistd.h>
 #include <errno.h>
 
+#if HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 using namespace std;
 using std::cerr;
 
@@ -51,6 +55,11 @@
 
   current_filename = "";
 
+#if HAVE_CUDA
+  host_buffer = 0;
+  host_buffer_size = 0;
+#endif
+
   get_info()->init();
 }
 
@@ -200,6 +209,84 @@
   return bytes_read;
 }
 
+#if HAVE_CUDA
+//! Load bytes into device memory, staged through a pinned host buffer
+/*!
+  The bytes are read with load_bytes into host_buffer, which is allocated
+  with cudaMallocHost and enlarged on demand, then copied to buffer with
+  cudaMemcpyAsync on the given stream; the stream is synchronized before
+  returning.
+
+  \param buffer destination of the host-to-device copy
+  \param bytes number of bytes to load and to copy
+  \param device_handle the cudaStream_t to use, passed as a void pointer
+  \return the value returned by load_bytes
+*/
+int64_t dsp::File::load_bytes_device (unsigned char* buffer, uint64_t bytes, void * device_handle)
+{
+  cudaStream_t stream = (cudaStream_t) device_handle;
+
+  if (verbose)
+    cerr << "dsp::File::load_bytes_device (" << (void *) buffer << ", " 
+         << bytes << ", " << (void *) stream << ")" << endl;
+
+  cudaError_t result;
+
+  // ensure the host CPU buffer is large enough
+  if (bytes > host_buffer_size)
+  {
+    if (host_buffer)
+    {
+      // no copy may still be reading the old buffer when it is freed
+      result = cudaStreamSynchronize (stream);
+      if (result != cudaSuccess)
+        throw Error (InvalidState, "dsp::File::load_bytes_device",
+                     "failed to synchronize cuda stream prior to buffer enlargement: %s",
+                     cudaGetErrorString (result));
+
+      result = cudaFreeHost (host_buffer);
+      if (result != cudaSuccess)
+        throw Error (InvalidState, "dsp::File::load_bytes_device",
+                     "cudaFreeHost (host_buffer) failed: %s",
+                    cudaGetErrorString (result));
+    }
+
+    if (verbose)
+      cerr << "dsp::File::load_bytes_device cudaMallocHost() " <<  bytes 
+           << "bytes  for host_buffer" << endl;
+    result = cudaMallocHost (&host_buffer, bytes);
+    if (result != cudaSuccess)
+      throw Error (InvalidState, "dsp::File::load_bytes_device",
+                   "cudaMallocHost (host_buffer, %" PRIu64 ") failed: %s",
+                  bytes, cudaGetErrorString (result));
+    host_buffer_size = bytes;
+  }
+
+  if (verbose)
+    cerr << "dsp::File::load_bytes_device load_bytes(" << (void *) host_buffer
+         << ", " << bytes << ")" << endl;
+
+  // load the data from the file into the host buffer
+  int64_t bytes_read = load_bytes ((unsigned char *) host_buffer, bytes);
+
+  if (bytes_read > 0)
+  {
+    if (verbose)
+      cerr << "dsp::File::load_bytes_device cudaMemcpyAsync (" 
+           << (void *) buffer << ", " << (void *) host_buffer
+           << ", " << bytes << ", cudaMemcpyHostToDevice, " 
+           << (void *) stream << ")" << endl;
+    result = cudaMemcpyAsync (buffer, host_buffer, bytes, cudaMemcpyHostToDevice, stream);
+    if (result != cudaSuccess)
+      throw Error (InvalidState, "dsp::File::load_bytes_device",
+                   "cudaMemcpyAsync (%p, %p, %" PRIu64 ") failed: %s",
+                   (void *) buffer, host_buffer, bytes, cudaGetErrorString (result));
+    cudaStreamSynchronize(stream);
+  }
+  return bytes_read;
+}
+#endif
+
 //! Adjust the file pointer
 int64_t dsp::File::seek_bytes (uint64_t bytes)
 {
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -26,7 +26,12 @@
 dsp::GenericEightBitUnpacker::GenericEightBitUnpacker ()
   : EightBitUnpacker ("GenericEightBitUnpacker")
 {
+#define ASSUME_TWOS_COMPLEMENT 1
+#if ASSUME_TWOS_COMPLEMENT
   table = new BitTable (8, BitTable::TwosComplement);
+#else
+  table = new BitTable (8, BitTable::OffsetBinary);
+#endif
   gpu_stream = undefined_stream;
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpackerCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -13,7 +13,7 @@
 
 using namespace std;
 
-void check_error (const char*);
+void check_error_stream (const char*, cudaStream_t);
 
 /*
  * Simple CUDA 8-bit unpack kernel
@@ -102,6 +102,6 @@
 		 "unknown BitTable::Type");
 
   if (dsp::Operation::record_time || dsp::Operation::verbose)
-    check_error ("generic_8bit_unpack");
+    check_error_stream ("generic_8bit_unpack", stream);
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericFourBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericFourBitUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Classes/GenericFourBitUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericFourBitUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,38 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2017 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/GenericFourBitUnpacker.h"
+#include "dsp/BitTable.h"
+
+#include <iostream>
+using namespace std;
+
+//! Construct with a four-bit lookup table in least-to-most order
+dsp::GenericFourBitUnpacker::GenericFourBitUnpacker ()
+  : FourBitUnpacker ("GenericFourBitUnpacker")
+{
+// compile-time choice between TwosComplement and OffsetBinary encoding
+#define ASSUME_TWOS_COMPLEMENT 1
+#if ASSUME_TWOS_COMPLEMENT
+  BitTable* table = new BitTable (4, BitTable::TwosComplement);
+#else
+  BitTable* table = new BitTable (4, BitTable::OffsetBinary);
+#endif
+  table->set_order( BitTable::LeastToMost );
+  set_table( table );
+}
+
+//! Return true when the observation reports nbit equal to 4
+bool dsp::GenericFourBitUnpacker::matches (const Observation* observation)
+{
+  if (verbose)
+    cerr << "dsp::GenericFourBitUnpacker::matches"
+      " machine=" << observation->get_machine() <<
+      " nbit=" << observation->get_nbit() << endl;
+
+  return observation->get_nbit() == 4;
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Input.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Input.C
--- bl-dspsr-0+git20160405/Kernel/Classes/Input.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Input.C	2018-03-12 23:02:35.000000000 +0000
@@ -66,7 +66,8 @@
     maximum_load_size += 2 * resolution;
 
   if (verbose)
-    cerr << "dsp::Input::reserve " << maximum_load_size << endl;
+    cerr << "dsp::Input::reserve block_size=" << block_size 
+         << " maximum_load_size=" << maximum_load_size << endl;
 
   buffer->resize (maximum_load_size);
   buffer->resize (block_size);
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/IOManager.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/IOManager.C
--- bl-dspsr-0+git20160405/Kernel/Classes/IOManager.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/IOManager.C	2018-03-12 23:02:35.000000000 +0000
@@ -375,13 +375,15 @@
   if (verbose)
     cerr << "dsp::IOManager::set_block_size required block_size="
          << block_size << endl;
+  if (verbose)
+    cerr << "dsp::IOManager::set_block_size minimum_RAM=" << minimum_RAM << " nbyte_dat=" << nbyte_dat << endl;
 
   if (minimum_RAM)
   {
     uint64_t size = (uint64_t(minimum_RAM/nbyte_dat)/resolution) * resolution;
     if (verbose)
       cerr << "dsp::IOManager::set_block_size"
-	" minimum block_size=" << size << endl;
+       " minimum block_size=" << size << endl;
 
     block_size = std::max (block_size, size);
   }
@@ -405,12 +407,13 @@
       cerr << "dsp::IOManager::set_block_size insufficient RAM" << endl;
 
     throw Error (InvalidState, "dsp::IOManager::set_block_size",
-		 "insufficient RAM: limit=%g MB -> block="UI64" samples\n\t"
-		 "require="UI64" samples -> \"-U %g\" on command line",
-		 float(maximum_RAM)/megabyte, block_size,
-		 minimum_samples, min_ram/megabyte);
+                 "insufficient RAM: limit=%g MB -> block="UI64" samples\n\t"
+                 "require="UI64" samples -> \"-U %g\" on command line",
+                 float(maximum_RAM)/megabyte, block_size,
+                 minimum_samples, min_ram/megabyte);
   }
 
+  // input overlap incorporates overlapping blocks of input data
   if (input->get_overlap())
   {
     unsigned overlap = input->get_overlap();
@@ -419,10 +422,13 @@
     double parts = (block_size - overlap) / stride;
 
     if (verbose)
-      cerr << "dsp::IOManager::set_block_size input"
-              " overlap=" << overlap << " parts=" << parts << endl;
+      cerr << "dsp::IOManager::set_block_size block_size=" <<  block_size
+           << " overlap=" << overlap << " parts=" << parts << endl;
 
-    uint64_t block_resize = unsigned(parts)*(minimum_samples-overlap) + overlap;
+    uint64_t block_resize = unsigned(parts)*(minimum_samples - overlap)+ overlap;
+    // diagnostic output only; keep it behind verbose like the other messages
+    if (verbose)
+      cerr << "dsp::IOManager::set_block_size block_resize=" << block_resize << endl;
 
     if (filterbank_resolution)
     {
@@ -431,17 +437,17 @@
       unsigned best_npart = 0;
       while (trial_block_size < block_size)
       {
-	double trial_parts = (trial_block_size-overlap) / stride;
-	if (trial_parts == unsigned(trial_parts))
-	  best_npart = trial_block_size / filterbank_resolution;
+        double trial_parts = (trial_block_size-overlap) / stride;
+        if (trial_parts == unsigned(trial_parts))
+          best_npart = trial_block_size / filterbank_resolution;
 
-	trial_block_size += filterbank_resolution;
+        trial_block_size += filterbank_resolution;
       }
 
       if (best_npart == 0)
-	throw Error (InvalidState, "dsp::IOManager::set_block_size",
-		     "could not find an overlapping block size "
-		     "for both Filterbank and Convolution");
+        throw Error (InvalidState, "dsp::IOManager::set_block_size",
+                     "could not find an overlapping block size "
+                     "for both Filterbank and Convolution");
 
       // WvS to-do: if filterbank also loses samples, then add nlost here
       block_resize = best_npart * filterbank_resolution;
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Classes/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -29,7 +29,9 @@
 	dsp/HasOutput.h dsp/Sink.h dsp/Multiplex.h \
 	dsp/Memory.h debug.h dsp/OperationThread.h dsp/FloatUnpacker.h \
 	dsp/UniversalInputBuffering.h dsp/OutputFile.h \
-	dsp/ObservationInterface.h dsp/GenericEightBitUnpacker.h     \
+	dsp/ObservationInterface.h \
+	dsp/GenericEightBitUnpacker.h \
+	dsp/GenericFourBitUnpacker.h \
 	dsp/CommandLineHeader.h dsp/OutputFileShare.h
 
 libClasses_la_SOURCES = ascii_header.c ASCIIObservation.C	    \
@@ -48,18 +50,21 @@
 	UnpackerIterator.C ObservationChange.C PrestoObservation.C  \
 	CloneArchive.C SignalPath.C Multiplex.C Memory.C \
 	OperationThread.C FloatUnpacker.C OutputFile.C \
-	ObservationInterface.C GenericEightBitUnpacker.C            \
+	ObservationInterface.C \
+	GenericEightBitUnpacker.C \
+	GenericFourBitUnpacker.C \
 	CommandLineHeader.C OutputFileShare.C
 
 if HAVE_MPI
 libClasses_la_SOURCES += MPIRoot.C MPITrans.C MPIServer.C mpi_Observation.C
 endif
 
-if HAVE_CUFFT
-nobase_include_HEADERS += dsp/MemoryCUDA.h dsp/GenericEightBitUnpackerCUDA.h
-libClasses_la_SOURCES += MemoryCUDA.C GenericEightBitUnpackerCUDA.cu \
-	check_error.C
-libClasses_la_LIBADD = @CUFFT_LIBS@ @CUDA_LIBS@
+if HAVE_CUDA
+nobase_include_HEADERS += dsp/MemoryCUDA.h dsp/GenericEightBitUnpackerCUDA.h \
+                          dsp/TimeSeriesCUDA.h
+libClasses_la_SOURCES += MemoryCUDA.C check_error.C GenericEightBitUnpackerCUDA.cu \
+                         TimeSeriesCUDA.cu
+libClasses_la_LIBADD = @CUDA_LIBS@
 endif
 
 check_PROGRAMS = test_BlockIterator test_environ
@@ -73,5 +78,5 @@
 
 LDADD = libClasses.la
 
-AM_CPPFLAGS += @CUFFT_CFLAGS@
+AM_CPPFLAGS += @CUDA_CFLAGS@ @PSRDADA_CFLAGS@
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Memory.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Memory.C
--- bl-dspsr-0+git20160405/Kernel/Classes/Memory.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Memory.C	2018-03-12 23:02:35.000000000 +0000
@@ -16,18 +16,19 @@
 
 void* dsp::Memory::do_allocate (size_t nbytes)
 {
-  DEBUG("dsp::Memory::allocate (" << nbytes << ")");
+  DEBUG("dsp::Memory::do_allocate (" << nbytes << ")");
   return malloc16 (nbytes);
 }
 
 void dsp::Memory::do_free (void* ptr)
 {
-  DEBUG("dsp::Memory::free (" << ptr << ")");
+  DEBUG("dsp::Memory::do_free (" << ptr << ")");
   free16 (ptr);
 }
 
 void dsp::Memory::do_zero (void* ptr, size_t nbytes)
 {
+  DEBUG("dsp::Memory::do_zero (" << (void*) ptr << "," << nbytes << ")");
   memset (ptr, 0, nbytes);
 }
 
@@ -39,11 +40,13 @@
 
 void* dsp::Memory::allocate (size_t nbytes)
 {
+  DEBUG("dsp::Memory::allocate (" << nbytes << ")");
   return get_manager()->do_allocate (nbytes);
 }
 
 void dsp::Memory::free (void* ptr)
 {
+  DEBUG("dsp::Memory::free (" << (void*) ptr << ")");
   get_manager()->do_free (ptr);
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/MemoryCUDA.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/MemoryCUDA.C
--- bl-dspsr-0+git20160405/Kernel/Classes/MemoryCUDA.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/MemoryCUDA.C	2018-03-12 23:02:35.000000000 +0000
@@ -5,8 +5,6 @@
  *
  ***************************************************************************/
 
-// #define _DEBUG 1
-
 #include "dsp/MemoryCUDA.h"
 #include "debug.h"
 
@@ -46,6 +44,12 @@
  *
  ***************************************************************************/
 
+CUDA::DeviceMemory::DeviceMemory (cudaStream_t _stream, int _device)
+{
+  stream = _stream; 
+  device = _device;
+} 
+
 void* CUDA::DeviceMemory::do_allocate (size_t nbytes)
 {
   DEBUG("CUDA::DeviceMemory::allocate cudaMalloc (" << nbytes << ")");
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Seekable.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Seekable.C
--- bl-dspsr-0+git20160405/Kernel/Classes/Seekable.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Seekable.C	2018-03-12 23:02:35.000000000 +0000
@@ -5,6 +5,10 @@
  *
  ***************************************************************************/
 
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
 #include "dsp/Seekable.h"
 #include "dsp/BitSeries.h"
 
@@ -13,6 +17,11 @@
 
 #include <string.h>
 
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
 using namespace std;
 
 //! Constructor
@@ -75,26 +84,26 @@
   {
     if (verbose)
       cerr << "dsp::Seekable::load_data total ndat=" << get_info()->get_ndat() 
-	   << " read_sample=" << read_sample << endl;
+           << " read_sample=" << read_sample << endl;
 
     if (read_sample > get_info()->get_ndat())
       throw Error (InvalidState, "dsp::Seekable::load_data",
-		   "read_sample="UI64" > ndat="UI64 "\n\t"
-		   "recycled="UI64" load_sample="UI64,
-		   read_sample, get_info()->get_ndat(),
-		   recycled, get_load_sample());
+                   "read_sample="UI64" > ndat="UI64 "\n\t"
+                   "recycled="UI64" load_sample="UI64,
+                   read_sample, get_info()->get_ndat(),
+                   recycled, get_load_sample());
 
     uint64_t samples_left = get_info()->get_ndat() - read_sample;
 
     if (verbose)
       cerr << "dsp::Seekable::load_data " << samples_left 
-	   << " samples remaining" << endl;
+           << " samples remaining" << endl;
 
     if (samples_left <= read_size)
     {
       if (verbose)
-	cerr << "dsp::Seekable::load_data end of data read_size="
-	     << samples_left << endl;
+        cerr << "dsp::Seekable::load_data end of data read_size="
+             << samples_left << endl;
 
       read_size = samples_left;
       end_of_data = true;
@@ -115,7 +124,7 @@
     if (verbose)
       cerr << "dsp::Seekable::load_data read_sample=" << read_sample
            << " != current_sample=" << current_sample 
-	   << " seek_bytes=" << toseek_bytes << endl;
+           << " seek_bytes=" << toseek_bytes << endl;
 
     int64_t seeked = seek_bytes (toseek_bytes);
     if (seeked < 0)
@@ -124,8 +133,8 @@
     // confirm that we be where we expect we be
     if (read_sample != (uint64_t) data->get_nsamples (seeked))
       throw Error (InvalidState, "dsp::Seekable::load_data", "seek mismatch"
-		   " read_sample="UI64" absolute_sample="UI64,
-		   read_sample, data->get_nsamples (seeked));
+                   " read_sample="UI64" absolute_sample="UI64,
+                   read_sample, data->get_nsamples (seeked));
 
     current_sample = read_sample;
   }
@@ -135,18 +144,42 @@
 
   if (toread_bytes < 1)
     throw Error (InvalidState, "dsp::Seekable::load_data",
-		 "invalid BitSeries state");
+                 "invalid BitSeries state");
+
+  int64_t bytes_read;
+
+#if HAVE_CUDA
+  // check if the bit series resides in device memory
+  CUDA::DeviceMemory * device_mem = dynamic_cast<CUDA::DeviceMemory*>(data->get_memory() );
+  if (device_mem)
+  {
+    cudaStream_t stream = device_mem->get_stream();
+    if (verbose)
+      cerr << "dsp::Seekable::load_data"
+        " call load_bytes_device ("<< toread_bytes << ")" <<endl;
 
+    bytes_read = load_bytes_device (into, toread_bytes, (void *) stream);
+    cudaStreamSynchronize (stream);
+  }
+  else
+  {
+    if (verbose)
+    cerr << "dsp::Seekable::load_data"
+      " call load_bytes ("<< toread_bytes << ")" <<endl;
+    bytes_read = load_bytes (into, toread_bytes);
+  }
+#else
   if (verbose)
     cerr << "dsp::Seekable::load_data"
       " call load_bytes("<< toread_bytes << ")" <<endl;
 
-  int64_t bytes_read = load_bytes (into, toread_bytes);
+  bytes_read = load_bytes (into, toread_bytes);
+#endif
 
   if (bytes_read < 0)
     throw Error (FailedCall, "dsp::Seekable::load_data",
-		 "load_bytes ("UI64") block_size=", toread_bytes,
-		 get_block_size());
+                 "load_bytes ("UI64") block_size="UI64, toread_bytes,
+                 get_block_size());
 
   if ((uint64_t)bytes_read < toread_bytes)
   {
@@ -170,7 +203,7 @@
 
     if (verbose)
       cerr << "dsp::Seekable::load_data overlap=" << get_overlap()
-	   << " to_copy=" << to_copy << endl;
+           << " to_copy=" << to_copy << endl;
 
     overlap_buffer->set_nchan( data->get_nchan() );
     overlap_buffer->set_npol ( data->get_npol() );
@@ -187,7 +220,7 @@
 
     if (verbose)
       cerr << "dsp::Seekable::load_data overlap buffer input_sample="
-	   << overlap_buffer->get_input_sample () << endl;
+           << overlap_buffer->get_input_sample () << endl;
   }
 }
 
@@ -234,7 +267,7 @@
 
   if (verbose)
     cerr << "dsp::Seekable::recycle_data recycle " 
-	 << to_recycle << " samples" << endl;
+         << to_recycle << " samples" << endl;
 
   if (to_recycle > get_load_size())
     to_recycle = get_load_size();
@@ -244,13 +277,15 @@
 
   if (verbose)
     cerr << "dsp::Seekable::recycle_data recycle " << recycle_bytes
-	 << " bytes (offset=" << offset_bytes << " bytes)" << endl;
+         << " bytes (offset=" << offset_bytes << " bytes)" << endl;
 
   unsigned char *into = data->get_rawptr();
   unsigned char *rbuf = from->get_rawptr() + offset_bytes;
 
   if (overlap_buffer)
-    memcpy (into, rbuf, size_t(recycle_bytes));
+  {
+    overlap_buffer->get_memory()->do_copy( into, rbuf, size_t(recycle_bytes));
+  }
   else
   {
     // perform an "overlap safe" memcpy
@@ -262,9 +297,9 @@
     while (recycle_bytes)
     {
       if (offset_bytes > recycle_bytes)
-	offset_bytes = recycle_bytes;
+        offset_bytes = recycle_bytes;
 
-      memcpy (into, rbuf, size_t(offset_bytes));
+      from->get_memory()->do_copy (into, rbuf, size_t(offset_bytes));
       
       recycle_bytes -= offset_bytes;
       into += offset_bytes;
@@ -278,9 +313,23 @@
   return to_recycle;
 }
 
+void dsp::Seekable::set_output (BitSeries* data)
+{
+  Input::set_output (data);
+}
+
 void dsp::Seekable::set_overlap_buffer (BitSeries* buffer)
 {
   overlap_buffer = buffer;
 }
 
+void dsp::Seekable::set_overlap_buffer_memory (Memory * memory)
+{
+  if (verbose)
+    cerr << "dsp::Seekable::set_overlap_buffer_memory()" << endl;
+  if (!overlap_buffer)
+    set_overlap_buffer( new BitSeries );
+  overlap_buffer->set_memory( memory );
+}
+
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/TimeSeries.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeries.C
--- bl-dspsr-0+git20160405/Kernel/Classes/TimeSeries.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeries.C	2018-03-12 23:02:35.000000000 +0000
@@ -6,9 +6,15 @@
  *
  ***************************************************************************/
 
+#include "config.h"
 #include "dsp/TimeSeries.h"
 #include "dsp/Memory.h"
 
+#ifdef HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/TimeSeriesCUDA.h"
+#endif
+
 #include "fsleep.h"
 #include "Error.h"
 
@@ -45,6 +51,7 @@
   reserve_nfloat = 0;
   input_sample = -1;
   zeroed_data = false;
+  engine = 0;
 }
 
 dsp::TimeSeries* dsp::TimeSeries::clone () const
@@ -65,7 +72,16 @@
 void dsp::TimeSeries::null_work (const TimeSeries* from)
 {
   order = from->order;
+
+#ifdef HAVE_CUDA
   memory = from->memory;
+  if (from->engine) 
+  {
+    set_engine (new CUDA::TimeSeriesEngine(memory));
+  }
+#else
+  memory = from->memory;
+#endif
 }
 
 dsp::TimeSeries::~TimeSeries()
@@ -139,8 +155,12 @@
 	   << int64_t((data-(float*)buffer)) << endl;
   }  
 
-  uint64_t fake_ndat = reserve_nfloat / get_ndim();
-  if (reserve_nfloat % get_ndim())
+  uint64_t reserve_step = get_ndim();
+  if (order == OrderTFP)
+    reserve_step *= get_nchan() * get_npol();
+
+  uint64_t fake_ndat = reserve_nfloat / reserve_step;
+  if (reserve_nfloat % reserve_step)
     fake_ndat ++;
 
   if (verbose)
@@ -459,13 +479,20 @@
     switch (order)
     {
     case OrderFPT:
-      for (unsigned ichan=0; ichan<get_nchan(); ichan++)
+      if (engine)
+      {
+        engine->copy_data_fpt (copy, idat_start, copy_ndat);
+      }
+      else
       {
-        for (unsigned ipol=0; ipol<get_npol(); ipol++)
+        for (unsigned ichan=0; ichan<get_nchan(); ichan++)
         {
-          float* to = get_datptr (ichan, ipol);
-          const float* from = copy->get_datptr(ichan,ipol) + offset;
-          memory->do_copy (to, from, size_t(byte_count));
+          for (unsigned ipol=0; ipol<get_npol(); ipol++)
+          {
+            float* to = get_datptr (ichan, ipol);
+            const float* from = copy->get_datptr(ichan,ipol) + offset;
+            memory->do_copy (to, from, size_t(byte_count));
+          }
         }
       }
       break;
@@ -710,3 +737,10 @@
 		 non_finite, nfloat * nchan * npol);
 }
 
+void dsp::TimeSeries::set_engine( Engine* _engine )
+{
+  engine = _engine;  // a null engine simply clears the current one
+  if (engine) engine->prepare (this);  // let the engine record its parent
+}
+
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/TimeSeriesCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeriesCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Classes/TimeSeriesCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeriesCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,261 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/TimeSeriesCUDA.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "Error.h"
+
+void check_error_stream (const char*, cudaStream_t);
+
+using namespace std;
+
+// copy ndat elements of type T for each chan/pol row; blockIdx.y selects the
+// row and the x dimension of the grid spans the samples within that row
+template<typename T>
+__global__ void copy_data_fpt_kernel(T * to, T * from, uint64_t to_stride,
+                                     uint64_t from_stride, uint64_t ndat)
+{
+  // widen before multiplying so the sample index cannot wrap at 32 bits
+  const uint64_t dx = (uint64_t) blockIdx.x * blockDim.x + threadIdx.x;
+  if (dx < ndat) to[blockIdx.y * to_stride + dx] = from[blockIdx.y * from_stride + dx];
+}
+
+CUDA::TimeSeriesEngine::TimeSeriesEngine (dsp::Memory * _memory)
+{
+  memory = dynamic_cast<CUDA::DeviceMemory*>(_memory);
+  buffer = NULL;
+  buffer_size = 0;
+
+  pinned_memory = new CUDA::PinnedMemory;
+  host_buffer = NULL;
+  host_buffer_size = 0;
+}
+
+// release the device staging buffer and the pinned host staging buffer
+CUDA::TimeSeriesEngine::~TimeSeriesEngine ()
+{
+  if (buffer) memory->do_free (buffer);
+  if (host_buffer) pinned_memory->do_free (host_buffer);
+}
+
+void CUDA::TimeSeriesEngine::prepare (dsp::TimeSeries * parent)
+{
+  to = parent;
+}
+
+// grow (never shrink) the staging buffer to at least nbytes; new storage is zeroed
+void CUDA::TimeSeriesEngine::prepare_buffer (unsigned nbytes)
+{
+  if (nbytes <= buffer_size)
+    return;
+
+  if (buffer) memory->do_free (buffer);
+  buffer_size = nbytes;
+  buffer = memory->do_allocate (buffer_size);
+  memory->do_zero(buffer, buffer_size);
+}
+
+// copy data from another time series to this time series
+void CUDA::TimeSeriesEngine::copy_data_fpt (const dsp::TimeSeries* from, 
+    uint64_t idat_start, uint64_t ndat)
+{
+  // nothing to copy; this also avoids dividing by nthread == 0 below
+  if (ndat == 0)
+    return;
+
+  nchan = to->get_nchan();
+  npol  = to->get_npol();
+  ndim  = to->get_ndim();
+
+  // current cuda device that is executing this function
+  cudaGetDevice (&device);
+
+#ifdef _DEBUG
+  cerr << "CUDA::TimeSeriesEngine::copy_data_fpt from=" << (void *) from 
+       << " idat_start=" << idat_start << " ndat=" << ndat << " device=" << device << endl;
+#endif
+
+  // stream and device upon which to TSE exists
+  to_stream = memory->get_stream();
+  to_device = memory->get_device();
+
+  // stream and device upon which from TSE exists (verify the cast before use)
+  const CUDA::DeviceMemory * from_mem = dynamic_cast<const CUDA::DeviceMemory*>( from->get_memory());
+  if (!from_mem)
+    throw Error (FailedSys, "CUDA::TimeSeriesEngine::copy_data_fpt", "From TimeSeries did not use DeviceMemory");
+  from_stream = from_mem->get_stream();
+  from_device = from_mem->get_device();
+
+  // chan/pol strides: measured in floats here, divided by ndim below
+  ichanpol_stride = 0;
+  ochanpol_stride = 0;
+  bchanpol_stride = ndat;
+
+  if (npol > 1)
+  {
+    ochanpol_stride = to->get_datptr (0,1) - to->get_datptr (0,0);
+    ichanpol_stride = from->get_datptr (0,1) - from->get_datptr (0,0);
+  }
+  else if (nchan > 1)
+  {
+    ochanpol_stride = to->get_datptr (1,0) - to->get_datptr (0,0);
+    ichanpol_stride = from->get_datptr (1,0) - from->get_datptr (0,0);
+  }
+
+  ichanpol_stride /= ndim;
+  ochanpol_stride /= ndim;
+
+#ifdef _DEBUG
+  cerr << "CUDA::TimeSeriesEngine::copy_data_fpt streams to="
+       << (void*) to_stream << " from=" << (void*) from_stream << endl;
+  cerr << "CUDA::TimeSeriesEngine::copy_data_fpt device to=" << to_device 
+       << " from=" << from_device << endl;
+  cerr  << "CUDA::TimeSeriesEngine::copy_data_fpt nchan=" << nchan << " ndim=" << ndim << " npol=" << npol << " ndat=" << ndat << endl;
+  cerr  << "CUDA::TimeSeriesEngine::copy_data_fpt istride=" << ichanpol_stride << " ostride=" << ochanpol_stride << " bstride=" << bchanpol_stride << endl;
+#endif
+
+  // configure the kernels
+  nthread = 1024;
+  if (nthread > ndat)
+    nthread = ndat;
+  blocks = dim3 (ndat / nthread, nchan*npol);
+  if (ndat % nthread)
+    blocks.x++;
+
+#ifdef _DEBUG
+  cerr << "blocks=(" << blocks.x << "," << blocks.y << ") threads=" << nthread << endl;
+#endif
+
+  if (from_device == to_device)
+    if (from_stream == to_stream) 
+      copy_data_fpt_same_stream (from, idat_start, ndat);
+    else
+      copy_data_fpt_same_device (from, idat_start, ndat);
+  else
+    copy_data_fpt_diff_device (from, idat_start, ndat);
+}
+
+// if both time series are within the same stream
+void CUDA::TimeSeriesEngine::copy_data_fpt_same_stream (const dsp::TimeSeries* from,
+    uint64_t idat_start, uint64_t ndat)
+{
+  cudaStream_t stream = memory->get_stream();
+  copy_data_fpt_kernel_multidim (to->get_datptr (0,0), from->get_datptr (0,0),
+                                 ochanpol_stride, ichanpol_stride, 
+                                 idat_start, ndat, stream);
+}
+
+// if both time series are in different streams, but the same device
+void CUDA::TimeSeriesEngine::copy_data_fpt_same_device (const dsp::TimeSeries* from,
+    uint64_t idat_start, uint64_t ndat)
+{
+  size_t nbytes = nchan * ndim * npol * ndat * sizeof(float);
+
+  // the from TSE must have a CUDA engine: its staging buffer is used below
+  CUDA::TimeSeriesEngine * from_engine = dynamic_cast<CUDA::TimeSeriesEngine*>(from->get_engine());
+  if (!from_engine)
+    throw Error (InvalidState, "CUDA::TimeSeriesEngine::copy_data_fpt_same_device", "From TimeSeries has no CUDA engine");
+  // ensure the buffers in each time series are allocated
+  from_engine->prepare_buffer (nbytes);
+  prepare_buffer (nbytes);
+
+  // copy from -> from_buffer
+  copy_data_fpt_kernel_multidim ((float *) from_engine->buffer, from->get_datptr (0,0), 
+                                 bchanpol_stride, ichanpol_stride, 
+                                 idat_start, ndat, from_stream);
+
+  // copy from_buffer -> to_buffer  
+  cudaMemcpyAsync (buffer, from_engine->buffer, nbytes, cudaMemcpyDeviceToDevice, from_stream);
+  cudaStreamSynchronize(from_stream);
+
+  // copy buffer -> to
+  copy_data_fpt_kernel_multidim (to->get_datptr (0,0), (float *) buffer, 
+                                 ochanpol_stride, bchanpol_stride, 
+                                 0, ndat, to_stream);
+}
+
+// if the two time series reside on different devices (staged via pinned host memory)
+void CUDA::TimeSeriesEngine::copy_data_fpt_diff_device (const dsp::TimeSeries* from,
+    uint64_t idat_start, uint64_t ndat)
+{
+  size_t nbytes = nchan * ndim * npol * ndat * sizeof(float);
+
+  // the from TSE must have a CUDA engine; check before switching devices
+  CUDA::TimeSeriesEngine * from_engine = dynamic_cast<CUDA::TimeSeriesEngine*>(from->get_engine());
+  if (!from_engine)
+    throw Error (InvalidState, "CUDA::TimeSeriesEngine::copy_data_fpt_diff_device", "From TimeSeries has no CUDA engine");
+
+  // if the current device is not the to device, switch and allocate
+  if (device != to_device)
+    cudaSetDevice (to_device);
+  prepare_buffer (nbytes);
+
+  // switch to the from_device and ensure its buffer is allocated
+  cudaSetDevice (from_device);
+  from_engine->prepare_buffer (nbytes);
+
+  // copy from -> from_buffer
+  copy_data_fpt_kernel_multidim ((float *) from_engine->buffer, from->get_datptr (0,0),
+                                 bchanpol_stride, ichanpol_stride,
+                                 idat_start, ndat, from_stream);
+
+  // if the host buffer is too small, allocate some pinned memory
+  if (host_buffer_size < nbytes)
+  {
+    if (host_buffer)
+      pinned_memory->do_free (host_buffer);
+    host_buffer = pinned_memory->do_allocate (nbytes);
+    host_buffer_size = nbytes;
+  }
+
+  // copy from_buffer -> host_buffer, then wait for the D2H transfer to complete
+  cudaMemcpyAsync (host_buffer, from_engine->buffer, nbytes, cudaMemcpyDeviceToHost, from_stream);
+  cudaStreamSynchronize (from_stream);
+
+  // switch to the to_device
+  cudaSetDevice (to_device);
+
+  // copy host_buffer -> to_buffer
+  cudaMemcpyAsync (buffer, host_buffer, nbytes, cudaMemcpyHostToDevice, to_stream);
+
+  // copy to_buffer -> to
+  copy_data_fpt_kernel_multidim (to->get_datptr (0,0), (float *) buffer,
+                                 ochanpol_stride, bchanpol_stride,
+                                 0, ndat, to_stream);
+
+  if (to_device != device)
+    cudaSetDevice (device);
+}
+
+
+void CUDA::TimeSeriesEngine::copy_data_fpt_kernel_multidim (float * to, const float * from, 
+                                 uint64_t to_stride, uint64_t from_stride, 
+                                 uint64_t idat_start, uint64_t ndat, 
+                                 cudaStream_t stream)
+{
+  if (ndim == 2)
+  {
+    float2 * to_ptr   = (float2 *) to;
+    float2 * from_ptr = (float2 *) from;
+    copy_data_fpt_kernel<float2><<<blocks,nthread,0,stream>>> (
+      to_ptr, from_ptr + idat_start, to_stride, from_stride, ndat);
+  }
+  else
+  {
+    float * from_ptr = (float *) from;
+    copy_data_fpt_kernel<float><<<blocks,nthread,0,stream>>> (
+      to, from_ptr + idat_start, to_stride, from_stride, ndat);
+  }
+}
+
+
+
+
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRIterator.h
--- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRIterator.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRIterator.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRIterator.h,v $
-   $Revision: 1.2 $
-   $Date: 2008/07/13 00:38:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/apsr/dsp/APSRIterator.h
 
 #ifndef __APSRIterator_h
 #define __APSRIterator_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h,v $
-   $Revision: 1.4 $
-   $Date: 2008/07/13 00:38:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h
 
 #ifndef __APSRTwoBitCorrection_h
 #define __APSRTwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRUnpacker.h,v $
-   $Revision: 1.5 $
-   $Date: 2009/06/17 10:16:53 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/apsr/dsp/APSRUnpacker.h
 
 #ifndef __APSRUnpacker_h
 #define __APSRUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/asp/dsp/ASPUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/asp/dsp/ASPUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/asp/dsp/ASPUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/asp/dsp/ASPUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/asp/dsp/ASPUnpacker.h,v $
-   $Revision: 1.3 $
-   $Date: 2006/07/09 13:27:03 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/asp/dsp/ASPUnpacker.h
 
 #ifndef __ASPUnpacker_h
 #define __ASPUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/BPSRCrossUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/BPSRCrossUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/BPSRCrossUnpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/BPSRCrossUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -1,12 +1,11 @@
 /***************************************************************************
  *
- *   Copyright (C) 2008-2014 by Andrew Jameson & Willem van Straten
+ *   Copyright (C) 2008 - 2016 by Andrew Jameson & Willem van Straten
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
 
 #include "dsp/BPSRCrossUnpacker.h"
-//#include "dsp/DADABuffer.h"
 #include "dsp/ASCIIObservation.h"
 #include "Error.h"
 
@@ -19,6 +18,16 @@
 {
   gain_polx = -1;
   unpack_ppqq_only = false;
+
+  /*
+    This constant is an observed approximate mean value of
+    GAIN_POL1 and GAIN_POL2 and it is applied simply to keep
+    rescale factors close to unity
+  */
+  reference_gain = 100000.0 / 256.0;
+  ppqq_scale[0] = 1.0;
+  ppqq_scale[1] = 1.0;
+  pq_scale = 1.0;
 }
 
 //! Return true if the unpacker support the specified output order
@@ -93,44 +102,90 @@
     const Input * in = input->get_loader();
     const Observation * obs = in->get_info();
     const ASCIIObservation * info = dynamic_cast<const ASCIIObservation *>(obs);
-    if (info)
+    if (!info)
+      throw Error (InvalidState, "dsp::BPSRCrossUnpacker::unpack",
+                   "ASCIIObservation required and not available");
+
+
+    // read GAIN_POL1 and GAIN_POL2 from the header.
+    // These gains determine the ppqq_scale and pq_scale factors
+    // that are computed below.
+
+    info->custom_header_get ("GAIN_POL1", "%f", &gain_pol1);
+    info->custom_header_get ("GAIN_POL2", "%f", &gain_pol2);
+
+    // attempt to get the FACTOR_POLX from the header. This completely describes
+    // the factor necessary to correct the AB* values
+    try
+    {
+      if (info->custom_header_get ("FACTOR_POLX", "%f", &gain_polx) == 1)
+      {
+        if (verbose)
+          cerr << "dsp::BPSRCrossUnpacker::unpack FACTOR_POLX="
+               << gain_polx << endl;
+      }
+    }
+    catch (Error& error)
     {
-      if (info)
+      // older method that makes the assumption that the AA and BB are in
+      // bit window 1. AB* is in bit window 3. The correct calculation is
+      //    gain_polx = polx * 2^11 / (2^8 * (bwx - bw))
+      unsigned polx;
+      if (info->custom_header_get ("GAIN_POLX", "%u", &polx) == 1)
       {
-        // attempt to get the FACTOR_POLX from the header. This completely describes
-        // the factor necessary to correct the AB* values
-        try
+        if (polx == 0)
         {
-          if (info->custom_header_get ("FACTOR_POLX", "%f", &gain_polx) == 1)
-          {
-            if (verbose)
-              cerr << "dsp::BPSRCrossUnpacker::unpack FACTOR_POLX=" << gain_polx << endl;
-          }
+          gain_polx = 1;
         }
-        catch (Error& error)
+        else
         {
-          // older method that makes the assumption that the AA and BB are in
-          // bit window 1. AB* is in bit window 3. The correct calculation is
-          //    gain_polx = polx * 2^11 / (2^8 * (bwx - bw))
-          unsigned polx;
-          if (info->custom_header_get ("GAIN_POLX", "%u", &polx) == 1)
-          {
-            if (polx == 0)
-            {
-              gain_polx = 1;
-            }
-            else
-            {
-              gain_polx = ((float) polx) / 32;
-            }
-          }
-          if (verbose)
-            cerr << "dsp::BPSRCrossUnpacker::unpack GAIN_POLX=" << polx << " FACTOR_POLX=" << gain_polx << endl;
+          gain_polx = ((float) polx) / 32;
         }
       }
+      if (verbose)
+        cerr << "dsp::BPSRCrossUnpacker::unpack GAIN_POLX="
+             << polx << " FACTOR_POLX=" << gain_polx << endl;
     }
-  }
 
+    // try to read the Bit Window of the PPQQ data
+    try
+    {
+      if (info->custom_header_get ("PPQQ_BW", "%u", &ppqq_bw) == 1)
+      {
+        if (verbose)
+          cerr << "dsp::BPSRCrossUnpacker::unpack PPQQ_BW="
+               << ppqq_bw << endl;
+      }
+    }
+    catch (Error& error)
+    {
+      ppqq_bw = 1;
+      if (verbose)
+        cerr << "dsp::BPSRCrossUnpacker::unpack assuming PPQQ_BW="
+             << ppqq_bw << endl;
+
+    }
+
+    // each bit window suppresses by 256 (2^8)
+    float ppqq_bw_scale = powf (2, 8*ppqq_bw);
+    if (verbose)
+    {
+      cerr << "dsp::BPSRCrossUnpacker::unpack raw GAIN_POL1=" << gain_pol1 
+           << " GAIN_POL1=" << gain_pol1/ppqq_bw_scale << endl;
+      cerr << "dsp::BPSRCrossUnpacker::unpack raw GAIN_POL2=" << gain_pol2 
+           << " GAIN_POL2=" << gain_pol2/ppqq_bw_scale << endl;
+    }
+    gain_pol1 /= ppqq_bw_scale;
+    gain_pol2 /= ppqq_bw_scale;
+
+    float p_scale = reference_gain/gain_pol1;
+    float q_scale = reference_gain/gain_pol2;
+
+    ppqq_scale[0] = p_scale * p_scale;
+    ppqq_scale[1] = q_scale * q_scale;
+    pq_scale = p_scale * q_scale / gain_polx;
+  }
+  
   switch ( output->get_order() )
   {
   case TimeSeries::OrderFPT:
@@ -165,7 +220,7 @@
             for (unsigned bt = 0; bt < ndat; bt++)
             {
               // hist[ *from ] ++;
-              into[bt] = float( *from );
+              into[bt] = float( *from ) * ppqq_scale[ipol];
               from += step;
             }
           }
@@ -174,7 +229,7 @@
             for (unsigned bt = 0; bt < ndat; bt++)
             {
               if (!unpack_ppqq_only)
-                into[bt] = float( ((char) *from) ) / gain_polx;
+                into[bt] = float( ((char) *from) ) * pq_scale;
               from += step;
             }
           }
@@ -185,7 +240,7 @@
   case TimeSeries::OrderTFP:
     {
       if (verbose)
-        cerr << "dsp::BPSRCrossUnpacker::unpack Output order OrderTFP\n" << endl;
+        cerr << "dsp::BPSRCrossUnpacker::unpack Output order OrderTFP" << endl;
 
       const unsigned char* from = input->get_rawptr();
       float* into = output->get_dattfp();
@@ -201,6 +256,11 @@
           into[2] = float( from[1] ) + 0.5;
           into[3] = float( from[3] ) + 0.5;
 
+          into[0] *= ppqq_scale[0];
+          into[1] *= ppqq_scale[1];
+          into[2] *= ppqq_scale[0];
+          into[3] *= ppqq_scale[1];
+
           into += 4;
           from += 8;
         }
@@ -218,10 +278,14 @@
           into[6] = float( ((char) from[6]) ) + 0.5;
           into[7] = float( ((char) from[7]) ) + 0.5;
 
-          into[2] /= gain_polx;
-          into[3] /= gain_polx;
-          into[6] /= gain_polx;
-          into[7] /= gain_polx;
+          into[0] *= ppqq_scale[0];
+          into[1] *= ppqq_scale[1];
+          into[2] *= pq_scale;
+          into[3] *= pq_scale;
+          into[4] *= ppqq_scale[0];
+          into[5] *= ppqq_scale[1];          
+          into[6] *= pq_scale;
+          into[7] *= pq_scale;
           
           into += 8;
           from += 8;
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -44,9 +44,20 @@
     unsigned get_output_ipol (unsigned idig) const;
 
     float gain_polx;
+    float gain_pol1;
+    float gain_pol2;
+    unsigned ppqq_bw;
 
   private:
 
+    float reference_gain;
+
+    float ppqq_scale[2];
+
+    float pq_scale;
+    
+  private:
+
     bool unpack_ppqq_only;
 
   };
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h,v $
-   $Revision: 1.4 $
-   $Date: 2008/10/08 23:19:06 $
-   $Author: sixbynine $ */
+// dspsr/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h
 
 #ifndef __BPSRUnpacker_h
 #define __BPSRUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,273 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2009
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "dsp/CASPSRSingleUnpacker.h"
+#include "dsp/BitTable.h"
+
+#include "Error.h"
+
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/CASPSRUnpackerCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
+#include <errno.h>
+
+using namespace std;
+
+static void* const undefined_stream = (void *) -1;
+
+dsp::CASPSRSingleUnpacker::CASPSRSingleUnpacker (const char* _name) : HistUnpacker (_name)
+{
+  if (verbose)
+    cerr << "dsp::CASPSRSingleUnpacker ctor" << endl;
+
+  set_nstate (256);
+  gpu_stream = undefined_stream;
+
+  table = new BitTable (8, BitTable::TwosComplement);
+
+#if HAVE_CUDA
+  int device;
+  struct cudaDeviceProp gpu;
+  cudaGetDevice(&device);
+  cudaGetDeviceProperties (&gpu, device);
+  threadsPerBlock = gpu.maxThreadsPerBlock;
+#endif
+
+  device_prepared = false;
+}
+
+dsp::CASPSRSingleUnpacker::~CASPSRSingleUnpacker ()
+{
+}
+
+dsp::CASPSRSingleUnpacker * dsp::CASPSRSingleUnpacker::clone () const
+{
+  return new CASPSRSingleUnpacker (*this);
+}
+
+//! Return true if the unpacker can operate on the specified device
+bool dsp::CASPSRSingleUnpacker::get_device_supported (Memory* memory) const
+{
+#if HAVE_CUDA
+  if (verbose)
+    cerr << "dsp::CASPSRSingleUnpacker::get_device_supported HAVE_CUDA" << endl;
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory );
+#else
+  return false;
+#endif
+}
+
+//! Set the device on which the unpacker will operate
+void dsp::CASPSRSingleUnpacker::set_device (Memory* memory)
+{
+#if HAVE_CUDA
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (gpu_mem)
+  {
+    gpu_stream = (void *) gpu_mem->get_stream();
+    if (verbose)
+      cerr << "dsp::CASPSRSingleUnpacker::set_device using gpu memory" << endl;
+  }
+  else
+  {
+    if (verbose)
+      cerr << "dsp::CASPSRSingleUnpacker::set_device using cpu memory" << endl;
+    gpu_stream = undefined_stream;
+    Unpacker::set_device (memory);
+  }
+#else
+  Unpacker::set_device (memory);
+#endif
+  device_prepared = true;
+}
+
+
+bool dsp::CASPSRSingleUnpacker::matches (const Observation* observation)
+{
+  return observation->get_machine()== "CASPSR"
+    && observation->get_nbit() == 8;
+}
+
+// default CPU unpacker for CASPSR format
+void dsp::CASPSRSingleUnpacker::unpack_default ()
+{
+  uint64_t ndat = input->get_ndat();
+  const float* lookup = table->get_values ();
+
+  const uint64_t * from64 = (uint64_t *) input->get_rawptr();
+
+  unsigned long* hist_p0 = get_histogram (0);
+  unsigned long* hist_p1 = get_histogram (1);
+
+  float * into_p0 = output->get_datptr (0, 0);
+  float * into_p1 = output->get_datptr (0, 1);
+
+  // two byte-wise views of the same 64-bit word: unsigned for the lookup
+  // table, explicitly signed (plain char may be unsigned) for the histogram
+  uint64_t val64;
+  unsigned char * val8 = (unsigned char *) &val64;
+  signed char * val8h = (signed char *) &val64;
+  // process 4 samples, from 2 pols per loop
+  for (uint64_t idat=0; idat<ndat; idat+=4)
+  {
+    // read 8 values
+    val64 = *from64;
+
+    into_p0[0] = lookup[ val8[0] ];
+    into_p0[1] = lookup[ val8[1] ];
+    into_p0[2] = lookup[ val8[2] ];
+    into_p0[3] = lookup[ val8[3] ];
+
+    into_p1[0] = lookup[ val8[4] ];
+    into_p1[1] = lookup[ val8[5] ];
+    into_p1[2] = lookup[ val8[6] ];
+    into_p1[3] = lookup[ val8[7] ];
+
+    hist_p0[int(val8h[0])+128]++;
+    hist_p0[int(val8h[1])+128]++;
+    hist_p0[int(val8h[2])+128]++;
+    hist_p0[int(val8h[3])+128]++;
+
+    hist_p1[int(val8h[4])+128]++;
+    hist_p1[int(val8h[5])+128]++;
+    hist_p1[int(val8h[6])+128]++;
+    hist_p1[int(val8h[7])+128]++;
+
+    from64  += 1;
+    into_p0 += 4;
+    into_p1 += 4;
+  }
+}
+
+void dsp::CASPSRSingleUnpacker::unpack (uint64_t ndat,
+                                  const unsigned char* from,
+                                  float* into,
+                                  const unsigned fskip,
+                                  unsigned long* hist)
+{
+  if (verbose)
+    cerr << "dsp::CASPSRSingleUnpacker::unpack(...)" << endl;
+  const float* lookup = table->get_values ();
+
+  // each iteration advances the input by two 32-bit words and the output by 4*fskip floats
+  const unsigned into_stride = fskip * 4;
+  const unsigned from_stride = 2;
+
+  // read 4 samples at a time
+  const uint32_t * from32 = (const uint32_t *) from;
+  uint32_t val32;
+  unsigned char * val8 = (unsigned char *) &val32;
+
+  // each iteration unpacks 4 bytes and histograms them by raw (unsigned) byte value
+  for (uint64_t idat=0; idat < ndat; idat+=4)
+  {
+    // read 4 uint8_t (actually int8_t)
+    val32 = *from32;
+
+    into[0] = lookup[ val8[0] ];
+    into[1] = lookup[ val8[1] ];
+    into[2] = lookup[ val8[2] ];
+    into[3] = lookup[ val8[3] ];
+
+    hist[val8[0]]++;
+    hist[val8[1]]++;
+    hist[val8[2]]++;
+    hist[val8[3]]++;
+
+    from32 += from_stride;
+    into += into_stride;
+  }
+}
+
+void dsp::CASPSRSingleUnpacker::unpack ()
+{
+
+#if HAVE_CUDA
+  if (gpu_stream != undefined_stream)
+  {
+    unpack_on_gpu ();
+    return;
+  }
+#endif
+
+  // some programs (digifil) do not call set_device
+  if (! device_prepared)
+    set_device ( Memory::get_manager ());
+
+  const uint64_t ndat  = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+  const unsigned ndim  = input->get_ndim();
+
+  if (ndim == 1 && npol == 2 && nchan == 1)
+  {
+    unpack_default();
+    return;
+  }
+
+  const unsigned fskip = ndim;
+  unsigned offset = 0;
+
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      if (ipol==1)
+        offset = 4;
+      for (unsigned idim=0; idim<ndim; idim++)
+      {
+        const unsigned char* from = input->get_rawptr() + offset;
+        float* into = output->get_datptr (ichan, ipol) + idim;
+        unsigned long* hist = get_histogram (ipol);
+
+        unpack (ndat, from, into, fskip, hist);
+        offset ++;
+      }
+    }
+  }
+}
+
+unsigned dsp::CASPSRSingleUnpacker::get_resolution () const { return 1024; }
+
+#if HAVE_CUDA
+
+// unpack every channel of the input with caspsr_unpack, issuing the work
+// on the stream that was recorded by set_device
+void dsp::CASPSRSingleUnpacker::unpack_on_gpu ()
+{
+  const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+  const unsigned npol = input->get_npol();
+
+  const unsigned char* from = input->get_rawptr();
+  cudaStream_t stream = (cudaStream_t) gpu_stream;
+
+  // NOTE(review): only the pol 0 and pol 1 outputs are addressed - presumably npol == 2; confirm
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+    float * into_pola = output->get_datptr(ichan, 0);
+    float * into_polb = output->get_datptr(ichan, 1);
+
+    caspsr_unpack (stream, ndat*ndim, table->get_scale(), 
+                   from, into_pola, into_polb,
+                   threadsPerBlock);
+
+    // advance the input pointer past this channel (all polarizations)
+    from += ndat*ndim*npol;
+  }
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -42,6 +42,14 @@
   state = Idle;
   thread_count = 0;
 
+#if HAVE_CUDA
+  int device;
+  struct cudaDeviceProp gpu;
+  cudaGetDevice(&device);
+  cudaGetDeviceProperties (&gpu, device);
+  threadsPerBlock = gpu.maxThreadsPerBlock;
+#endif
+
   device_prepared = false;
   single_thread = true;
 }
@@ -82,18 +90,8 @@
   if (gpu_mem)
   {
     gpu_stream = (void *) gpu_mem->get_stream();
-#ifdef USE_TEXTURE_MEMORY
-    if (verbose)
-      cerr << "dsp::CASPSRUnpacker::set_device using texture memory" << endl;
-    CUDA::TextureMemory * texture_mem = new CUDA::TextureMemory (gpu_mem->get_stream());
-    texture_mem->set_format_signed(8, 0, 0, 0);
-    texture_mem->set_symbol("caspsr_unpack_tex");
-    staging.set_memory( texture_mem );
-#else
     if (verbose)
       cerr << "dsp::CASPSRUnpacker::set_device using gpu memory" << endl;
-    staging.set_memory( memory );
-#endif
   }
   else
   {
@@ -346,43 +344,28 @@
 void dsp::CASPSRUnpacker::unpack_on_gpu ()
 {
   const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+  const unsigned npol = input->get_npol();
 
-  staging.Observation::operator=( *input );
-  staging.resize(ndat);
-
-  // staging buffer on the GPU for packed data
-  unsigned char* d_staging = staging.get_rawptr();
-#ifdef USE_TEXTURE_MEMORY
-  if (verbose)
-    cerr << "dsp::CASPSRUnpacker::unpack_on_gpu: creating TextureMemory" << endl;
-
-  CUDA::TextureMemory * gpu_mem = dynamic_cast< CUDA::TextureMemory*>( staging.get_memory() );
-  if (ndat > 0)
-    gpu_mem->activate ( d_staging );
-#endif
- 
-  const unsigned char* from= input->get_rawptr();
-
-  float* into_pola = output->get_datptr(0,0);
-  float* into_polb = output->get_datptr(0,1);
+  const unsigned char* from = input->get_rawptr();
+  float * into_pola, * into_polb;
+  unsigned ichan;
 
   cudaStream_t stream = (cudaStream_t) gpu_stream;
-
   cudaError error;
 
-  if (stream)
-    error = cudaMemcpyAsync (d_staging, from, ndat*2,
-                             cudaMemcpyHostToDevice, stream);
-  else
-    error = cudaMemcpy (d_staging, from, ndat*2, cudaMemcpyHostToDevice);
+  for (ichan=0; ichan<nchan; ichan++)
+  {
+    into_pola = output->get_datptr(ichan, 0);
+    into_polb = output->get_datptr(ichan, 1);
 
-  if (error != cudaSuccess)
-    throw Error (FailedCall, "CASPSRUnpacker::unpack_on_gpu",
-                 "cudaMemcpy%s %s", stream?"Async":"", 
-                 cudaGetErrorString (error));
+    caspsr_unpack (stream, ndat*ndim, table->get_scale(), 
+                   from, into_pola, into_polb,
+                   threadsPerBlock);
 
-  caspsr_unpack (stream, ndat, table->get_scale(), 
-                 d_staging, into_pola, into_polb);
+    from += ndat*ndim*npol;
+  }
 }
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -11,14 +11,10 @@
 
 #include "Error.h"
 
-// threads per block - C1060=256 [TODO CHECK below if changing]
-#define __CASPSR_UNPACK_TPB 256
-
-// global static texture declaration for CASPSR gpu unpacker
-texture<int8_t, 1, cudaReadModeElementType> caspsr_unpack_tex;
-
 using namespace std;
 
+void check_error_stream (const char*, cudaStream_t);
+
 /* 
    Unpack the two real-valued input polarizations into an interleaved
    array suited to the twofft algorithm described in Section 12.3
@@ -45,100 +41,56 @@
   output[7] = convert(scale,input[index].val[7]);
 }
 
-void check_error (const char*);
-
-#ifdef USE_TEXTURE_MEMORY
-// ndim 1 unpacker uses texture memory for reads
-__global__ void unpack_real_ndim1 (float* into_pola, float* into_polb, float scale)
+__global__ void unpack_real_ndim1 (uint64_t ndat, float scale,
+				   int8_t * from, float* into_pola, float* into_polb) 
 {
-  const int idx                 = blockIdx.x*blockDim.x + threadIdx.x;
-  const int sample_idx          = idx * 8;
-  unsigned int shared_idx       = threadIdx.x * 4;
-  const uint64_t output_idx     = blockIdx.x * blockDim.x * 4;
-  const unsigned int half_block = blockDim.x / 2;
-
-  // n.b. this is blockDim.x * 4 [hardcoded by default]
-  __shared__ float pola[4 * __CASPSR_UNPACK_TPB];
-  __shared__ float polb[4 * __CASPSR_UNPACK_TPB];
+  extern __shared__ int8_t sdata[];
 
-  // loads 8 samples per thread (4 per poln)
-  unsigned i = 0;
+  unsigned idx_shm = threadIdx.x;
+  unsigned idx     = (8 * blockIdx.x * blockDim.x) + threadIdx.x;
+  unsigned i;
 
-  // write 4 samples from each poln into shared memory
-  for (i=0; i<4; i++)
+  // each thread will load 8 values (coalesced) from GMEM to SHM
+  for (i=0; i<8; i++)
   {
-
-    pola[shared_idx + i] = (((float) tex1Dfetch(caspsr_unpack_tex, sample_idx + i)) + 0.5) * scale;
-    polb[shared_idx + i] = (((float) tex1Dfetch(caspsr_unpack_tex, sample_idx + i + 4)) + 0.5) * scale;
+    if (idx < 2*ndat)
+    {
+      sdata[idx_shm] = from[idx];
+
+      idx     += blockDim.x;
+      idx_shm += blockDim.x;
+    }
   }
 
   __syncthreads();
 
-  // first half threads write poln A
-  if (threadIdx.x < half_block)
-  {
-    unsigned int tid = 2 * threadIdx.x + (48 * ((int) (threadIdx.x/8)));
-    float * to = into_pola + output_idx;
+  idx     = (4 * blockIdx.x * blockDim.x) + threadIdx.x;
+  idx_shm = threadIdx.x + ((threadIdx.x / 4) * 4);
 
-    to[tid + 0]  = pola[tid + 0];
-    to[tid + 1]  = pola[tid + 1];
-    to[tid + 16] = pola[tid + 16];
-    to[tid + 17] = pola[tid + 17];
-    to[tid + 32] = pola[tid + 32];
-    to[tid + 33] = pola[tid + 33];
-    to[tid + 48] = pola[tid + 48];
-    to[tid + 49] = pola[tid + 49];
-  }
-  // second half threads write poln B
-  else
+  // each thread will write 4 values (coalesced) from SHM to GMEM
+  for (i=0; i<4; i++)
   {
-    unsigned int tid = 2 * (threadIdx.x - half_block) + (48 * ((int) ((threadIdx.x-half_block)/8)));
-    float * to = into_polb + output_idx;
-
-    to[tid + 0]  = polb[tid + 0];
-    to[tid + 1]  = polb[tid + 1];
-    to[tid + 16] = polb[tid + 16];
-    to[tid + 17] = polb[tid + 17];
-    to[tid + 32] = polb[tid + 32];
-    to[tid + 33] = polb[tid + 33];
-    to[tid + 48] = polb[tid + 48];
-    to[tid + 49] = polb[tid + 49];
+    if (idx < ndat)
+    {
+      into_pola[idx] = ((float) sdata[idx_shm]   + 0.5) * scale; 
+      into_polb[idx] = ((float) sdata[idx_shm+4] + 0.5) * scale;
+
+      idx += blockDim.x;
+      idx_shm += blockDim.x * 2;
+    }
   }
 }
-#else
-__global__ void unpack_real_ndim1 (uint64_t ndat, float scale,
-				   const unsigned char* stagingBufGPU,
-				   float* into_pola, float* into_polb) 
-{
-  uint64_t sampleTmp = blockIdx.x*blockDim.x + threadIdx.x; 
-
-  uint64_t outputIndex = sampleTmp * 4;
-  sampleTmp = sampleTmp * 8;
- 
-  float* to_A = into_pola + outputIndex;
-  float* to_B = into_polb + outputIndex;
-
-  const int8_t* from = reinterpret_cast<const int8_t*>( stagingBufGPU ) + sampleTmp;
-
-  to_A[0] = ((float) from[0] + 0.5) * scale;
-  to_A[1] = ((float) from[1] + 0.5) * scale;
-  to_A[2] = ((float) from[2] + 0.5) * scale;
-  to_A[3] = ((float) from[3] + 0.5) * scale;
-
-  to_B[0] = ((float) from[4] + 0.5) * scale;
-  to_B[1] = ((float) from[5] + 0.5) * scale;
-  to_B[2] = ((float) from[6] + 0.5) * scale;
-  to_B[3] = ((float) from[7] + 0.5) * scale;
-}
-#endif
 
 void caspsr_unpack (cudaStream_t stream, const uint64_t ndat, float scale, 
-                    unsigned char const* input, float* pol0, float* pol1)
+                    unsigned char const* input, float* pol0, float* pol1,
+                    int nthread)
 {
-  int nthread = __CASPSR_UNPACK_TPB;
 
   // each thread will unpack 4 time samples from each polarization
-  int nblock = ndat / (4*nthread);
+  int nsamp_per_block = 4 * nthread;
+  int nblock = ndat / nsamp_per_block;
+  if (ndat % nsamp_per_block)
+    nblock++;
 
 #ifdef _DEBUG
   cerr << "caspsr_unpack ndat=" << ndat << " scale=" << scale 
@@ -146,22 +98,10 @@
        << " nthread=" << nthread << endl;
 #endif
 
-#ifdef USE_TEXTURE_MEMORY
-  unpack_real_ndim1<<<nblock,nthread,0,stream>>> (pol0, pol1, scale);
-#else
-  unpack_real_ndim1<<<nblock,nthread,0,stream>>> (ndat, scale, input, pol0, pol1);
-#endif
-
-  // AJ's theory... 
-  // If there are no stream synchronises on the input then the CPU pinned memory load from the
-  // input class might be able to get ahead of a whole sequence of GPU operations, and even exceed
-  // one I/O loop. Therefore this should be a reuqirement to have a stream synchronize some time
-  // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory
-
-  // put it here for now
-  cudaStreamSynchronize(stream);
-
+  int8_t * from = (int8_t *) input;
+  size_t shm_bytes = 8 * nthread;
+  unpack_real_ndim1<<<nblock,nthread,shm_bytes,stream>>> (ndat, scale, from, pol0, pol1);
 
   if (dsp::Operation::record_time || dsp::Operation::verbose)
-    check_error ("caspsr_unpack");
+    check_error_stream ("caspsr_unpack", stream);
 }
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,60 @@
+/*
+ * dspsr/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h
+ */
+
+#ifndef __dsp_CASPSRSingleUnpacker_h
+#define __dsp_CASPSRSingleUnpacker_h
+
+#include "dsp/EightBitUnpacker.h"
+
+namespace dsp {
+  //! Unpacker for CASPSR data; declares a generic unpack() and a GPU routine (unpack_on_gpu)
+  class CASPSRSingleUnpacker : public HistUnpacker
+  {
+  public:
+
+    //! Constructor
+    CASPSRSingleUnpacker (const char* name = "CASPSRSingleUnpacker");
+    ~CASPSRSingleUnpacker ();
+
+    //! Cloner (calls new)
+    virtual CASPSRSingleUnpacker * clone () const;
+
+    //! Return true if the unpacker can operate on the specified device
+    bool get_device_supported (Memory*) const;
+
+    //! Set the device on which the unpacker will operate
+    void set_device (Memory*);
+
+  protected:
+    //! Bit table; its get_scale() value is handed to caspsr_unpack by unpack_on_gpu
+    Reference::To<BitTable> table;
+
+    //! Return true if we can convert the Observation
+    bool matches (const Observation* observation);
+    //! Unpack the input; NOTE(review): appears to delegate ndim=1, npol=2, nchan=1 data to unpack_default — confirm
+    void unpack ();
+    //! NOTE(review): appears to be the path used for ndim=1, npol=2, nchan=1 data — confirm
+    void unpack_default ();
+    //! Unpack ndat values from 'from' into 'into'; NOTE(review): fskip looks like the output stride — confirm
+    void unpack (uint64_t ndat, const unsigned char* from, 
+		             float* into, const unsigned fskip,
+		             unsigned long* hist);
+    //! Opaque stream handle; unpack_on_gpu casts it to cudaStream_t
+    void * gpu_stream;
+    //! Unpack on the GPU, calling caspsr_unpack once per input channel
+    void unpack_on_gpu ();
+    //! Resolution of this unpacker (the implementation returns 1024)
+    unsigned get_resolution ()const ;
+
+  private:
+    //! NOTE(review): presumably records whether set_device has prepared the device — confirm
+    bool device_prepared;
+
+    //! maximum number of GPU threads per block
+    int threadsPerBlock;
+
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -5,8 +5,6 @@
 #ifndef __dsp_CASPSRUnpackerCUDA_h
 #define __dsp_CASPSRUnpackerCUDA_h
 
-// #define USE_TEXTURE_MEMORY 1
-
 #include<stdint.h>
 #include<cuda_runtime.h>
 
@@ -14,7 +12,8 @@
 
 void caspsr_unpack (cudaStream_t stream, const uint64_t ndat,
 		    float scale,
-		    const unsigned char* stagingBufGPU,
-		    float* pol0, float* pol1);
+		    const unsigned char* from,
+		    float* pol0, float* pol1,
+        int nthread);
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -40,8 +40,8 @@
 		             float* into, const unsigned fskip,
 		             unsigned long* hist);
 
-    BitSeries staging;
     void * gpu_stream;
+
     void unpack_on_gpu ();
 
     unsigned get_resolution ()const ;
@@ -89,6 +89,9 @@
     //! sk_thread states
     std::vector <State> states;
 
+    //! maximum number of GPU threads per block
+    int threadsPerBlock;
+
   };
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/CPSRFile.h,v $
-   $Revision: 1.15 $
-   $Date: 2008/05/28 21:12:42 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr/dsp/CPSRFile.h
 
 
 #ifndef __CPSRFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h,v $
-   $Revision: 1.15 $
-   $Date: 2006/07/09 13:27:03 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h
 
 #ifndef __CPSRTwoBitCorrection_h
 #define __CPSRTwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/pspmXfer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/pspmXfer.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/pspmXfer.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/pspmXfer.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/pspmXfer.h,v $
-   $Revision: 1.6 $
-   $Date: 2006/10/15 23:26:47 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr/dsp/pspmXfer.h
 
 #ifndef __pspmXfer_h
 #define __pspmXfer_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/rdisk.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/rdisk.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/rdisk.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/rdisk.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/rdisk.h,v $
-   $Revision: 1.6 $
-   $Date: 2006/10/15 23:26:47 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr/dsp/rdisk.h
 
 #ifndef __RDISK_H
 #define __RDISK_H
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/xfer_tape.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/xfer_tape.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/xfer_tape.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/xfer_tape.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/xfer_tape.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:06 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/cpsr/dsp/xfer_tape.h
 
 #ifndef __XFER_TAPE_H
 #define __XFER_TAPE_H
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspmDbase.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspmDbase.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspmDbase.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspmDbase.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/pspmDbase.h,v $
-   $Revision: 1.8 $
-   $Date: 2009/06/17 10:32:32 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr/pspmDbase.h
 
 #ifndef __pspmDbase_h
 #define __pspmDbase_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm++.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm++.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm++.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm++.h	2018-03-12 23:02:35.000000000 +0000
@@ -4,10 +4,7 @@
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/pspm++.h,v $
-   $Revision: 1.9 $
-   $Date: 2006/10/15 23:26:47 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr/pspm++.h
 
 #ifndef __PSPM_H
 #define __PSPM_H
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm_search_header.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm_search_header.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm_search_header.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm_search_header.h	2018-03-12 23:02:35.000000000 +0000
@@ -24,7 +24,7 @@
  *      some error message;
  *
  *
- * $Log: pspm_search_header.h,v $
+ * $Log: pspm_search_header.h
  * Revision 1.5  2009/06/17 10:16:54  straten
  * use ISO C99 integer types directly
  *
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2File.h,v $
-   $Revision: 1.20 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2File.h
 
 #ifndef __CPSR2File_h
 #define __CPSR2File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h,v $
-   $Revision: 1.9 $
-   $Date: 2008/11/11 06:14:09 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h
 
 #ifndef __CPSR2_Observation_h
 #define __CPSR2_Observation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h,v $
-   $Revision: 1.9 $
-   $Date: 2006/07/09 13:27:07 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h
 
 #ifndef __CPSR2TwoBitCorrection_h
 #define __CPSR2TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/DADABuffer.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/DADABuffer.C
--- bl-dspsr-0+git20160405/Kernel/Formats/dada/DADABuffer.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/DADABuffer.C	2018-03-12 23:02:35.000000000 +0000
@@ -12,6 +12,11 @@
 #include "ascii_header.h"
 #include "FilePtr.h"
 
+#if HAVE_CUDA
+#include "dada_cuda.h"
+#include "ipcio_cuda.h"
+#endif
+
 #include <stdlib.h>
 #include <string.h>
 
@@ -60,6 +65,12 @@
     }
   }
 
+#if HAVE_CUDA
+  if (!passive && dada_cuda_dbunregister (hdu) < 0)
+    throw Error (InvalidState, "dsp::DADABuffer::close",
+      "cannot unregister ring buffer blocks as Pinned memory");
+#endif
+
   if (!passive && dada_hdu_unlock_read (hdu) < 0)
     cerr << "dsp::DADABuffer::close error during dada_hdu_unlock_read" << endl;
 
@@ -203,6 +214,14 @@
     throw Error (InvalidState, "dsp::DADABuffer::open_file",
 		 "cannot lock DADA ring buffer read client status");
 
+#if HAVE_CUDA
+	if (verbose)
+		cerr << "dsp::DADABuffer::open_file registering dada buffers with CUDA for pinned transfers" << endl;
+  if (!passive && dada_cuda_dbregister (hdu) < 0)
+    throw Error (InvalidState, "dsp::DADABuffer::open_file",
+      "cannot register DADA ring buffer blocks as Pinned memory");
+#endif
+
   if (passive && dada_hdu_open_view (hdu) < 0)
     throw Error (InvalidState, "dsp::DADABuffer::open_file",
 		 "cannot open DADA ring buffer for viewing");
@@ -220,6 +239,7 @@
   if (ascii_header_get (hdu->header, "RESOLUTION", "%u", &byte_resolution) < 0)
     byte_resolution = 1;
 
+
   // the resolution is the _byte_ resolution; convert to _sample_ resolution
   resolution = get_info()->get_nsamples (byte_resolution);
   if (resolution == 0)
@@ -237,32 +257,55 @@
 int64_t dsp::DADABuffer::load_bytes (unsigned char* buffer, uint64_t bytes)
 {
   if (verbose)
-    cerr << "DADABuffer::load_bytes ipcio_read "
+    cerr << "dsp::DADABuffer::load_bytes ipcio_read "
          << bytes << " bytes" << endl;
 
   int64_t bytes_read = ipcio_read (hdu->data_block, (char*)buffer, bytes);
   if (bytes_read < 0)
-    cerr << "DADABuffer::load_bytes error ipcio_read" << endl;
+    cerr << "dsp::DADABuffer::load_bytes error ipcio_read" << endl;
 
   if (verbose)
-    cerr << "DADABuffer::load_bytes read " << bytes_read << " bytes" << endl;
+    cerr << "dsp::DADABuffer::load_bytes read " << bytes_read << " bytes" << endl;
 
   return bytes_read;
 }
 
+#if HAVE_CUDA
+//! Load bytes from shared memory directory to GPU memory
+int64_t dsp::DADABuffer::load_bytes_device (unsigned char* device_memory, uint64_t bytes, void * device_handle)
+{
+  cudaStream_t stream = (cudaStream_t) device_handle;
+
+  if (verbose)
+    cerr << "dsp::DADABuffer::load_bytes_device ipcio_read_cuda "
+         << bytes << " bytes" << endl;
+
+  int64_t bytes_read = ipcio_read_cuda (hdu->data_block, (char*) device_memory, bytes, stream);
+  //int64_t bytes_read = (int64_t) bytes;
+	cudaStreamSynchronize(stream);
+  if (bytes_read < 0)
+    cerr << "dsp::DADABuffer::load_bytes_device error ipcio_read_cuda" << endl;
+
+  if (verbose)
+    cerr << "dsp::DADABuffer::load_bytes_device read " << bytes_read << " bytes" << endl;
+
+  return bytes_read;
+}
+#endif
+
 //! Adjust the shared memory pointer
 int64_t dsp::DADABuffer::seek_bytes (uint64_t bytes)
 {
   if (verbose)
-    cerr << "DADABuffer::seek_bytes ipcio_seek "
+    cerr << "dsp::DADABuffer::seek_bytes ipcio_seek "
          << bytes << " bytes" << endl;
 
   int64_t absolute_bytes = ipcio_seek (hdu->data_block, bytes, SEEK_SET);
   if (absolute_bytes < 0)
-    cerr << "DADABuffer::seek_bytes error ipcio_seek" << endl;
+    cerr << "dsp::DADABuffer::seek_bytes error ipcio_seek" << endl;
 
   if (verbose)
-    cerr << "DADABuffer::seek_bytes absolute_bytes=" << absolute_bytes << endl;
+    cerr << "dsp::DADABuffer::seek_bytes absolute_bytes=" << absolute_bytes << endl;
 
   return absolute_bytes;
 }
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/dsp/DADABuffer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/dsp/DADABuffer.h
--- bl-dspsr-0+git20160405/Kernel/Formats/dada/dsp/DADABuffer.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/dsp/DADABuffer.h	2018-03-12 23:02:35.000000000 +0000
@@ -56,7 +56,12 @@
 
     //! Load bytes from shared memory
     virtual int64_t load_bytes (unsigned char* buffer, uint64_t bytes);
-    
+ 
+#if HAVE_CUDA
+    //! Load bytes from shared memory directly to GPU memory
+    int64_t load_bytes_device (unsigned char* device_memory, uint64_t bytes, void * device_handle);
+#endif
+
     //! Set the offset in shared memory
     virtual int64_t seek_bytes (uint64_t bytes);
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/dada/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -12,5 +12,5 @@
 
 include $(top_srcdir)/config/Makefile.include
 
-AM_CPPFLAGS += @PSRDADA_CFLAGS@
+AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CUDA_CFLAGS@
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinFile.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,40 @@
+
+#ifndef __EmerlinFile_h
+#define __EmerlinFile_h
+
+
+#include <inttypes.h>
+#include "dsp/File.h"
+#include "dsp/BlockFile.h"
+
+
+namespace dsp {
+
+    //! File whose ASCII header names the instrument (EMERLIN) and a separate DATAFILE
+    class EmerlinFile : public File {
+
+        public:
+            EmerlinFile(const char* filename=0, const char* headername=0);
+
+            ~EmerlinFile();
+            //! True if the file opens and its header has INSTRUMENT equal to EMERLIN
+            bool is_valid(const char* filename) const ;
+
+        protected:
+            virtual void open_file(const char* filename);
+            //! NOTE(review): implementations are not visible here; presumably the File byte-level I/O hooks — confirm
+            virtual int64_t seek_bytes(uint64_t bytes);
+            virtual int64_t load_bytes(unsigned char* buffer, uint64_t nbytes);
+
+        private:
+            char datafile[1024];      // filled from the DATAFILE header keyword in open_file
+            uint64_t cur_frame;       // NOTE(review): meaning not visible in this view
+            uint64_t first_second;    // NOTE(review): meaning not visible in this view
+            uint64_t dropped;         // set to 0 by the constructor
+
+    };
+}
+
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,41 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2011 by Paul Demorest
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __EmerlinTwoBitTable_h
+#define __EmerlinTwoBitTable_h
+
+#include "dsp/TwoBitTable.h"
+
+namespace dsp {
+
+  //! Look-up tables for conversion from Emerlin two-bit to floating point numbers
+  /*! Emerlin defines bits to run in time order from LSB to MSB. This is
+   * the opposite of the standard dspsr TwoBitTable convention, so
+   * we need to override the 'extract' function here.
+  */
+  class EmerlinTwoBitTable : public TwoBitTable {
+
+  public:
+
+    //! Constructor
+    EmerlinTwoBitTable () : TwoBitTable (TwoBitTable::OffsetBinary) { 
+    destroy();  // NOTE(review): presumably discards a table built by the base constructor — confirm
+    build();    // NOTE(review): presumably rebuilds it so this class's extract() is used — confirm
+    }
+    
+    //! Destructor
+    ~EmerlinTwoBitTable () { }
+ 
+    //! Return the 2-bit number from byte corresponding to sample
+    virtual unsigned extract (unsigned byte, unsigned sample) const;
+
+  };
+
+}
+
+#endif // !defined(__EmerlinTwoBitTable_h)
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,43 @@
+
+
+#ifndef __EmerlinUnpacker_h
+#define __EmerlinUnpacker_h
+
+
+#include "dsp/Unpacker.h"
+#include "dsp/TimeSeries.h"
+#include "dsp/WeightedTimeSeries.h"
+#include "dsp/EmerlinTwoBitTable.h"
+#include "dsp/TwoBitTable.h"
+
+
+namespace dsp {
+    class EmerlinUnpacker : public Unpacker {
+        // Unpacker subclass that owns an EmerlinTwoBitTable (bittable); implementation not visible in this view
+        public:
+            EmerlinUnpacker (const char* name="EmerlinUnpacker");
+            unsigned get_ndig() const;
+
+            // NOTE(review): the members below look like overrides of Unpacker hooks — confirm against the base class
+        protected:
+            void unpack();
+            bool matches(const Observation* observation);
+
+            void reserve();
+            void set_output(TimeSeries* _output);
+            int get_ndat_per_weight();
+
+            // NOTE(review): weighted_output is a raw pointer; ownership/lifetime is not visible here — confirm
+        private:
+            dsp::EmerlinTwoBitTable bittable;
+            WeightedTimeSeries* weighted_output;
+
+    };
+
+}
+
+#endif
+
+
+
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinFile.C
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinFile.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,278 @@
+
+#include <iostream>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+
+#include "dsp/EmerlinFile.h"
+#include "dsp/ASCIIObservation.h"
+#include "vdifio.h"
+#include "ascii_header.h"
+#include <unistd.h>
+
+
+
+using namespace std;
+
+dsp::EmerlinFile::EmerlinFile(const char* filename, const char* headername) : File("emerlin"),dropped(0) {
+    // filename and headername are not used here; only the File name and the drop counter are set.
+}
+
+dsp::EmerlinFile::~EmerlinFile() {
+} // nothing is released in this destructor body
+
+bool dsp::EmerlinFile::is_valid(const char* filename) const {
+    // A file is accepted when its leading header bytes set INSTRUMENT to EMERLIN.
+    FILE *fptr = fopen(filename, "r");
+    if (!fptr)
+    {
+        if (verbose)
+            cerr << "dsp::EmerlinFile::is_valid Error opening file." << endl;
+        return false;
+    }
+    // Zero-filled and read short by one byte, so the header is always NUL-terminated.
+    char header[4096] = { 0 };
+    fread(header, sizeof(char), sizeof(header) - 1, fptr);
+    fclose(fptr);
+    // Bounded conversion into an initially empty string: inst cannot overflow or be read unset.
+    char inst[64] = "";
+    if ( ascii_header_get(header, "INSTRUMENT", "%63s", inst) < 0 )
+    {
+        if (verbose)
+            cerr << "dsp::EmerlinFile::is_valid no INSTRUMENT line" << endl;
+        return false;
+    }
+    if ( std::string(inst) != "EMERLIN" )
+    {
+        if (verbose)
+            cerr << "dsp::EmerlinFile::is_valid INSTRUMENT != 'EMERLIN'" << endl;
+        return false;
+    }
+    return true;
+}
+
+
+void dsp::EmerlinFile::open_file(const char* filename) {
+    // filename is the ASCII header file; the VDIF data are in the file named by its DATAFILE keyword.
+    FILE *fptr = fopen (filename, "r");
+    if (!fptr)
+        throw Error (FailedSys, "dsp::EmerlinFile::open_file",
+                "fopen(%s) failed", filename);
+
+    // Read the header into a zero-filled buffer, one byte short, so it stays NUL-terminated
+    char header[4096] = { 0 };
+    fread(header, sizeof(char), sizeof(header) - 1, fptr);
+    fclose(fptr);
+
+    // Get the data file
+    if (ascii_header_get (header, "DATAFILE", "%s", datafile) < 0)
+        throw Error (InvalidParam, "dsp::EmerlinFile::open_file",
+                "Missing DATAFILE keyword");
+
+    // Parse the standard ASCII info.  Timestamps are in VDIF packets
+    // so not required.  Also we'll assume VDIF's "nchan" really gives
+    // the number of polns for now, and NCHAN is 1.  NBIT is in VDIF packets.
+    // We'll compute TSAMP from the bandwidth.  NDIM (real vs complex sampling)
+    // is in VDIF packets via the iscomplex param.
+    ASCIIObservation* info_tmp = new ASCIIObservation;
+    info = info_tmp;
+
+    info_tmp->set_required("UTC_START", false);
+    info_tmp->set_required("OBS_OFFSET", false);
+    info_tmp->set_required("NPOL",true);
+    info_tmp->set_required("NBIT", false);
+    info_tmp->set_required("NDIM", false);
+    info_tmp->set_required("NCHAN", false);
+    info_tmp->set_required("TSAMP", false);
+    info_tmp->set_required("CALFREQ", false);
+    info_tmp->load(header);
+
+    // open the data file named in the header (not the header file itself)
+    fd = ::open (datafile, O_RDONLY);
+    if (fd < 0)
+        throw Error (FailedSys, "dsp::EmerlinFile::open_file()",
+                "open(%s) failed", datafile);
+
+  // Read until we get a valid frame
+  bool got_valid_frame = false;
+  char rawhdr_bytes[VDIF_HEADER_BYTES];
+  vdif_header *rawhdr = (vdif_header *)rawhdr_bytes;
+  int nbyte;
+  while (!got_valid_frame)
+  {
+    // read() returns a signed count (-1 on error), so keep the result signed
+    ssize_t rv = read(fd, rawhdr_bytes, VDIF_HEADER_BYTES);
+    if (rv != VDIF_HEADER_BYTES)
+        throw Error (FailedSys, "EmerlinFile::open_file",
+                "Error reading first header");
+
+    // Get frame size
+    nbyte = getVDIFFrameBytes(rawhdr);
+    if (verbose) cerr << "EmerlinFile::open_file FrameBytes = " << nbyte << endl;
+
+    // A frame no longer than its own header carries no data: skipping it
+    // would never advance, and the frame rate below would divide by zero.
+    if (nbyte <= VDIF_HEADER_BYTES)
+        throw Error (InvalidParam, "EmerlinFile::open_file",
+                "invalid VDIF frame length %d", nbyte);
+
+    resolution=(nbyte-VDIF_HEADER_BYTES)*2*4*1; // in samples
+
+    // If this first frame is invalid, go to the next one
+    if (getVDIFFrameInvalid(rawhdr)==0)
+      got_valid_frame = true;
+    else
+    {
+      // lseek() returns -1 on failure, which only a signed off_t can show
+      off_t pos = lseek(fd, nbyte-VDIF_HEADER_BYTES, SEEK_CUR);
+      if (pos<0)
+        throw Error (FailedSys, "EmerlinFile::lseek",
+            "Error seeking to next VDIF frame");
+    }
+  }
+
+  // Rewind file
+  lseek(fd, 0, SEEK_SET);
+
+  // Get basic params
+  int nbit = getVDIFBitsPerSample(rawhdr);
+  if (verbose) cerr << "EmerlinFile::open_file NBIT = " << nbit << endl;
+  get_info()->set_nbit (nbit);
+
+  bool iscomplex = rawhdr->iscomplex;
+  if (iscomplex)
+  {
+    get_info()->set_ndim(2);
+    get_info()->set_state(Signal::Analytic);
+  }
+  else
+  {
+    get_info()->set_ndim(1);
+    get_info()->set_state(Signal::Nyquist);
+  }
+  if (verbose) cerr << "EmerlinFile::open_file iscomplex = " << iscomplex << endl;
+
+  get_info()->set_npol( 2 );
+  get_info()->set_nchan( 1 );
+  get_info()->set_rate( (double) get_info()->get_bandwidth() * 1e6
+      / (double) get_info()->get_nchan()
+      * (get_info()->get_state() == Signal::Nyquist ? 2.0 : 1.0));
+  if (verbose) cerr << "EmerlinFile::open_file rate = " << get_info()->get_rate() << endl;
+
+  // Figure frames per sec from bw, pkt size, etc
+  int frame_data_size = nbyte - VDIF_HEADER_BYTES;
+  double frames_per_sec = get_info()->get_nbit() * get_info()->get_nchan() * get_info()->get_npol()
+    * get_info()->get_rate() / 8.0 / (double) frame_data_size;
+  if (verbose) cerr << "EmerlinFile::open_file frame_data_size = "
+    << frame_data_size << endl;
+  if (verbose) cerr << "EmerlinFile::open_file frames_per_sec = "
+    << frames_per_sec << endl;
+
+  // Set load resolution equal to one frame? XXX
+  // This broke file unloading somehow ... wtf..
+  //resolution = info.get_nsamples(frame_data_size);
+
+  // Start time and frame counter come from the first valid frame header read above
+  int mjd = getVDIFFrameMJD(rawhdr);
+  int sec = getVDIFFrameSecond(rawhdr);
+  int fn = getVDIFFrameNumber(rawhdr);
+  first_second = getVDIFFullSecond(rawhdr);
+  cur_frame=fn;
+  if (verbose) cerr << "EmerlinFile::open_file MJD = " << mjd << endl;
+  if (verbose) cerr << "EmerlinFile::open_file sec = " << sec << endl;
+  if (verbose) cerr << "EmerlinFile::open_file fn  = " << fn << endl;
+  get_info()->set_start_time( MJD(mjd,sec,(double)fn/frames_per_sec) );
+
+  // Figures out how much data is in file based on header sizes, etc.
+  set_total_samples();
+
+    if (verbose)
+        cerr << "EmerlinFile::open exit" << endl;
+}
+
+
+
+
+
+int64_t dsp::EmerlinFile::load_bytes(unsigned char* buffer, uint64_t nbytes) {
+    // Fill buffer with nbytes of packet payload, placing each 8000-byte packet
+    // at the slot given by its frame number and thread ID.  Slots whose packet
+    // never arrives stay zero.  Returns nbytes (after trimming to whole frames).
+    if (nbytes % 16000){
+        // trim to an integer number of frames
+        std::cerr << "dsp::EmerlinFile::load_bytes ERROR: Need to read integer number of frames" << std::endl;
+        nbytes = 16000*(nbytes/16000);
+    }
+
+    unsigned nframe = nbytes / 16000;
+
+    std::memset(buffer, 0, nbytes); // zero the memory
+
+    uint64_t to_load=nbytes;
+
+    while (to_load > 0){
+
+        char rawhdr_bytes[VDIF_HEADER_BYTES];
+        vdif_header *rawhdr = (vdif_header *)rawhdr_bytes;
+
+        // read() and lseek() report failure as -1, so their results are kept signed
+        ssize_t rv = read(fd, rawhdr_bytes, VDIF_HEADER_BYTES);
+        if (rv != VDIF_HEADER_BYTES)
+            throw Error (FailedSys, "EmerlinFile::load_bytes",
+                    "Error reading header");
+
+        int64_t sec = getVDIFFullSecond(rawhdr);
+        int64_t fn = getVDIFFrameNumber(rawhdr);
+        int64_t sn = getVDIFThreadID(rawhdr);
+
+        fn += 4000*(sec-first_second);
+
+        // 64-bit so that a large frame gap cannot overflow the offset
+        int64_t byte_offset = ((fn-cur_frame)*2 + sn)*8000;
+        if (byte_offset < 0) {
+            // Packet from before the current window: writing it would land in
+            // front of buffer, so discard its payload and carry on.
+            if (lseek(fd, 8000, SEEK_CUR) < 0)
+                throw Error (FailedSys, "EmerlinFile::lseek",
+                        "Error seeking to next VDIF frame");
+            continue;
+        }
+
+        if ((uint64_t)(byte_offset+8000) > nbytes) {
+            // we are past the requested data.
+            // there is surely a better way than this!
+            dropped += to_load/8000;
+            std::cerr << "Some packets missing (left toload=" << to_load<<", total dropped so far = " << dropped << ")" << std::endl;
+            fprintf(stderr,"read %lld/%lld, pkt=%lld to_load=%llu %lld\n",
+                    (long long)fn,(long long)sn,(long long)(byte_offset/8000),
+                    (unsigned long long)to_load,(long long)(sec-first_second));
+            // Step back over the header so that the next call reads it again
+            if (lseek(fd, -VDIF_HEADER_BYTES, SEEK_CUR) < 0)
+                throw Error (FailedSys, "EmerlinFile::lseek",
+                        "Error seeking to next VDIF frame");
+
+            break;
+        }
+
+        rv = read(fd, buffer+byte_offset, 8000);
+
+        if (rv!=8000){
+            // A short or failed read leaves the stream misaligned, so stop here
+            std::cerr << "dsp::EmerlinFile::load_bytes couldn't load data" << std::endl;
+            throw Error (FailedSys, "EmerlinFile::load_bytes",
+                    "Error reading data");
+        }
+
+        to_load -= rv;
+    }
+    cur_frame += nframe;
+
+    return nbytes;
+}
+
+
+int64_t dsp::EmerlinFile::seek_bytes(uint64_t bytes) {
+    // Stub: reports the request on stderr, does not move the file position, and returns 0.
+    std::cerr << "dsp::EmerlinFile::seek_bytes NOT IMPLEMENTED "<<bytes << std::endl;
+    return 0;
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinTwoBitTable.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinTwoBitTable.C
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinTwoBitTable.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinTwoBitTable.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,16 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2011 by Paul Demorest
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+#include "dsp/EmerlinTwoBitTable.h"
+#include <iostream>
+
+unsigned dsp::EmerlinTwoBitTable::extract (unsigned byte, unsigned sample) const
+{
+  // Samples are packed LSB-first (VDIF standard): sample 0 occupies bits 0-1
+  // and sample 3 bits 6-7.  sample is expected to be in 0..3.
+  const unsigned bit_offset = 2u * sample;
+  return (byte >> bit_offset) & 0x03;
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,148 @@
+
+#include "dsp/EmerlinUnpacker.h"
+#include "dsp/WeightedTimeSeries.h"
+
+
+dsp::EmerlinUnpacker::EmerlinUnpacker(const char* name) : Unpacker(name), weighted_output(0) {
+    // weighted_output starts out null so that reserve() can test it before set_output() has run
+}
+bool dsp::EmerlinUnpacker::matches (const Observation* observation) {
+    // Accept only 2-bit data whose machine name is "EMERLIN".
+    // NOTE(review): the nbit test used to be written twice; the second copy
+    // may have been meant for another property (e.g. npol) - confirm.
+    return observation->get_machine() == "EMERLIN"
+        && observation->get_nbit() == 2;
+}
+
+int dsp::EmerlinUnpacker::get_ndat_per_weight() {
+    // Fixed number of samples per weight; reserve() passes it on to the weighted output.
+    return 1000; 
+}
+
+void dsp::EmerlinUnpacker::reserve() {
+    // Configure the weight layout (when the output is weighted), resize the
+    // output to the input length, then reset the weights to neutral.
+    WeightedTimeSeries* const wts = weighted_output;
+    if (wts != 0) {
+        wts->set_ndat_per_weight (get_ndat_per_weight());
+        wts->set_nchan_weight (1);
+        wts->set_npol_weight (input->get_npol());
+    }
+
+    output->resize ( input->get_ndat() );
+    if (wts != 0) wts->neutral_weights ();
+}
+
+void dsp::EmerlinUnpacker::set_output (TimeSeries* _output)
+{
+    // Forward to Unpacker, then keep a WeightedTimeSeries view of the output (null if it is not one).
+    if (verbose)
+        std::cerr << "dsp::EmerlinUnpacker::set_output (" << _output << ")" << std::endl;
+    Unpacker::set_output (_output);
+    weighted_output = dynamic_cast<WeightedTimeSeries*> (_output);
+}
+
+void dsp::EmerlinUnpacker::unpack() {
+    // Unpack 2-bit samples into floats.  The input is consumed as consecutive 8000-byte
+    // blocks, one per polarisation per frame; each block yields 32000 output samples.
+    if(verbose) {
+        std::cerr << "dsp::EmerlinUnpacker::unpack()" << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker input->ndat = "<< input->get_ndat() << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker input->nbit = "<< input->get_nbit() << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker input->ndim = "<< input->get_ndim() << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker input->npol = "<< input->get_npol() << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker input->nchan = "<< input->get_nchan() << std::endl;
+        std::cerr << "dsp::EmerlinUnpacker output->ndat = "<< output->get_ndat() << std::endl;
+    }
+
+    const unsigned samples_per_byte=4;
+    // NOTE(review): the factor 2 presumably accounts for the two polarisations - confirm
+    const unsigned total_bytes = 2*input->get_ndat()/samples_per_byte;
+
+    const unsigned nframe = total_bytes/16000;
+    const unsigned nword = 2000;
+    const unsigned byte_per_word=4;
+    unsigned offset=0;
+
+    const unsigned dat_per_frame = nword*byte_per_word*samples_per_byte;
+    unsigned weights_per_frame = 0;
+    if(weighted_output){
+        weights_per_frame = dat_per_frame / weighted_output->get_ndat_per_weight();
+        if(verbose)
+            std::cerr << "dsp::EmerlinUnpacker weighted output. weights per frame = " << weights_per_frame << std::endl;
+    }
+
+    const unsigned char *iarray = input->get_rawptr();
+    const unsigned char *iarray_orig = iarray;
+    unsigned char word[byte_per_word];
+    // Histogram of decoded sample levels, reset for every frame and polarisation
+    int count[4];
+
+    unsigned* weights = NULL;
+    for (unsigned iframe=0; iframe < nframe; ++iframe) {
+        for (unsigned ipol=0; ipol < 2; ++ipol) {
+            if(weighted_output){
+                weights = weighted_output->get_weights(0,ipol)+weights_per_frame*iframe;
+            }
+            count[0]=0;
+            count[1]=0;
+            count[2]=0;
+            count[3]=0;
+            float ss=0;
+            if(offset > output->get_ndat()){
+                std::cerr << "dsp::EmerlinUnpacker::unpack error" << std::endl;
+            }
+
+            float* oarray = output->get_datptr (0, ipol) + offset;
+            for (unsigned wd=0; wd < nword; ++wd) {
+                for (unsigned bt = 0; bt < byte_per_word; bt++){
+                    //                    word[bt] = iarray[byte_per_word-1-bt]; // first samples are in last byte of word.
+                    word[bt] = iarray[bt]; // first sample is byte zero on disk.
+                }
+
+                iarray += 4;
+
+                // Decode each byte of the word through the lookup table: four samples per byte
+                for (unsigned bt = 0; bt < byte_per_word; bt++){
+                    const float* four = bittable.get_values(word[bt]);
+                    //    std::cerr << (int)(word[bt]) << std::endl;
+                    //    std::cerr << four[0] << " " << four[1] <<
+                    //        " " << four[2] << " " << four[3] << std::endl;
+
+                    for (unsigned pt=0; pt < samples_per_byte; ++pt) {
+                        if (four[pt] < -0.5)count[0]++;
+                        else if(four[pt] < 0)count[1]++;
+                        else if(four[pt] < 0.5) count[2]++;
+                        else count[3]++;
+                        *oarray = four[pt];
+                        ss+=four[pt]*four[pt];
+                        ++oarray;
+                    }
+                }
+            }
+            // Every sample fell below -0.5: report the block as dropped and zero its weights (if any)
+            if(count[3]==0 && count[2]==0 && count[1]==0){
+                std::cerr << "Zero weight Dropped Frame (weights_per_frame="<<weights_per_frame<<")" << std::endl;
+                std::cerr << -bittable.get_hi_val() << " : " << count[0] << std::endl;
+                std::cerr << -bittable.get_lo_val() << " : " << count[1] << std::endl;
+                std::cerr << bittable.get_lo_val() << " : " << count[2] << std::endl;
+                std::cerr << bittable.get_hi_val() << " : " << count[3] << std::endl;
+
+                for (int iw=0; iw < weights_per_frame; ++iw) {
+                    weights[iw] = 0;
+                }
+            }
+            if(verbose){
+                std::cerr << "dsp::EmerlinUnpacker::unpack frame=" <<iframe<<" pol="<<ipol << std::endl;
+                std::cerr << -bittable.get_hi_val() << " : " << count[0] << std::endl;
+                std::cerr << -bittable.get_lo_val() << " : " << count[1] << std::endl;
+                std::cerr << bittable.get_lo_val() << " : " << count[2] << std::endl;
+                std::cerr << bittable.get_hi_val() << " : " << count[3] << std::endl;
+                std::cerr << "SSSS" << ipol << " " << ss << std::endl;
+            }
+        }
+
+        offset += 8000*samples_per_byte;
+    }
+
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/Makefile.am	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,17 @@
+
+noinst_LTLIBRARIES = libemerlin.la
+
+# nobase_ keeps the dsp/ subdirectory when these headers are installed
+nobase_include_HEADERS = dsp/EmerlinFile.h \
+                         dsp/EmerlinUnpacker.h \
+                         dsp/EmerlinTwoBitTable.h
+
+libemerlin_la_SOURCES = EmerlinFile.C \
+                        EmerlinUnpacker.C \
+                        EmerlinTwoBitTable.C \
+                        vdifio.c vdifio.h
+#############################################################################
+#
+
+include $(top_srcdir)/config/Makefile.include
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.c
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.c	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.c	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,238 @@
+/***************************************************************************
+ *  Copyright (C) 2009-2011 by Adam Deller/Walter Brisken/Chris Phillips   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 3 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+//===========================================================================
+// SVN properties (DO NOT CHANGE)
+//
+// $Id: vdifio.c 5231 2013-04-04 22:38:56Z WalterBrisken $
+// $HeadURL: https://svn.atnf.csiro.au/difx/libraries/vdifio/trunk/src/vdifio.c $
+// $LastChangedRevision: 5231 $
+// $Author: WalterBrisken $
+// $LastChangedDate: 2013-04-04 16:38:56 -0600 (Thu, 04 Apr 2013) $
+//
+//============================================================================
+
+#include <string.h>
+#include <stdio.h>
+#include "vdifio.h"
+
+
+#define VDIF_VERSION 0
+
+#define UNIXZERO_MJD 40587
+
+void mjd2ymd(int mjd, int *year, int *month, int *day) {
+  // Convert the day number mjd into a calendar date written to *year, *month and *day.
+  int jd, temp1, temp2;
+
+  // 2400001 offsets the MJD towards a Julian day number (JD = MJD + 2400000.5)
+  jd = mjd + 2400001;
+  // Do some rather cryptic calculations
+  temp1 = 4*(jd+((6*(((4*jd-17918)/146097)))/4+1)/2-37);
+  temp2 = 10*(((temp1-237)%1461)/4)+5;
+
+  *year = temp1/1461-4712;
+  *month =((temp2/306+2)%12)+1;
+  *day = (temp2%306)/10+1;
+}
+
+int ymd2doy(int yr, int mo, int day)
+{
+        // Days elapsed before the first of each month (mo is 1-based).
+        const int before_common[] = {0,31,59,90,120,151,181,212,243,273,304,334};
+        const int before_leap[]   = {0,31,60,91,121,152,182,213,244,274,305,335};
+
+        // For yr > 0 this is -1 in Gregorian leap years and -2 otherwise.
+        const int leap_marker = yr/4-(yr+7)/4-yr/100+(yr+99)/100+yr/400-(yr+399)/400;
+        const int *before = before_common;
+
+        if(leap_marker == -1)
+        {
+                before = before_leap;
+        }
+        return day + before[mo-1];
+}
+
+int ymd2mjd(int yr, int mo, int day)
+{
+        int doy;
+        int yr1 = yr - 1;
+        // Day of year within yr; the whole years before it are counted via yr1
+        doy = ymd2doy(yr, mo, day);
+        // 365 days per year plus Gregorian leap days; -678576 shifts the count to MJD
+        return doy-678576+365*yr1+yr1/4-yr1/100+yr1/400;
+}
+
+//int epoch2mjd(int epoch) {
+//  return ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1); // Year and Jan/July
+//}
+
+int createVDIFHeader(vdif_header *header, int framelength, int threadid, int bits, int nchan,
+		      int iscomplex, char stationid[3]) {
+  // Initialise *header from the frame parameters; returns VDIF_NOERROR, or VDIF_ERROR for a rejected argument.
+  int lognchan;
+
+  header->epoch = 0;
+  // Range checks; note that epoch has already been zeroed even when one of them fails
+  if (VDIF_VERSION>7) return(VDIF_ERROR);
+  if (bits>32 || bits<1) return(VDIF_ERROR);
+  if (framelength%8!=0 || framelength<0) return(VDIF_ERROR);
+  if (threadid>1023 || threadid<0) return(VDIF_ERROR);
+
+  // Number of channels encoded as power of 2
+  if (nchan<1) return(VDIF_ERROR);
+  lognchan = 0;
+  while (nchan>1) {
+    if (nchan%2==1) return(VDIF_ERROR);
+    lognchan++;
+    nchan /=2;
+  }
+  if (lognchan>31) return(VDIF_ERROR);
+
+  memset(header, 0, VDIF_HEADER_BYTES);
+  header->version = VDIF_VERSION;
+  header->nchan = lognchan;
+  header->framelength8 = framelength/8;
+  if (iscomplex)
+    header->iscomplex = 1;
+  else
+    header->iscomplex = 0;
+  header->nbits = bits-1;
+  header->threadid = threadid;
+  // First station character goes in the high byte, second in the low byte; stationid[2] is not read
+  header->stationid = stationid[0]<<8 | stationid[1];
+  header->frame=0;
+  //header->framepersec=framepersec;
+
+  return(VDIF_NOERROR);
+}
+
+
+void setVDIFThreadID(vdif_header *header, int threadid)
+{
+  // Should check bounds: none is made here, and the threadid field is only 10 bits wide (see vdif_header)
+  header->threadid = threadid;
+}
+
+void setVDIFFrameBytes(vdif_header *header, int bytes)
+{
+  // Should check modulo8 and not too big: bytes/8 drops any remainder and the field is 24 bits wide
+  header->framelength8 = bytes/8;
+}
+
+int getVDIFEpochMJD(const vdif_header *header)
+{
+  int epoch = (int)header->epoch;  // counts half-years from the start of 2000
+  return ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1);  // MJD of 1 January or 1 July of that year
+}
+
+void setVDIFNumChannels(vdif_header *header, int numchannels)
+{
+  // The header keeps floor(log2(numchannels)); anything below 2 is stored as 0.
+  unsigned int halvings;
+
+  for (halvings = 0; numchannels > 1; numchannels /= 2)
+    halvings++;
+
+  header->nchan = halvings;
+}
+
+int getVDIFNumChannels(const vdif_header *header)
+{
+  // The header holds log2 of the channel count; double a counter that many times.
+  int doublings;
+  int count = 1;
+
+  for (doublings = header->nchan; doublings > 0; --doublings)
+    count *= 2;
+
+  return count;
+}
+
+int getVDIFFrameMJD(const vdif_header *header)
+{
+  int mjd = getVDIFEpochMJD(header);
+  // Epoch MJD plus the whole days contained in the seconds count
+  return mjd + header->seconds/86400; // Seconds may be greater than one day
+}
+
+double getVDIFDMJD(const vdif_header *header, int framepersec) 
+{
+  int mjd = getVDIFFrameMJD(header);     // whole days
+  int sec = getVDIFFrameSecond(header);  // second within that day; the frame adds a fraction of a second
+  return (double)mjd+(sec+(double)header->frame/(double)framepersec)/(24*60*60);
+}
+
+// Rewrites the seconds field from framemjd.  Note assumes the Epoch is already set
+void setVDIFFrameMJD(vdif_header *header, int framemjd)
+{
+  int emjd = getVDIFEpochMJD(header);
+  int seconds = (int)header->seconds;
+  int mjd = emjd + seconds/86400;    // BUG? I think this step is wrong CJP
+  if(emjd == framemjd) return; //its already right
+  header->seconds = (framemjd-mjd)*86400;  // NOTE(review): whole days relative to the header's current day; time of day is dropped - confirm intent
+}
+
+void setVDIFMJDSec(vdif_header *header, uint64_t mjdsec)
+{
+  int epoch = (int)header->epoch;
+  int emjd = ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1);  // MJD of the header's reference epoch
+  header->seconds = (int)(mjdsec - ((uint64_t)emjd)*86400);  // mjdsec re-expressed relative to that epoch
+}
+
+void setVDIFEpoch(vdif_header *header, int mjd) {
+  int year, month, day;
+  mjd2ymd(mjd, &year, &month, &day);
+  header->epoch = (year-2000)*2;  // two epochs per year, counted from 2000
+  if (month>6) header->epoch++;   // July onwards belongs to the second epoch of the year
+}
+
+int nextVDIFHeader(vdif_header *header, int framepersec) {
+  header->frame++;  // advance to the next frame within the current second
+  if (header->frame>framepersec) {  // counter is already beyond the end of the second
+    return(VDIF_ERROR);
+  } else if (header->frame==framepersec) {  // roll over into the next second
+    header->seconds++;
+    header->frame = 0;
+  }
+  return(VDIF_NOERROR);
+}
+
+uint64_t time2mjdsec(time_t time) {
+  // Adds the seconds between MJD 0 and MJD 40587 (UNIXZERO_MJD) to the Unix time
+  return ((uint64_t)UNIXZERO_MJD*24*60*60 + (uint64_t)time);
+}
+int setVDIFTime(vdif_header *header, time_t time) {
+  // Set the header's reference epoch and seconds field from a Unix time; VDIF_ERROR for years before 2000
+  int epoch;
+  struct tm t;
+  gmtime_r(&time, &t);
+  epoch = (t.tm_year-100)*2;
+  if (epoch<0)     // Year is year since 2000
+    return(VDIF_ERROR);
+  if (t.tm_mon>=6) {
+    epoch++;
+  }
+  // NOTE(review): the epoch bit-field is 6 bits wide in vdif_header, yet the value is reduced modulo 32 - confirm
+  epoch %= 32;
+  header->epoch = epoch;
+  uint64_t mjdsec = time2mjdsec(time);
+  setVDIFMJDSec(header, mjdsec);
+
+  return(VDIF_NOERROR);
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.h
--- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,145 @@
+/***************************************************************************
+ *   Copyright (C) 2009-2013 by Adam Deller / Walter Brisken               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 3 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+//===========================================================================
+// SVN properties (DO NOT CHANGE)
+//
+// $Id: vdifio.h 5240 2013-04-09 16:33:24Z WalterBrisken $
+// $HeadURL: https://svn.atnf.csiro.au/difx/libraries/vdifio/trunk/src/vdifio.h $
+// $LastChangedRevision: 5240 $
+// $Author: WalterBrisken $
+// $LastChangedDate: 2013-04-09 10:33:24 -0600 (Tue, 09 Apr 2013) $
+//
+//============================================================================
+	
+#ifndef __VDIFIO_H__
+#define __VDIFIO_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <time.h>
+
+#define VDIF_HEADER_BYTES		32
+#define VDIF_LEGACY_HEADER_BYTES	16
+#define MAX_VDIF_FRAME_BYTES		9032
+#define MAX_VDIF_THREADS                1024
+
+#define VDIF_NOERROR 0
+#define VDIF_ERROR 1
+
+typedef struct vdif_header {
+   uint32_t seconds : 30;       /* seconds since the reference epoch */
+   uint32_t legacymode : 1;
+   uint32_t invalid : 1;        /* non-zero marks the frame as invalid */
+   uint32_t frame : 24;         /* frame number within the current second */
+   uint32_t epoch : 6;          /* reference epoch, in half-years from 2000 */
+   uint32_t unassigned : 2;
+   uint32_t framelength8 : 24;  /* frame length in units of 8 bytes */
+   uint32_t nchan : 5;          /* log2 of the number of channels */
+   uint32_t version : 3;
+   uint32_t stationid : 16;
+   uint32_t threadid : 10;
+   uint32_t nbits : 5;          /* bits per sample, minus one */
+   uint32_t iscomplex : 1;      /* 1 for complex samples, 0 for real */
+   uint32_t eversion : 8;
+   uint32_t extended1 : 24;
+   uint32_t extended2;
+   uint32_t extended3;
+   uint32_t extdended4;
+ } vdif_header;
+
+/* Date manipulation functions */
+int ymd2doy(int yr, int mo, int day);
+int ymd2mjd(int yr, int mo, int day);
+
+/* Function to completely fill header struct, returns non-zero on error */
+int createVDIFHeader(vdif_header *header, int framelength, int threadid, int bits, int nchan,
+		     int iscomplex, char stationid[3]);
+
+/* Functions to grab just one value from the raw header */
+static inline int getVDIFThreadID(const vdif_header *header) { return (int)header->threadid; }
+static inline int getVDIFFrameBytes(const vdif_header *header) { return (int)(header->framelength8)*8; }
+int getVDIFFrameMJD(const vdif_header *header);
+double getVDIFDMJD(const vdif_header *header, int framepersec);
+static inline int getVDIFFrameSecond(const vdif_header *header) { return ((int)header->seconds)%86400; }
+static inline int getVDIFFrameNumber(const vdif_header *header) { return (int)header->frame; }
+static inline int getVDIFStationID(const vdif_header *header) { return (int)header->stationid; }
+static inline int getVDIFBitsPerSample(const vdif_header *header) { return ((int)header->nbits+1); }
+int getVDIFNumChannels(const vdif_header *header);
+static inline int getVDIFFrameInvalid(const vdif_header *header) { return (int)header->invalid; }
+static inline int getVDIFFullSecond(const vdif_header *header) { return (int)header->seconds; }
+static inline int getVDIFEpoch(const vdif_header *header) { return (int)header->epoch; }
+
+/* Functions to set just one value from a raw header */
+void setVDIFFrameMJD(vdif_header *header, int framemjd);
+void setVDIFMJDSec(vdif_header *header, uint64_t mjdsec);
+static inline void setVDIFFrameSecond(vdif_header *header, int framesecond) { header->seconds = framesecond; }
+static inline void setVDIFFrameNumber(vdif_header *header, int framenumber) { header->frame = framenumber; }
+static inline void setVDIFFrameInvalid(vdif_header *header, unsigned int invalid) { header->invalid = invalid; }
+void setVDIFFrameBytes(vdif_header *header, int bytes);
+void setVDIFNumChannels(vdif_header *header, int numchannels);
+void setVDIFThreadID(vdif_header *header, int threadid);
+int setVDIFTime(vdif_header *header, time_t time);
+void setVDIFEpoch(vdif_header *header, int mjd);
+int nextVDIFHeader(vdif_header *header, int framepersec);
+
+
+struct vdif_mux_statistics {
+  /* The first 8 accumulate over multiple calls to vdifmux */
+  long long nValidFrame;		/* number of valid VDIF input frames encountered */
+  long long nInvalidFrame;		/* number of real VDIF frames discarded because of invalid bit being set */
+  long long nDiscardedFrame;		/* number of valid input frames discarded because of out-of-order issues */
+  long long nWrongThread;		/* number of otherwise good frames with incorrect thread */
+  long long nSkippedByte;		/* number of bytes skipped (interloper frames) */
+  long long nFillByte;			/* counts number of bytes skipped that were identified as fill pattern */
+  long long nDuplicateFrame;		/* number of frames found with the same time & thread */
+  long long bytesProcessed;		/* total bytes consumed from */
+  long long nGoodFrame;			/* number of fully usable output frames */
+  int nCall;				/* how many calls to vdifmux since last reset */
+
+  /* These remaining fields are set each time */
+  int srcSize;			/* length of input array (bytes) */
+  int srcUsed;			/* amount of input array consumed (bytes) */
+  int destSize;			/* length of output array (bytes) */
+  int destUsed;			/* amount of output array populated */
+  int inputFrameSize;		/* length in bytes of one input data frame (provided to call) */
+  int outputFrameSize;		/* length in bytes of one output data frame (calculated) */
+  int outputFrameGranularity;	/* number of output frames required to make an integer number of nanoseconds */
+  int outputFramesPerSecond;	/* from call */
+  int nOutputFrame;		/* length of usable output data measured in frames */
+  int epoch;			/* from first header */
+  long long startFrameNumber;
+
+  /* start time of output data */
+  /* duration of output data */
+};
+
+int vdifmux(unsigned char *dest, int nFrame, const unsigned char *src, int length, int inputFrameSize, int inputFramesPerSecond, int nBit, int nThread, const int *threadIds, int nSort, int nGap, long long startOutputFrameNumber, struct vdif_mux_statistics *stats);
+
+void printvdifmuxstatistics(const struct vdif_mux_statistics *stats);
+
+void resetvdifmuxstatistics(struct vdif_mux_statistics *stats);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/File_registry.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/File_registry.C
--- bl-dspsr-0+git20160405/Kernel/Formats/File_registry.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/File_registry.C	2018-03-12 23:02:35.000000000 +0000
@@ -125,6 +125,11 @@
 static dsp::File::Register::Enter<dsp::Mark5File> register_mark5;
 #endif
 
+#if HAVE_mark5b
+#include "dsp/Mark5bFile.h"
+static dsp::File::Register::Enter<dsp::Mark5bFile> register_mark5b;
+#endif
+
 #if HAVE_maxim
 #include "dsp/MaximFile.h"
 static dsp::File::Register::Enter<dsp::MaximFile> register_maxim;
@@ -180,6 +185,12 @@
 static dsp::File::Register::Enter<dsp::SpigotFile> register_spigot;
 #endif
 
+
+#if HAVE_emerlin
+#include "dsp/EmerlinFile.h"
+static dsp::File::Register::Enter<dsp::EmerlinFile> register_emerlin;
+#endif
+
 #if HAVE_vdif
 #include "dsp/VDIFFile.h"
 static dsp::File::Register::Enter<dsp::VDIFFile> register_vdif;
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/dsp/FITSOutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/dsp/FITSOutputFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/dsp/FITSOutputFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/dsp/FITSOutputFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -54,6 +54,12 @@
     //! Set the output filename convention
     void set_atnf ( bool );
 
+    //! Set output mangling
+    void set_mangle_output ( bool );
+
+    //! Set length of output file (seconds)
+    void set_max_length( double );
+
   protected:
 
     //! Need a custom implementation of operation to handle FITS I/O
@@ -71,6 +77,9 @@
     //! Write nbyte bytes with cfitsio
     virtual int64_t unload_bytes (const void* buffer, uint64_t bytes);
 
+    //! Interface to CFITSIO with error checking and bookkeeping
+    unsigned char* write_bytes (int colnum, int isub, int offset, unsigned bytes_to_write, unsigned char** buffer);
+
     //! samples per block (FITS row)
     unsigned nsblk;
 
@@ -89,6 +98,9 @@
     //! convenience store channel nuumber
     unsigned nchan;
 
+    //! maximum length of output file
+    double max_length;
+
     //! buffer for channels weights
     std::vector<float> dat_wts;
 
@@ -113,7 +125,10 @@
     unsigned offset;
 
     //! keep track of bytes written so far
-    uint64_t written;
+    int64_t written;
+
+    //! optional maximum bytes per file
+    int64_t max_bytes;
 
     //! set up buffers, etc.
     void initialize ();
@@ -124,6 +139,10 @@
     //! Use ATNF datestr convention
     bool use_atnf;
 
+    //! Use a mangled file name for output; rename on file close
+    bool mangle_output;
+    std::string mangled_output_filename;
+
   };
 
 }
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSDigitizer.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSDigitizer.C
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSDigitizer.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSDigitizer.C	2018-03-12 23:02:35.000000000 +0000
@@ -10,6 +10,7 @@
 #include "dsp/FITSDigitizer.h"
 #include "dsp/InputBuffering.h"
 #include <assert.h>
+#include <omp.h>
 
 void dsp::FITSDigitizer::set_digi_scales()
 {
@@ -375,7 +376,6 @@
   // with F in inner loop
   case TimeSeries::OrderTFP:
   {
-#pragma omp parallel for
     for (uint64_t idat=0; idat < ndat; idat++)
     {
       unsigned char* outptr = output->get_rawptr() + (idat*nchan*npol)/samp_per_byte;
@@ -607,6 +607,7 @@
     int bit_counter=0;
     unsigned inner_stride = nchan * npol;
     unsigned idx = 0, bit_shift = 0; // make gcc happy
+#pragma omp parallel for
     for (unsigned ichan=0; ichan < nchan; ichan++)
     {
       unsigned mapped_chan = channel (ichan);
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSFile.C
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -36,6 +36,7 @@
   : File("FITSFile")
 {
   current_byte = 0;
+  zero_off = 0;
 }
 
 bool dsp::FITSFile::is_valid (const char* filename) const
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSOutputFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSOutputFile.C
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSOutputFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSOutputFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -24,6 +24,7 @@
 #include "psrfitsio.h"
 
 #include <fcntl.h>
+#include <cstring>
 
 using namespace std;
 
@@ -104,7 +105,9 @@
   nbblk = 0;
   nbit = 2;
 
-  use_atnf = true;
+  use_atnf = false;       // ATNF date-string file naming is off unless set_atnf(true) is called
+  mangle_output = false;  // write directly to the final file name unless set_mangle_output(true) is called
+  max_length = 0; max_bytes = 0;  // no file-length limit by default; max_bytes is tested when the output file name is chosen, so it must start with a defined value
 }
 
 dsp::FITSOutputFile::~FITSOutputFile ()
@@ -112,11 +115,30 @@
   finalize_fits ();
 }
 
+unsigned char* dsp::FITSOutputFile::write_bytes (int colnum, int isub, int offset, unsigned bytes_to_write, unsigned char** buffer) { // write bytes_to_write bytes from *buffer via CFITSIO; throws FITSError on failure
+  int status = 0; // CFITSIO status; a non-zero value after the call signals an error
+  fits_write_col_byt (fptr, colnum, isub, offset, bytes_to_write, *buffer, &status);
+  if (status)
+    throw FITSError(status,"dsp::FITSOutputFile::write_bytes");
+  written += bytes_to_write; // bookkeeping: count only bytes that were written without error
+  return *buffer += bytes_to_write; // advance the caller's pointer past the written data and return it (control previously fell off the end of this non-void function)
+}
+
 void dsp::FITSOutputFile::set_atnf (bool _use_atnf)
 {
   use_atnf = _use_atnf;
 }
 
+void dsp::FITSOutputFile::set_mangle_output (bool _mangle_output)
+{
+  mangle_output = _mangle_output;  // when true, output goes to mangled_output_filename and finalize_fits renames it to output_filename
+}
+
+void dsp::FITSOutputFile::set_max_length ( double _max_length )
+{
+  max_length = _max_length;  // length of one output file in seconds; later converted to max_bytes, and 0 leaves file splitting disabled
+}
+
 void dsp::FITSOutputFile::set_nsblk (unsigned nblk)
 {
   if ( fptr && (nblk != nsblk) )
@@ -280,8 +302,8 @@
 
   // set_model must be called after the Integration::MJD has been set
 
-  //archive-> set_filename (get_filename (phase));
-  if (output_filename.empty())
+  // if using a maximum file size, re-generate file name
+  if (output_filename.empty() || max_bytes)
   {
     MJD epoch = get_input()->get_start_time();
     vector<char> buffer (FILENAME_MAX);
@@ -303,8 +325,17 @@
 		     "error MJD::datestr("+datestr_pattern+")");
     }
     output_filename = filename + get_extension();
+    if (mangle_output)
+    {
+      char buff [L_tmpnam];
+      tmpnam(buff);
+      mangled_output_filename = output_filename + (buff+strlen(buff)-6);
+    }
   }
-  archive -> unload (output_filename);
+  if (mangle_output)
+    archive -> unload (mangled_output_filename);
+  else
+    archive -> unload (output_filename);
 }
 
 void dsp::FITSOutputFile::write_row ()
@@ -347,8 +378,16 @@
     }
   }
 
+  // reset bytes written and current row, etc.
+  written = 0;
+  isub = 0;
+  offset = 0;
+
   int status = 0;
-  fits_open_file (&fptr,output_filename.c_str(), READWRITE, &status);
+  if (mangle_output)
+    fits_open_file (&fptr,mangled_output_filename.c_str(), READWRITE, &status);
+  else
+    fits_open_file (&fptr,output_filename.c_str(), READWRITE, &status);
   if (status)
     throw FITSError (status, "dsp::FITSOutputFile::initialize",
         "unable to open FITS file for writing");
@@ -392,6 +431,13 @@
 
   // TODO -- will need to fix this later on
   psrfits_update_key<int> (fptr, "NSUBOFFS", 0);
+
+  max_bytes = max_length*get_input()->get_rate() / (8/nbit) * nchan * npol;
+  if ( max_bytes && (max_bytes < nbblk) )
+    throw Error (InvalidState, "must set maximum file size > data block size (1 FITS row)" );
+  if ( max_bytes && (max_bytes % nbblk != 0))
+    cerr << "WARNING: maximum file size is not an integer number of data blocks; output files will not be contiguous under PSRFITS conventions." << endl;
+
 }
 
 void dsp::FITSOutputFile::operation ()
@@ -404,6 +450,27 @@
   if (verbose)
     cerr << "dsp::FITSOutputFile::operation" << endl;
 
+
+  // should handle both case where data block is larger than maximum file
+  // size and more typical case where file ends within a data block
+  if (max_bytes)
+  {
+    int64_t nbytes = get_input()->get_nbytes();
+    if (nbytes == 0) return;
+    // each write must resume where the previous one stopped, not restart at the beginning of the block
+    const unsigned char* next = get_input()->get_rawptr();
+    int64_t done = unload_bytes (next, std::min(max_bytes - written, nbytes)); // fill what is left of the current file
+    for (nbytes -= done; nbytes; nbytes -= done)
+    {
+      finalize_fits ();  // close the file that has just been filled
+      write_header ();   // create the next output file
+      initialize ();     // open it and reset the bookkeeping
+      // NB written will == 0 here
+      done = unload_bytes (next += done, std::min(max_bytes - written, nbytes)); // skip the bytes already written
+    }
+    return;
+  }
+
   unload_bytes (get_input()->get_rawptr(), get_input()->get_nbytes());
 
 }
@@ -424,7 +491,6 @@
          << " buffer=" << void_buffer << endl;
 
   unsigned to_write = bytes;
-  int status = 0;
   int colnum = dsp::get_colnum (fptr, "DATA");
   
   // write to incomplete block first
@@ -437,9 +503,7 @@
     // finish remainder of subint
     if (bytes >= remainder)
     {
-      fits_write_col_byt (fptr, colnum, isub, offset, remainder, 
-          buffer, &status);
-      buffer += remainder;
+      write_bytes (colnum, isub, offset, remainder, &buffer);
       to_write -= remainder;
       offset = 0;
     }
@@ -447,9 +511,7 @@
     // write all available bytes without advancing subint
     else
     {
-      fits_write_col_byt (fptr, colnum, isub, offset, bytes, 
-          buffer, &status);
-      written += bytes;
+      write_bytes (colnum, isub, offset, bytes, &buffer);
       offset += bytes;
       return bytes;
     }
@@ -467,9 +529,8 @@
     write_row ();
 
     // Now write that data into a subintegration in the PSRFITS file
-    fits_write_col_byt (fptr, colnum, isub, 1, nbblk, buffer, &status);
+    write_bytes (colnum, isub, 1, nbblk, &buffer);
     to_write -= nbblk;
-    buffer += nbblk;
   }
 
   // write out remaining bytes to partial subbint
@@ -477,7 +538,7 @@
   {
     isub += 1;
     write_row();
-    fits_write_col_byt (fptr, colnum, isub, 1, to_write, buffer, &status);
+    write_bytes (colnum, isub, 1, to_write, &buffer);
     offset += to_write;
   }
 
@@ -491,12 +552,20 @@
     cerr << "dsp::FITSOutputFile::finalize_fits" << endl;
   if (fptr) {
     psrfits_update_key<int> (fptr, "NAXIS2", isub);
-    psrfits_update_key<int> (fptr, "NSTOT", written * (8/nbit) );
+    int nstot = (written*8)/(npol * nchan * nbit);
+    psrfits_update_key<int> (fptr, "NSTOT", nstot );
+    int nsuboffs =  get_input()->get_input_sample()/nsblk - written/nbblk;
+    psrfits_update_key<int> (fptr, "NSUBOFFS", nsuboffs);
     int status = 0;
     fits_close_file(fptr, &status);
-    if (status)
-      throw FITSError(status, "dsp::FITSOutputFile");
     fptr = NULL;
+    if (status)
+      throw FITSError(status, "dsp::FITSOutputFile::finalize_fits");
+    if (mangle_output)
+    {
+      if (rename( mangled_output_filename.c_str(), output_filename.c_str()))
+        throw Error(FailedSys, "dsp::FITSOutputFile::finalize_fits");
+    }
   }
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSUnpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -53,14 +53,14 @@
 
 void dsp::FITSUnpacker::unpack()
 {
-  if (verbose) {
-    cerr << "dsp::FITSUnpacker::unpack" << endl;
-  }
 
   // Allocate mapping method to use depending on how many bits per value.
   BitNumberFn p;
   const unsigned nbit = input->get_nbit();
 
+  if (verbose)
+    cerr << "dsp::FITSUnpacker::unpack with nbit=" << nbit << endl;
+
   switch (nbit) {
     case 1:
       p = &dsp::FITSUnpacker::oneBitNumber;
@@ -83,6 +83,13 @@
   const unsigned nchan = input->get_nchan();
   const unsigned ndat  = input->get_ndat();
 
+  // Make sure scales and offsets exist
+  if (dat_scl.size() == 0)
+  {
+    dat_scl.assign(nchan,1);
+    dat_offs.assign(nchan,0);
+  }
+
   // Number of samples in one byte.
   const int samples_per_byte = BYTE_SIZE / nbit;
   const int mod_offset = samples_per_byte - 1;
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/fits/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -11,9 +11,9 @@
 #############################################################################
 #
 
-libfits_la_LIBADD = @CFITSIO_LIBS@
+libfits_la_LIBADD = @CFITSIO_LIBS@ -lgomp
 
 include $(top_srcdir)/config/Makefile.include
 
-AM_CPPFLAGS += @CFITSIO_CFLAGS@
+AM_CPPFLAGS += @CFITSIO_CFLAGS@ -fopenmp
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h,v $
-   $Revision: 1.1 $
-   $Date: 2011/05/08 07:02:00 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h
 
 
 #ifndef __GMRTBinaryFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h
--- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h,v $
-   $Revision: 1.2 $
-   $Date: 2009/03/10 06:26:05 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h
 
 #ifndef __GMRTFilterbank16_h
 #define __GMRTFilterbank16_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/03/02 17:27:35 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h
 
 
 #ifndef __GMRTFilterbankFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/07/15 04:55:14 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h
 
 #ifndef __GMRTUnpacker_h
 #define __GMRTUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/guppi/fitshead.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/fitshead.h
--- bl-dspsr-0+git20160405/Kernel/Formats/guppi/fitshead.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/fitshead.h	2018-03-12 23:02:35.000000000 +0000
@@ -109,7 +109,8 @@
 
     char* hgetc(                /* Return pointer to value for FITS keyword */
         const char* hstring,    /* FITS header string */
-        const char* keyword);   /* FITS keyword */
+        const char* keyword,    /* FITS keyword */
+        char * value_buffer);   /* caller provided buffer to make re-entrant */
 
     char* ksearch(              /* Return pointer to keyword in FITS header */
         const char* hstring,    /* FITS header string */
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/guppi/hget.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/hget.c
--- bl-dspsr-0+git20160405/Kernel/Formats/guppi/hget.c	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/hget.c	2018-03-12 23:02:35.000000000 +0000
@@ -42,7 +42,7 @@
  * Subroutine:  hgetm  (hstring,keyword, lstr, str) returns multi-keyword string
  * Subroutine:  hgetdate (hstring,keyword,date) returns date as fractional year
  * Subroutine:  hgetndec (hstring, keyword, ndec) returns number of dec. places
- * Subroutine:  hgetc  (hstring,keyword) returns character string
+ * Subroutine:  hgetc  (hstring,keyword,value_buffer) returns character string
  * Subroutine:  blsearch (hstring,keyword) returns pointer to blank lines
                 before keyword
  * Subroutine:  ksearch (hstring,keyword) returns pointer to header string entry
@@ -179,9 +179,10 @@
     int lval;
     char *dchar;
     char val[VLENGTH+1];
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII to binary */
     if (value != NULL) {
@@ -239,9 +240,10 @@
     int lval;
     char *dchar;
     char val[VLENGTH+1];
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII to binary */
     if (value != NULL) {
@@ -296,9 +298,10 @@
     int lval;
     char *dchar;
     char val[VLENGTH+1];
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* translate value from ASCII to binary */
     if (value != NULL) {
@@ -342,9 +345,10 @@
 double *dval;   /* Right ascension in degrees (returned) */
 {
     char *value;
+    char value_buffer[VLENGTH + 1];
 
     /* Get value from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII colon-delimited string to binary */
     if (value != NULL) {
@@ -371,9 +375,10 @@
 double *dval;   /* Right ascension in degrees (returned) */
 {
     char *value;
+    char value_buffer[VLENGTH + 1];
 
     /* Get value from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII colon-delimited string to binary */
     if (value != NULL) {
@@ -434,9 +439,10 @@
     int lval;
     char *dchar;
     char val[VLENGTH+1];
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII to binary */
     if (value != NULL) {
@@ -483,9 +489,10 @@
     char newval;
     int lval;
     char val[VLENGTH+1];
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII to binary */
     if (value != NULL) {
@@ -528,9 +535,10 @@
     int year, month, day, yday, i, hours, minutes;
     //static int mday[12] = {31,28,31,30,31,30,31,31,30,31,30,31};
     int mday[12] = {31,28,31,30,31,30,31,31,30,31,30,31};
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Translate value from ASCII to binary */
     if (value != NULL) {
@@ -721,10 +729,11 @@
     /* Loop through sequentially-named keywords */
     multiline = 1;
     for (ikey = 1; ikey < 500; ikey++) {
+        char value_buffer[VLENGTH + 1];
         sprintf (keywordi, keyform, keyword, ikey);
 
         /* Get value for this keyword */
-        value = hgetc (hstring, keywordi);
+        value = hgetc (hstring, keywordi, value_buffer);
         if (value != NULL) {
             lval = strlen (value);
             if (lval < lstri)
@@ -803,9 +812,10 @@
 {
     char *value;
     int lval;
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     if (value != NULL) {
         lval = strlen (value);
@@ -838,9 +848,10 @@
 {
     char *value;
     int i, nchar;
+    char value_buffer[VLENGTH + 1];
 
     /* Get value and comment from header string */
-    value = hgetc (hstring,keyword);
+    value = hgetc(hstring, keyword, value_buffer);
 
     /* Find end of string and count backward to decimal point */
     *ndec = 0;
@@ -861,7 +872,7 @@
 /* Extract character value for variable from FITS header string */
 
 char *
-hgetc (hstring,keyword0)
+hgetc (hstring,keyword0,value_buffer)
 
 const char *hstring;    /* character string containing FITS header information
                    in the format <keyword>= <value> {/ <comment>} */
@@ -870,9 +881,10 @@
                    line beginning with this string.  if "[n]" is present,
                    the n'th token in the value is returned.
                    (the first 8 characters must be unique) */
+char * value_buffer;
 {
     //static char cval[80];
-    char cval[80];
+    char *cval;
     char *value;
     char cwhite[2];
     char squot[2], dquot[2], lbracket[2], rbracket[2], slash[2], comma[2];
@@ -890,6 +902,11 @@
 
     if( !use_saolib ){
 #endif
+    if (value_buffer == NULL)
+    {
+        return NULL;
+    }
+    cval = value_buffer;
 
     squot[0] = (char) 39;
     squot[1] = (char) 0;
@@ -1326,7 +1343,7 @@
 
 {
     double dec;         /* Declination in degrees (returned) */
-    double deg, min, sec, sign;
+    double deg, min=0.0, sec, sign;
     char *value, *c1, *c2;
     int lval;
     char *dchar;
@@ -1921,4 +1938,5 @@
  * Feb 28 2007  If header length is not set in hlength, set it to 0
  * May 31 2007  Add return value of 3 to isnum() if string has colon(s)
  * Aug 22 2007  If closing quote not found, make one up
+ * Sep  6 2016  Added third arg to hgetc() to correct a 'return ptr to stack' issue.
  */
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,45 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __dsp_MeerKATUnpackerCUDA_h
+#define __dsp_MeerKATUnpackerCUDA_h
+
+#include "dsp/MeerKATUnpacker.h"
+
+#include <cuda_runtime.h>
+
+namespace CUDA
+{
+  //! CUDA implementation of the dsp::MeerKATUnpacker::Engine interface
+  class MeerKATUnpackerEngine : public dsp::MeerKATUnpacker::Engine
+  {
+  public:
+
+    //! Construct from a CUDA stream (presumably kept in the stream member; confirm in the .cu file)
+    MeerKATUnpackerEngine (cudaStream_t stream);
+    //! Implements Engine::setup; the work performed is defined in the implementation file
+    void setup ();
+    //! Implements Engine::get_device_supported for the given memory manager
+    bool get_device_supported (dsp::Memory* memory) const;
+    //! Implements Engine::set_device for the given memory manager
+    void set_device (dsp::Memory* memory);
+    //! Implements Engine::unpack: takes a scale factor, the input BitSeries and the output TimeSeries
+    void unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output);
+
+  protected:
+    //! CUDA stream handle
+    cudaStream_t stream;
+    //! CUDA device properties structure
+    struct cudaDeviceProp gpu;
+    //! BitSeries held by the engine (presumably a staging buffer for input data; confirm in the .cu file)
+    dsp::BitSeries staging;
+
+  };
+}
+
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,80 @@
+/*
+
+ */
+
+#ifndef __dsp_MeerKATUnpacker_h
+#define __dsp_MeerKATUnpacker_h
+
+#include "dsp/EightBitUnpacker.h"
+
+namespace dsp {
+  
+  class MeerKATUnpacker : public HistUnpacker  // HistUnpacker subclass with an optional alternate processing Engine
+  {
+  public:
+
+    //! Constructor
+    MeerKATUnpacker (const char* name = "MeerKATUnpacker");
+    ~MeerKATUnpacker ();
+
+    bool get_order_supported (TimeSeries::Order order) const;  // true if the given output order is supported
+    void set_output_order (TimeSeries::Order order);           // select the order of the output TimeSeries
+
+    //! Map a digitizer index (idig) onto its output offset, polarization and channel
+    unsigned get_output_offset (unsigned idig) const;
+    unsigned get_output_ipol (unsigned idig) const;
+    unsigned get_output_ichan (unsigned idig) const;
+
+    //! Cloner (calls new)
+    virtual MeerKATUnpacker * clone () const;
+
+    //! Return true if the unpacker can operate on the specified device
+    bool get_device_supported (Memory*) const;
+
+    //! Set the device on which the unpacker will operate
+    void set_device (Memory*);
+
+    //! Engine used to unpack the data
+    class Engine;
+
+    void set_engine (Engine*);  // supply the alternate processing engine
+
+  protected:
+
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
+
+    Reference::To<BitTable> table;  // bit lookup table used by the unpacker
+
+    //! Return true if we can convert the Observation
+    bool matches (const Observation* observation);
+
+    void unpack ();  // perform the unpacking; defined in the implementation file
+
+  private:
+
+    bool device_prepared;  // presumably records whether set_device has completed -- confirm in the .C file
+
+    int8_t * tfp_buffer;  // raw buffer pointer; NOTE(review): name suggests TFP-ordered scratch space -- confirm usage and ownership
+
+    size_t tfp_buffer_size;  // presumably the size of tfp_buffer -- confirm units (bytes or elements)
+
+  };
+
+  class MeerKATUnpacker::Engine : public Reference::Able
+  {
+  public:
+    //! Prepare the engine; implementations define what preparation is needed
+    virtual void setup() = 0;
+    //! Unpack the input BitSeries into the output TimeSeries using the given scale factor
+    virtual void unpack(float scale, const BitSeries * input, TimeSeries * output) = 0;
+    //! Report whether the engine supports the given memory manager
+    virtual bool get_device_supported (Memory* memory) const = 0;
+    //! Configure the engine for the given memory manager
+    virtual void set_device (Memory* memory) = 0;
+
+  };
+
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7Unpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7Unpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7Unpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7Unpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -127,7 +127,7 @@
       for (unsigned ipol=0; ipol<npol; ipol++)
       {
         into = output->get_datptr (ichan, ipol) + (iblock*256);
-	for (unsigned isamp=0; isamp<256; isamp++)
+        for (unsigned isamp=0; isamp<256; isamp++)
         {
           into[isamp] = (float) from[isamp];
         }
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7UnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7UnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7UnpackerCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7UnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -13,11 +13,9 @@
 
 #include <cuComplex.h>
 
-//#define _DEBUG 
-
 using namespace std;
 
-void check_error (const char*);
+void check_error_stream (const char*, cudaStream_t);
 
 // each thread unpacks samples so that 1 warp does 128 contiguous samples
 __global__ void kat7_unpack_fpt_kernel (const uint64_t ndat, float scale, const int16_t * input, cuFloatComplex * output)
@@ -83,7 +81,7 @@
   // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory
 
   if (dsp::Operation::record_time || dsp::Operation::verbose)
-    check_error ("kat7_unpack_fpt_kernel");
+    check_error_stream ("kat7_unpack_fpt_kernel", stream);
 
   // put it here for now
   cudaStreamSynchronize(stream);
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -1,14 +1,14 @@
 
 noinst_LTLIBRARIES = libkat.la
 
-nobase_include_HEADERS =  dsp/KAT7Unpacker.h
+nobase_include_HEADERS =  dsp/KAT7Unpacker.h dsp/MeerKATUnpacker.h
 
-libkat_la_SOURCES = KAT7Unpacker.C
+libkat_la_SOURCES = KAT7Unpacker.C MeerKATUnpacker.C
 
 if HAVE_CUDA
 
-nobase_include_HEADERS += dsp/KAT7UnpackerCUDA.h
-libkat_la_SOURCES += KAT7UnpackerCUDA.cu
+nobase_include_HEADERS += dsp/KAT7UnpackerCUDA.h dsp/MeerKATUnpackerCUDA.h
+libkat_la_SOURCES += KAT7UnpackerCUDA.cu MeerKATUnpackerCUDA.cu
 
 endif
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,262 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "dsp/MeerKATUnpacker.h"
+#include "dsp/BitTable.h"
+
+#include "Error.h"
+
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/MeerKATUnpackerCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
+#include <string.h>
+#include <errno.h>
+
+using namespace std;
+
+static void* const undefined_stream = (void *) -1;
+
+dsp::MeerKATUnpacker::MeerKATUnpacker (const char* _name) : HistUnpacker (_name)
+{
+  if (verbose)
+  {
+    cerr << "dsp::MeerKATUnpacker ctor" << endl;
+  }
+
+  // 8-bit samples: 256 histogram states and a twos-complement bit table
+  set_nstate (256);
+  table = new BitTable (8, BitTable::TwosComplement);
+
+  // initially: no TFP buffer, no engine, and the device not yet prepared
+  tfp_buffer_size = 0;
+  tfp_buffer = 0;
+  engine = 0;
+  device_prepared = false;
+}
+
+dsp::MeerKATUnpacker::~MeerKATUnpacker ()
+{ // empty body: no explicit clean-up is performed here
+}
+
+//! Return true if the unpacker supports the specified output order
+bool dsp::MeerKATUnpacker::get_order_supported (TimeSeries::Order order) const
+{
+  // only OrderFPT is advertised as supported (OrderTFP is not)
+  return TimeSeries::OrderFPT == order;
+}
+
+//! Record the requested dimension order of the output TimeSeries (stored as given)
+void dsp::MeerKATUnpacker::set_output_order (TimeSeries::Order order)
+{
+  this->output_order = order;
+}
+
+
+/*! Even digitizer channels have offset 0, odd ones offset 1 (the quadrature components alternate) */
+unsigned dsp::MeerKATUnpacker::get_output_offset (unsigned idig) const
+{
+  return (idig & 1u) ? 1u : 0u;
+}
+
+/*! Within each group of four digitizer channels, the first two are poln0 and the last two poln1 */
+unsigned dsp::MeerKATUnpacker::get_output_ipol (unsigned idig) const
+{
+  return (idig / 2) % 2;
+}
+
+/*! Four digitizer channels (quadrature, dual pol) make up each output chan */
+unsigned dsp::MeerKATUnpacker::get_output_ichan (unsigned idig) const
+{
+  return idig >> 2;  // same as idig / 4 for an unsigned value
+}
+
+dsp::MeerKATUnpacker * dsp::MeerKATUnpacker::clone () const
+{ // returns a heap-allocated copy made with the copy constructor
+  return new MeerKATUnpacker (*this);
+}
+
+void dsp::MeerKATUnpacker::set_engine (Engine* _engine)
+{ // stores the engine only; set_device and setup are not called on it here
+  engine = _engine;
+}
+
+//! Return true if the unpacker can operate on the specified device
+bool dsp::MeerKATUnpacker::get_device_supported (Memory* memory) const
+{
+#if HAVE_CUDA
+  // Only CUDA device memory is supported.  The query is answered by a
+  // temporary engine on the default stream (0); it lives on the stack so
+  // that it is destroyed on return instead of being leaked from the heap.
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (gpu_mem)
+  {
+    CUDA::MeerKATUnpackerEngine tmp (0);
+    return tmp.get_device_supported (memory);
+  }
+#endif
+  // any other memory type, or a build without CUDA support
+  return false;
+}
+
+//! Select the memory on which the unpacker operates, creating a CUDA engine for device memory
+void dsp::MeerKATUnpacker::set_device (Memory* memory)
+{
+#if HAVE_CUDA
+  // device memory: create an engine that uses the stream of that memory
+  CUDA::DeviceMemory * device_memory = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (device_memory)
+    set_engine (new CUDA::MeerKATUnpackerEngine(device_memory->get_stream()));
+#endif
+
+  if (verbose)
+    cerr << "dsp::MeerKATUnpacker::set_device" << endl;
+
+  // an engine, when present, is configured here; otherwise the base class handles it
+  if (engine)
+  {
+    engine->set_device(memory);
+    engine->setup();
+  }
+  else
+    Unpacker::set_device (memory);
+  device_prepared = true;
+}
+
+bool dsp::MeerKATUnpacker::matches (const Observation* observation)
+{
+  // accept only 8-bit, ndim=2 "MeerKAT" data with one or two polarisations
+  if (observation->get_machine() != "MeerKAT" || observation->get_ndim() != 2) return false;
+  const bool one_or_two_pol = observation->get_npol() == 2 || observation->get_npol() == 1;
+  return one_or_two_pol && observation->get_nbit() == 8;
+}
+
+void dsp::MeerKATUnpacker::unpack ()
+{
+  const uint64_t ndat  = input->get_ndat();
+  if (ndat == 0)
+    return;
+  // when an engine has been set, the whole operation is delegated to it
+  if (engine)
+  {
+    if (verbose)
+      cerr << "dsp::MeerKATUnpacker::unpack using Engine" << endl;
+    engine->unpack(table->get_scale(), input, output);
+    return;
+  }
+
+  // some programs (digifil) do not call set_device
+  if ( ! device_prepared )
+    set_device ( Memory::get_manager ());
+  // each 16-bit input word is split into two signed bytes via from16/from8
+  int16_t * from = (int16_t *) input->get_rawptr();
+  int16_t from16;
+  int8_t * from8 = (int8_t * ) &from16;
+  float * into;
+  const float scale = table->get_scale();
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+  const unsigned ndim  = 2;
+  const unsigned nsamp_per_heap = 256;
+  const unsigned nheap = ndat / nsamp_per_heap;
+  const float* lookup = table->get_values ();
+
+  // data is stored as sample blocks of FPT ordered data
+  const uint64_t nval = nsamp_per_heap * ndim;
+  // note: only whole heaps are processed; ndat % nsamp_per_heap samples are skipped
+  if (verbose)
+    cerr << "dsp::MeerKATUnpacker::unpack nheap=" << nheap << " ndat=" << ndat << " nchan=" << nchan
+         << " npol=" << npol << " nval=" << nval << endl;
+
+  unsigned long * digs[2];
+
+  switch ( output->get_order() )
+  {
+    case TimeSeries::OrderFPT:
+    {
+#ifdef _DEBUG
+      cerr << "dsp::MeerKATUnpacker::unpack TimeSeries::OrderFPT" << endl;
+#endif
+      for (unsigned iheap=0; iheap<nheap; iheap++)
+      {
+        for (unsigned ipol=0; ipol<npol; ipol++)
+        {
+          for (unsigned ichan=0; ichan<nchan; ichan++)
+          {
+            unsigned idig = ichan*ndim*npol + ipol*ndim;
+            digs[0] = get_histogram (idig+0);
+            digs[1] = get_histogram (idig+1);
+            into = output->get_datptr (ichan, ipol) + iheap*nsamp_per_heap * ndim;
+
+            for (unsigned isamp=0; isamp<nsamp_per_heap; isamp++)
+            {
+              from16 = from[isamp];
+              digs[0][(int) from8[0] + 128]++;
+              digs[1][(int) from8[1] + 128]++;
+              into[2*isamp+0] = (float(from8[0]) + 0.5) * scale;
+              into[2*isamp+1] = (float(from8[1]) + 0.5) * scale;
+            }
+            from += nsamp_per_heap;
+          }
+        }
+      }
+    }
+    break;
+    case TimeSeries::OrderTFP:
+    {
+#ifdef _DEBUG
+      cerr << "dsp::MeerKATUnpacker::unpack TimeSeries::OrderTFP" << endl;
+#endif
+      into = output->get_dattfp();
+      const unsigned heap_stride = nchan * npol * ndim * nsamp_per_heap;
+      const unsigned into_stride = nchan * npol * ndim;
+      for (unsigned iheap=0; iheap<nheap; iheap++)
+      {
+        // stage the heap locally only when a buffer exists (memcpy to a null pointer is undefined)
+        if (tfp_buffer && tfp_buffer_size) memcpy (tfp_buffer, from, tfp_buffer_size);
+
+        for (unsigned ipol=0; ipol<npol; ipol++)
+        {
+          for (unsigned ichan=0; ichan<nchan; ichan++)
+          {
+            // histograms are not accumulated for this output order, so no
+            // digitizer index is computed here
+
+
+            float * into_ptr = into + (ichan*npol*ndim) + (ipol*ndim);
+
+            for (unsigned isamp=0; isamp<nsamp_per_heap; isamp++)
+            {
+              from16 = from[isamp];
+              // from8[0] and from8[1] are the first and second byte of the
+              // word just copied into from16
+              into_ptr[0] = (float(from8[0]) + 0.5) * scale;
+              into_ptr[1] = (float(from8[1]) + 0.5) * scale;
+
+              into_ptr += into_stride;
+            }
+            from += nsamp_per_heap;
+          }
+        }
+        into += heap_stride;
+      }
+    }
+    break;
+    default:
+      throw Error (InvalidState, "dsp::MeerKATUnpacker::unpack",
+                   "unrecognized output order");
+    break;
+
+  }
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpackerCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,114 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Willem van Straten and Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/MeerKATUnpackerCUDA.h"
+#include "dsp/Operation.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "Error.h"
+
+#include <cuComplex.h>
+
+using namespace std;
+
+void check_error_stream (const char*, cudaStream_t);
+
+// each thread unpacks 1 complex sample; all index arithmetic is done in 64 bits
+__global__ void meerkat_unpack_fpt_kernel (const uint64_t ndat, float scale, const char2 * input, cuFloatComplex * output, uint64_t ostride)
+{
+  // blockIdx.x is the heap number, threadIdx.x is the sample number in the heap
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= ndat)
+    return;
+
+  const uint64_t ichanpol    = (uint64_t) blockIdx.y * gridDim.z + blockIdx.z; // ichan * npol + ipol
+  const uint64_t pol_stride  = (uint64_t) gridDim.y * blockDim.x;   // nchan * heap_size
+  const uint64_t heap_stride = gridDim.z * pol_stride;  // npol * pol_stride
+  // strides are 64-bit so that the products below cannot wrap around at 2^32 samples
+  //                    iheap                        ipol                        ichan      * heap_size
+  const uint64_t idx = (blockIdx.x * heap_stride) + (blockIdx.z * pol_stride) + (blockIdx.y * blockDim.x) + threadIdx.x;
+  const uint64_t odx = (ichanpol * ostride) + idat;
+
+  char2 in16 = input[idx];
+
+  cuFloatComplex out64;
+  out64.x  = ((float) in16.x + 0.5) * scale;
+  out64.y  = ((float) in16.y + 0.5) * scale;
+
+  output[odx] = out64;
+}
+
+CUDA::MeerKATUnpackerEngine::MeerKATUnpackerEngine (cudaStream_t _stream)
+{
+  this->stream = _stream;  // the stream on which unpack launches its kernel
+}
+
+void CUDA::MeerKATUnpackerEngine::setup ()
+{
+  // cache the properties of the current CUDA device in gpu (return codes are not checked)
+  int device = 0;  // initialised so that a failed cudaGetDevice cannot leave it indeterminate
+  cudaGetDevice(&device);
+  cudaGetDeviceProperties (&gpu, device);
+}
+
+bool CUDA::MeerKATUnpackerEngine::get_device_supported (dsp::Memory* memory) const
+{ // supported exactly when the memory manager is CUDA device memory
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory ) != 0;
+}
+
+void CUDA::MeerKATUnpackerEngine::set_device (dsp::Memory* memory)
+{
+  // currently a no-op: the memory argument is not used and no staging
+  // buffer is configured (code that called staging.set_memory is disabled)
+}
+
+
+void CUDA::MeerKATUnpackerEngine::unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output)
+{
+  const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+  const unsigned npol = input->get_npol();
+
+#ifdef _DEBUG
+  cerr << "CUDA::MeerKATUnpackerEngine::unpack scale=" << scale 
+       << " ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim 
+       << " npol=" << npol << endl;
+#endif
+
+  // the raw input is reinterpreted as char2 and handed to the kernel (no copy is made here)
+  char2 * from   = (char2 *) input->get_rawptr();
+  // output pointer for (chan 0, pol 0) and the pol-to-pol distance in units of ndim floats
+  cuFloatComplex * into = (cuFloatComplex *) output->get_datptr(0, 0);
+  size_t pol_span = (output->get_datptr(0, 1) - output->get_datptr(0,0)) / ndim;
+  // NOTE(review): get_datptr(0, 1) presumes a second polarisation -- confirm for npol == 1
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::MeerKATUnpackerEngine::unpack from=" << (void *) from
+         << " to=" << (void *) into << " pol_span=" << pol_span << endl;
+
+  // one thread per sample, since 256 samples per heap
+  int nthread = 256;
+
+  // grid: x covers ndat in steps of nthread (rounded up below), y = nchan, z = npol
+  dim3 blocks = dim3 (ndat / nthread, nchan, npol);
+
+  if (ndat % nthread != 0)
+    blocks.x++;
+
+#ifdef _DEBUG
+  cerr << "CUDA::MeerKATUnpackerEngine::unpack meerkat_unpack ndat=" << ndat 
+       << " scale=" << scale << " input=" << (void*) input << " nblock=(" 
+       << blocks.x << "," << blocks.y << "," << blocks.z << ")" << " nthread=" << nthread << endl;
+#endif
+
+  meerkat_unpack_fpt_kernel<<<blocks,nthread,0,stream>>> (ndat, scale, from, into, pol_span);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("CUDA::MeerKATUnpackerEngine::unpack", stream);
+}
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr/dsp/SMROFile.h,v $
-   $Revision: 1.6 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/lbadr/dsp/SMROFile.h
 
 #ifndef __SMROFile_h
 #define __SMROFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h,v $
-   $Revision: 1.6 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h
 
 #ifndef __SMROTwoBitCorrection_h
 #define __SMROTwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_File.h,v $
-   $Revision: 1.3 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_File.h
 
 #ifndef __LBADR64_File_h
 #define __LBADR64_File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h,v $
-   $Revision: 1.1 $
-   $Date: 2007/02/23 04:29:36 $
-   $Author: ahotan $ */
+// dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h
 
 #ifndef __LBADR64_TwoBitCorrection_h
 #define __LBADR64_TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/LOFAR_DALUnpacker.h,v $
-   $Revision: 1.1 $
-   $Date: 2011/08/01 10:07:00 $
-   $Author: straten $ */
+// dspsr/Kernel/Classes/dsp/LOFAR_DALUnpacker.h
 
 #ifndef __LOFAR_DALUnpacker_h
 #define __LOFAR_DALUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -5,11 +5,12 @@
 
 liblofar_dal_la_SOURCES = LOFAR_DALFile.C LOFAR_DALUnpacker.C 
 
-liblofar_dal_la_LIBADD = -L$(LOFAR_DAL_INSTALL_PREFIX)/lib -llofardal -lhdf5
+liblofar_dal_la_LIBADD = -L$(LOFAR_DAL_INSTALL_PREFIX)/lib -llofardal @HDF5_LDFLAGS@ @HDF5_LIBS@
 
 #############################################################################
 
 include $(top_srcdir)/config/Makefile.include
 
-AM_CPPFLAGS += -I$(LOFAR_DAL_INSTALL_PREFIX)/include/dal
+AM_CPPFLAGS += -I$(LOFAR_DAL_INSTALL_PREFIX)/include/dal @HDF5_CPPFLAGS@
+
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -10,5 +10,5 @@
 
 include $(top_srcdir)/config/Makefile.include
 
-AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CFITSIO_CFLAGS@ @GUPPI_DAQ_CFLAGS@
+AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CFITSIO_CFLAGS@ @GUPPI_DAQ_CFLAGS@ @CUDA_CFLAGS@
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4File.h,v $
-   $Revision: 1.5 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/mark4/dsp/Mark4File.h
 
 #ifndef __Mark4File_h
 #define __Mark4File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:08 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h
 
 #ifndef __Mark4TwoBitCorrection_h
 #define __Mark4TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:08 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h
 
 
 #ifndef __Mark4TwoBitTable_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h,v $
-   $Revision: 1.5 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h
 
 #ifndef __Mark5TwoBitCorrection_h
 #define __Mark5TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/dsp/Mark5bFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/dsp/Mark5bFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/dsp/Mark5bFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/dsp/Mark5bFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -1,7 +1,7 @@
 //-*-C++-*-
 /***************************************************************************
  *
- *   Copyright (C) 2015 by Stuart Weston and Willem van Straten
+ *   Copyright (C) 2016 by Willem van Straten
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
@@ -13,7 +13,7 @@
 
 namespace dsp {
 
-  //! Loads BitSeries data from a MkV file
+  //! Loads BitSeries data from a MkV file using the mark5access library
   class Mark5bFile : public BlockFile
   {
   public:
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -3,10 +3,13 @@
 
 nobase_include_HEADERS = dsp/Mark5bFile.h dsp/Mark5bUnpacker.h
 
-libmark5_la_SOURCES = Mark5bFile.C Mark5bUnpacker.C
+libmark5b_la_SOURCES = Mark5bFile.C Mark5bUnpacker.C
+
+libmark5b_la_LIBADD = @MARK5ACCESS_LIBS@
 
 #############################################################################
 #
 
 include $(top_srcdir)/config/Makefile.include
 
+AM_CPPFLAGS += @MARK5ACCESS_CFLAGS@
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bFile.C
--- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -1,30 +1,23 @@
 /***************************************************************************
  *
- *   Copyright (C) 2015 by Stuart Weston and Willem van Straten
+ *   Copyright (C) 2016 by Willem van Straten
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
 
-using namespace std;
-
 #include "dsp/Mark5bFile.h"
-#include "vlba_stream.h"
 #include "Error.h"
 
 #include "coord.h"
 #include "strutil.h"	
 #include "ascii_header.h"
 
-#include <iomanip>
+#include <mark5access.h>
 
-#include <time.h>
-#include <errno.h>
+#include <memory>
 #include <stdio.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <string.h>
+
+using namespace std;
 
 dsp::Mark5bFile::Mark5bFile (const char* filename,const char* headername)
   : BlockFile ("Mark5b")
@@ -43,8 +36,10 @@
   headername += ".hdr";
 
   FILE* fptr = fopen (headername.c_str(), "r");
-  if( !fptr ) {
-      if (verbose) cerr << "Mark5bFile: no hdr file (" << headername << ")" << endl;
+  if( !fptr )
+  {
+    if (verbose)
+      cerr << "Mark5bFile: no hdr file (" << headername << ")" << endl;
     return false;
   }
 
@@ -52,8 +47,8 @@
   fread (header.get(), sizeof(char),1024, fptr);
   fclose (fptr);
 
-  int dummy_fanout = 0;
-  if (ascii_header_get (header.get(), "FANOUT", "%d", &dummy_fanout) < 0)
+  char dummy_format[64];
+  if (ascii_header_get (header.get(), "FORMAT", "%63s", dummy_format) < 0)
     return false;
 	
   return true;
@@ -76,48 +71,51 @@
   fclose (ftext);
 
   // ///////////////////////////////////////////////////////////////
-  //  NBIT
+  //  FORMAT
   //
-  int nbit = 0;
-  if (ascii_header_get (header,"NBIT","%d",&nbit) < 0)
+  char format[64];
+  if (ascii_header_get (header,"FORMAT","%63s",format) < 0)
    throw Error (InvalidParam, "Mark5bFile::open_file", 
-		 "failed read NBIT");
+		 "failed read FORMAT");
 	
-  cerr << "NBIT = " << nbit << endl;
-  get_info()->set_nbit (nbit);
-
+  cerr << "FORMAT = " << format << endl;
 
-  // ///////////////////////////////////////////////////////////////
-  //  FANOUT
-  //
-  int fanout = 0;
-  if (ascii_header_get (header,"FANOUT","%d",&fanout) < 0)
-   throw Error (InvalidParam, "Mark5bFile::open_file", 
-		 "failed read FANOUT");
-	
-  cerr << "FANOUT = " << fanout << endl;
+  /* From the mark5access library documentation:
 
-  struct VLBA_stream* vlba_stream = 0;
+     3.3.1 struct mark5_format_generic* 
+                  new_mark5_format_from_string(const char *formatname)
 
-  stream = vlba_stream = VLBA_stream_open (filename, nbit, fanout, 0);
-
-  if (!stream)
-    throw Error (InvalidParam, "Mark5bFile::open_file",
-		 "failed VLBA_stream_open");
+     A function to create a (struct mark5_format_generic) representing
+     one of the built-in formats.  The string pointed to by
+     "formatname" should be of the form: FORMAT-Mbps-nChannels-nBits.
+     Examples for the three formats currently built into mark5acces
+     include: "VLBA1_4-256-4-2", "MKIV1_2-128-8-2",
+     "Mark5B-1024-16-2".  Note that the string is case insensitive.
+     Also note here that in the case of VLBA and Mark4 (MKIV) the
+     fanout is built into the FORMAT portion of "formatname".
+  */
+
+  struct mark5_format_generic* m5format = 0;
+  m5format = new_mark5_format_generic_from_string (format);
+  if (!m5format)
+    throw Error (FailedCall, "Mark5bFile::open_file",
+		 "failed new_mark5_format_generic_from_string (%s)", format);
 
   fd = 0;
 
-  // instruct the loader to only take gulps in 32/16 lots of nbits
-  // necessary since Mk5 files are written in 64-/32-bit words
-  cerr << "TRACKS = " << vlba_stream->tracks << endl;
-  Input::resolution = vlba_stream->tracks / nbit;  
-
-  // The factor of 2 should only apply for dual-pol data.
-  cerr << "NCHAN = " << vlba_stream->nchan / 2 << endl;
-  get_info()->set_nchan( vlba_stream->nchan / 2 ); 
+  struct mark5_stream_generic* m5file = 0;
+  m5file = new_mark5_stream_file (filename, 0);
+  if (!m5file)
+    throw Error (FailedCall, "Mark5bFile::open_file",
+		 "failed new_mark5_stream_file (%s)", filename);
+
 
-  cerr << "SAMPRATE = " << vlba_stream->samprate << endl;
-  get_info()->set_rate ( vlba_stream->samprate );
+  struct mark5_stream* m5stream = new_mark5_stream (m5file,m5format);
+  if (!m5stream)
+    throw Error (FailedCall, "Mark5bFile::open_file", "failed new_mark5_stream");
+  stream = m5stream;
+  // instruct the loader to only take gulps of samplegranularity samples
+  Input::resolution = m5stream->samplegranularity;
 
   int refmjd = 0;
   if (ascii_header_get (header,"REFMJD","%d",&refmjd) < 0)
@@ -125,12 +123,12 @@
 		 "failed read REFMJD");
 
   cerr << "REFMJD " << refmjd << endl;
-  vlba_stream->mjd += refmjd;
+  m5stream->mjd += refmjd;
 
-  cerr << "MJD = " << vlba_stream->mjd << endl;
-  cerr << "SEC = " << vlba_stream->sec << endl;
+  cerr << "MJD = " << m5stream->mjd << endl;
+  cerr << "SEC = " << m5stream->sec << endl;
 
-  get_info()->set_start_time( MJD(vlba_stream->mjd, vlba_stream->sec, 0) );
+  get_info()->set_start_time( MJD(m5stream->mjd, m5stream->sec, 0) );
 
   // ///////////////////////////////////////////////////////////////
   // TELESCOPE
@@ -213,18 +211,27 @@
 		 "failed read BW");
 
   get_info()->set_bandwidth (bw);
-	
-  // ///////////////////////////////////////////////////////////////
-  // NPOL
-  //	
-  //  -- generalise this later
-	
-  get_info()->set_npol(2);    // read in both polns at once
+
+  double Mega_samples_per_second = m5stream->Mbps / m5stream->nbit;
+
+  double npol = round( (bw * 2) / Mega_samples_per_second );
+  cerr << "NPOL=" << npol << endl;
+  
+  cerr << "NCHAN = " << m5stream->nchan / npol << endl;
+  get_info()->set_nchan( m5stream->nchan / npol ); 
+
+  cerr << "NBIT = " << m5stream->nbit << endl;
+  get_info()->set_nbit ( m5stream->nbit );
+  
+  cerr << "SAMPRATE = " << m5stream->samprate << endl;
+  get_info()->set_rate ( m5stream->samprate );
+
+  get_info()->set_npol(npol);
 
   // ///////////////////////////////////////////////////////////////	
   // NDIM  --- whether the data are Nyquist or Quadrature sampled
   //
-  // VLBA data are Nyquist sampled
+  // MARK5 data are Nyquist sampled
 
   get_info()->set_state (Signal::Nyquist);
 	  
@@ -250,17 +257,12 @@
   get_info()->set_machine("Mark5b");	
 }
 
-extern "C" int next_frame (struct VLBA_stream *vs);
-
-/*! Uses Walter's next_frame to take care of the modbits business, then
- copies the result from the VLBA_stream::frame buffer into the buffer
- argument. */
 int64_t dsp::Mark5bFile::load_bytes (unsigned char* buffer, uint64_t bytes)
 {
   if (verbose) cerr << "Mark5bFile::load_bytes nbytes =" << bytes << endl;
 
   if (verbose) 
-    cerr << "Mark5bFile::load_bytes leave it to VLBA_stream_get_data" << endl;
+    cerr << "Mark5bFile::load_bytes leave it to MARK5_stream_get_data" << endl;
   return bytes;
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bUnpacker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -1,13 +1,14 @@
 /***************************************************************************
  *
- *   Copyright (C) 2015 by Stuart Weston and Willem van Straten
+ *   Copyright (C) 2016 by Willem van Straten
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
 
 #include "dsp/Mark5bUnpacker.h"
 #include "dsp/Mark5bFile.h"
-#include "vlba_stream.h"
+
+#include <mark5access.h>
 
 using namespace std;
 
@@ -43,17 +44,18 @@
     throw Error (InvalidState, "dsp::Mark5bUnpacker::unpack",
 		 "Input is not a Mark5bFile");
 
-  struct VLBA_stream* vlba_stream = (struct VLBA_stream*) file->stream;
+  struct mark5_stream* m5stream = (struct mark5_stream*) file->stream;
 
   float* data [npol * nchan];
 
+  /* Stuart: this is the place in the code where we rearrange channels */
   for (unsigned ipol = 0 ; ipol < npol ; ipol++)
     for (unsigned ichan=0; ichan < nchan; ichan++)
-      data[ipol + 2*ichan] = output->get_datptr(ichan,ipol);
+      data[ipol + npol*ichan] = output->get_datptr(ichan,ipol);
 
-  if (VLBA_stream_get_data (vlba_stream, ndat, data) < 0)
+  if (mark5_stream_decode(m5stream, ndat, data) < 0)
     throw Error (InvalidState, "dsp::Mark5bUnpacker::unpack",
-                 "error VLBA_stream_get_data (most likely EOD)");
+                 "error mark5_stream_decode (most likely EOD)");
 
 }
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/maxim/dsp/MaximFile.h,v $
-   $Revision: 1.3 $
-   $Date: 2008/05/28 21:12:43 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/maxim/dsp/MaximFile.h
 
 #ifndef __MaximFile_h
 #define __MaximFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/maxim/dsp/MaximUnpacker.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:08 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/maxim/dsp/MaximUnpacker.h
 
 
 #ifndef __MaximUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,34 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2013 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __dsp_MOPSRUnpackerCUDA_h
+#define __dsp_MOPSRUnpackerCUDA_h
+
+//#define USE_TEXTURE_MEMORY 
+
+#include <stdint.h>
+#include <cuda_runtime.h>
+
+#include <dsp/MemoryCUDA.h>
+// NOTE(review): no definition of mopsr_texture_alloc is visible in MOPSRUnpackerCUDA.cu - confirm it exists
+void mopsr_texture_alloc (void * d_staging, size_t size);
+// copy the unpacking scale factor to the device constant read by the kernels
+void mopsr_unpack_prepare (cudaStream_t stream, const float scale);
+// host-side kernel launchers; which of mopsr_unpack / mopsr_unpack_fpt exists depends on USE_TEXTURE_MEMORY
+#ifdef USE_TEXTURE_MEMORY
+void mopsr_unpack (cudaStream_t stream, const uint64_t ndat,
+                   const unsigned char* stagingBufGPU,
+                   float* into, cudaTextureObject_t * tex);
+#else
+void mopsr_unpack_fpt (cudaStream_t stream, const uint64_t ndat, const unsigned nchan,
+                       float scale, int8_t const * input, float * output);
+#endif
+void mopsr_unpack_tfp (cudaStream_t stream, const uint64_t ndat, const unsigned nchan,
+                       float scale, int8_t const * input, float * output);
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,92 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2013 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __dsp_MOPSRUnpacker_h
+#define __dsp_MOPSRUnpacker_h
+
+#include "dsp/EightBitUnpacker.h"
+
+namespace dsp {
+  //! Unpacks MOPSR data (8-bit integers or 32-bit floats) into a TimeSeries
+  class MOPSRUnpacker : public HistUnpacker
+  {
+  public:
+
+    //! Constructor
+    MOPSRUnpacker (const char* name = "MOPSRUnpacker");
+    ~MOPSRUnpacker ();
+    //! Output offset of digitizer idig (idig % 2)
+    unsigned get_output_offset (unsigned idig) const;
+    //! Output polarization of digitizer idig (always 0)
+    unsigned get_output_ipol (unsigned idig) const;
+    //! Output channel of digitizer idig (idig / 2)
+    unsigned get_output_ichan (unsigned idig) const;
+    //! Number of dimensions per digitizer (always 1)
+    unsigned get_ndim_per_digitizer () const;
+
+    //! Cloner (calls new)
+    virtual MOPSRUnpacker * clone () const;
+
+    //! Return true if the unpacker can operate on the specified device
+    bool get_device_supported (Memory*) const;
+
+    //! Record the resolution reported by the Input
+     void match_resolution (const Input*);
+
+    //! Set the device on which the unpacker will operate
+    void set_device (Memory*);
+
+  protected:
+    //! 8-bit two's complement lookup table created by the constructor
+    Reference::To<BitTable> table;
+
+    //! Return true if we can convert the Observation
+    bool matches (const Observation* observation);
+    //! Unpack the input BitSeries into the output TimeSeries
+    void unpack ();
+
+    //! Return true if the output order is supported (FPT or TFP)
+    bool get_order_supported (TimeSeries::Order order) const;
+
+    //! Set the order of the dimensions in the output TimeSeries
+    virtual void set_output_order (TimeSeries::Order);
+    //! Holds the packed input copied to the device by unpack_on_gpu
+    BitSeries staging;
+    //! CUDA stream set by set_device, or (void*) -1 when unpacking on the CPU
+    void * gpu_stream;
+    //! Always returns 1024
+    unsigned get_resolution ()const;
+    //! Copy the packed input to the device and launch the unpacking kernel
+    void unpack_on_gpu ();
+
+  private:
+    //! Determine input_order from the ORDER header and check it against nchan
+    void validate_transformation();
+
+    enum DataOrder {
+      //! unknown input order
+      NONE,
+      //! PFB single antenna input
+      TF,
+      //! PFB multi antenna input
+      FT,
+      //! Beam Formed single antenna input
+      T
+    };
+    //! Order of the input data; NONE until validate_transformation has run
+    DataOrder input_order;
+    //! Set to true by set_device
+    bool device_prepared;
+    //! Resolution recorded by match_resolution (1 by default)
+    unsigned input_resolution;
+    //! When non-zero, unpack prints ndat/nchan/ndim/nbit; unpack resets it to 0
+    int debugd;
+
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/Makefile.am	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,19 @@
+# libmopsr is not installed (noinst); only the dsp/ headers listed below are
+noinst_LTLIBRARIES = libmopsr.la
+
+nobase_include_HEADERS =  dsp/MOPSRUnpacker.h
+
+libmopsr_la_SOURCES = MOPSRUnpacker.C
+
+include $(top_srcdir)/config/Makefile.include
+
+if HAVE_CUDA
+# the CUDA header and kernel source are added only when HAVE_CUDA is set
+nobase_include_HEADERS += dsp/MOPSRUnpackerCUDA.h
+libmopsr_la_SOURCES += MOPSRUnpackerCUDA.cu
+
+include $(top_srcdir)/config/Makefile.cuda
+endif
+
+# appended unconditionally; presumably @CUDA_CFLAGS@ is empty without CUDA - confirm
+AM_CPPFLAGS += @CUDA_CFLAGS@
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,620 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2013 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "dsp/MOPSRUnpacker.h"
+#include "dsp/ASCIIObservation.h"
+#include "dsp/BitTable.h"
+
+#include "Error.h"
+
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/MOPSRUnpackerCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
+#include <errno.h>
+#include <string.h>
+
+using namespace std;
+
+static void* const undefined_stream = (void *) -1;
+
+#ifdef _DEBUG
+#define CHECK_ERROR(x) check_error(x)
+#define CHECK_ERROR_STREAM(x,y) check_error_stream(x,y)
+#else
+#define CHECK_ERROR(x)
+#define CHECK_ERROR_STREAM(x,y)
+#endif
+
+#if HAVE_CUDA
+void check_error (const char *);
+void check_error_stream (const char *, cudaStream_t);
+#endif
+
+dsp::MOPSRUnpacker::MOPSRUnpacker (const char* _name) : HistUnpacker (_name)
+{
+  if (verbose)
+  {
+    cerr << "dsp::MOPSRUnpacker ctor" << endl;
+  }
+
+  // neither a device nor an input ordering is known at construction
+  device_prepared = false;
+  gpu_stream = undefined_stream;
+  input_order = NONE;
+  input_resolution = 1;
+  debugd = 0;
+
+  // 8-bit two's complement samples; complex input data, 1 polarisation
+  table = new BitTable (8, BitTable::TwosComplement);
+  set_ndig (2);
+  set_nstate (256);
+}
+//! Destructor (empty body)
+dsp::MOPSRUnpacker::~MOPSRUnpacker ()
+{
+}
+
+/*! The quadrature components must be offset by one: returns idig % 2 */
+unsigned dsp::MOPSRUnpacker::get_output_offset (unsigned idig) const
+{
+  return idig % 2;
+}
+
+/*! Every digitizer maps to polarization 0 */
+unsigned dsp::MOPSRUnpacker::get_output_ipol (unsigned idig) const
+{
+  return 0;
+}
+
+/*! Consecutive pairs of digitizers share a channel: returns idig / 2 */
+unsigned dsp::MOPSRUnpacker::get_output_ichan (unsigned idig) const
+{
+  return idig / 2;
+}
+/*! Returns 1 for every digitizer */
+unsigned dsp::MOPSRUnpacker::get_ndim_per_digitizer () const
+{
+  return 1;
+}
+
+//! Only the TFP and FPT output orders are supported
+bool dsp::MOPSRUnpacker::get_order_supported (TimeSeries::Order order) const
+{
+  return order == TimeSeries::OrderTFP || order == TimeSeries::OrderFPT;
+}
+
+//! Record the requested dimension order and apply it to the output TimeSeries
+void dsp::MOPSRUnpacker::set_output_order (TimeSeries::Order order)
+{
+  const bool is_fpt = (order == TimeSeries::OrderFPT);
+  const bool is_tfp = (order == TimeSeries::OrderTFP);
+
+  if (verbose && is_fpt)
+    cerr << "dsp::MOPSRUnpacker::set_output_order (TimeSeries::OrderFPT)" << endl;
+  else if (verbose && is_tfp)
+    cerr << "dsp::MOPSRUnpacker::set_output_order (TimeSeries::OrderTFP)" << endl;
+  output_order = order;
+  output->set_order (order);
+}
+//! Return a new MOPSRUnpacker copy-constructed from this one
+dsp::MOPSRUnpacker * dsp::MOPSRUnpacker::clone () const
+{
+  return new MOPSRUnpacker (*this);
+}
+
+//! With CUDA, true only when memory is a CUDA::DeviceMemory; without CUDA, always false
+bool dsp::MOPSRUnpacker::get_device_supported (Memory* memory) const
+{
+#if HAVE_CUDA
+  if (verbose)
+    cerr << "dsp::MOPSRUnpacker::get_device_supported HAVE_CUDA" << endl;
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory ) != 0;
+#else
+  return false;
+#endif
+}
+
+//! Set the device: CUDA device memory enables the GPU path (stream, scale and staging memory are set up); otherwise gpu_stream is reset. NOTE(review): with CUDA, Unpacker::set_device is never called - confirm intended
+void dsp::MOPSRUnpacker::set_device (Memory* memory)
+{
+#if HAVE_CUDA
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (gpu_mem)
+  {
+    gpu_stream = (void *) gpu_mem->get_stream();
+    mopsr_unpack_prepare (gpu_mem->get_stream(), (float) table->get_scale());
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::set_device gpu_stream=" << gpu_stream << endl;
+#ifdef USE_TEXTURE_MEMORY
+    CUDA::TextureMemory * texture_mem = new CUDA::TextureMemory (gpu_mem->get_stream());
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::set_device using texture memory ptr=" << texture_mem << endl;
+    texture_mem->set_format_signed(8, 8, 0, 0);
+
+    cerr << "dsp::MOPSRUnpacker::set_device staging.set_memory (texture_mem)" << endl;
+    staging.set_memory( texture_mem );
+#else
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::set_device: using gpu memory" << endl;
+    staging.set_memory ( gpu_mem );
+#endif
+  }
+  else
+  {
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::set_device: using cpu memory" << endl;
+    gpu_stream = undefined_stream;
+  }
+#else
+  Unpacker::set_device (memory);
+#endif
+  device_prepared = true;
+}
+
+//! True for MOPSR observations with a supported state, bit width, ndim and npol
+bool dsp::MOPSRUnpacker::matches (const Observation* observation)
+{
+  const bool analytic = observation->get_state() == Signal::Analytic;
+  const bool intensity = observation->get_state() == Signal::Intensity;
+
+  if (verbose && analytic)
+    cerr << "dsp::MOPSRUnpacker::matches state=Analytic" << endl;
+  else if (verbose && intensity)
+    cerr << "dsp::MOPSRUnpacker::matches state=Intensity" << endl;
+  else if (verbose)
+    cerr << "dsp::MOPSRUnpacker::matches states=" << observation->get_state() << endl;
+
+  return observation->get_machine()== "MOPSR"
+    && (analytic || intensity)
+    && (observation->get_nbit() == 8 || observation->get_nbit() == 32)
+    && (observation->get_ndim() == 2 || observation->get_ndim() == 1)
+    && (observation->get_npol() == 1 || observation->get_npol() == 2);
+}
+
+//! Store the resolution reported by the Input in input_resolution
+void dsp::MOPSRUnpacker::match_resolution (const Input* input)
+{
+  input_resolution = input->get_resolution();
+  if (verbose)
+    cerr << "dsp::MOPSRUnpacker::match_resolution input_resolution=" << input_resolution << endl;
+}
+
+/*! Determine the input order (ORDER header: TF, FT or T; TF is assumed when the header object is not an
+    ASCIIObservation) and throw Error (InvalidState) if the order is unrecognized or incompatible with nchan */
+void dsp::MOPSRUnpacker::validate_transformation ()
+{
+  // see if this unpacker already knows the order of the input data
+  if (input_order == NONE)
+  {
+    const Input * in = input->get_loader();
+    const Observation * obs = in->get_info();
+    const ASCIIObservation * info = dynamic_cast<const ASCIIObservation *>(obs);
+    if (info)
+    {
+      char buffer[8];
+      // the field width bounds the scan to the buffer size (7 characters plus the terminator)
+      if (info->custom_header_get ("ORDER", "%7s", buffer) == 1)
+      {
+        if (strcmp(buffer, "TF") == 0)
+        {
+          input_order = TF;
+        }
+        else if (strcmp(buffer, "FT") == 0)
+        {
+          input_order = FT;
+        }
+        else if (strcmp(buffer, "T") == 0)
+        {
+          cerr << "input order=T" << endl;
+          input_order = T;
+        }
+        else
+          throw Error (InvalidState, "dsp::MOPSRUnpacker::validate_transformation", "unrecognized input order [%s]", buffer);
+      }
+      // when the ORDER keyword is absent, input_order is left as NONE
+    }
+    // have an assumed order when it cannot be determined
+    else
+    {
+      cerr << "dsp::MOPSRUnpacker::validate_transformation could not get ASCIIObservation reference" << endl;
+      input_order = TF;
+    }
+  }
+
+  const unsigned nchan = input->get_nchan();
+  if ((nchan == 1) && (input_order == TF))
+    throw Error (InvalidState, "dsp::MOPSRUnpacker::validate_transformation", "input order not compatible with nchan=%u", nchan);
+  if ((nchan != 1) && (input_order == T))
+    throw Error (InvalidState, "dsp::MOPSRUnpacker::validate_transformation", "input order not compatible with nchan=%u", nchan);
+}
+//! Unpack the input BitSeries into the output TimeSeries, on the GPU when a stream has been set.
+//  8-bit samples are converted via the bit table or a direct cast (and histogrammed in some TF paths);
+//  32-bit floats are copied.  Throws Error (InvalidState) for unsupported order / bit width combinations.
+void dsp::MOPSRUnpacker::unpack ()
+{
+  const unsigned int nbit = input->get_nbit();
+
+  // 32-bit data does not have a digitizer
+  if ((nbit == 32) && (get_ndig() != 0))
+    set_ndig (0);
+
+  // 8-bit data has a digitizer for each channel
+  if ((nbit == 8) && (get_ndig() != input->get_nchan() * input->get_ndim()))
+    set_ndig(input->get_nchan() * input->get_ndim());
+
+#if HAVE_CUDA
+  if (gpu_stream != undefined_stream)
+  {
+    unpack_on_gpu ();
+    return;
+  }
+#endif
+
+  if (input_order == NONE)
+    validate_transformation();
+
+  const unsigned int nchan = input->get_nchan();
+  const unsigned int ndim = input->get_ndim();
+  const unsigned int npol = input->get_npol();
+  const unsigned int ipol = 0;
+  const uint64_t ndat = input->get_ndat();
+
+  // input channel stride - distance between successive (temporal) samples from same channel
+  unsigned int in_chan_stride = nchan * ndim;
+  unsigned int out_chan_stride = ndim;
+  const float* lookup = table->get_values ();
+
+  if (verbose)
+    cerr << "dsp::MOPSRUnpacker::unpack in_chan_stride="<< in_chan_stride << " input_resolution=" << input_resolution << endl;
+
+  if (debugd)
+    cerr << "ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim << " nbit=" << nbit << endl;
+
+  // TF order is produced by the beam-former, TFS produced by the AQ engines
+  if (input_order == TF)
+  {
+    if (output->get_order() == TimeSeries::OrderFPT)
+    {
+      // 32-bit floats are produced by the beam former
+      if (nbit == 32)
+      {
+        for (unsigned ichan=0; ichan<nchan; ichan++)
+        {
+          const unsigned int in_chan_off =  ndim * ichan;
+          const float * from = (float *) (input->get_rawptr()) + in_chan_off;
+          float* into = output->get_datptr (ichan, ipol);
+          for (uint64_t idat=0; idat < ndat; idat++)
+          {
+            for (unsigned idim=0; idim < ndim; idim++)
+              into[idim] = from[idim];
+            from += in_chan_stride;
+            into += out_chan_stride;
+          }
+        }
+      }
+      // 8-bit signed integers produced by the PFBs
+      else if (nbit == 8)
+      {
+        unsigned long * hists[2];
+
+        // transpose from TF order to FT order
+        for (unsigned ichan=0; ichan<nchan; ichan++)
+        {
+          const unsigned int in_chan_off =  ndim * ichan;
+          const int8_t * from = (int8_t *) (input->get_rawptr() + in_chan_off);
+          float* into = output->get_datptr (ichan, ipol);
+
+          for (unsigned idim=0; idim < ndim; idim++)
+            hists[idim] = get_histogram (ndim*ichan+idim);
+
+          for (uint64_t idat=0; idat < ndat; idat++)
+          {
+            for (unsigned idim=0; idim < ndim; idim++)
+            {
+              into[idim] = float ( from[idim] );
+              hists[idim][from[idim]+128]++;
+            }
+            from += in_chan_stride;
+            into += out_chan_stride;
+          }
+        }
+      }
+      else
+      {
+        throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "unsupported unpacking bit width input=TF, output=FPT");
+      }
+    }
+    else if (output->get_order() == TimeSeries::OrderTFP)
+    {
+      // 32-bit floats are produced by the beam former
+      if (nbit == 32)
+      {
+        // direct unpack from TF to TF
+        const float * from = (float *) input->get_rawptr();
+        float * into = output->get_dattfp();
+        const uint64_t nfloat = npol * nchan * ndat * ndim;
+        for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+        {
+          into[ifloat] = from[ifloat];
+        }
+      }
+      // 8-bit input are produced by the PFBs, Ndim == 2
+      else if (nbit == 8 && ndim == 2)
+      {
+        // direct unpack from TF to TF
+        const unsigned char* from = input->get_rawptr();
+        float* into = output->get_dattfp();
+        const uint64_t nfloat = npol * nchan * ndat;
+        unsigned long* hist_re;
+        unsigned long* hist_im;
+
+        for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+        {
+          into[0] = lookup[ from[0] ];
+          into[1] = lookup[ from[1] ];
+
+          unsigned ichan = ifloat % nchan;
+
+          hist_re = get_histogram (2*ichan);
+          hist_im = get_histogram (2*ichan+1);
+
+          int bin_re = int8_t(from[0]) + 128;
+          int bin_im = int8_t(from[1]) + 128;
+
+          hist_re[bin_re]++;
+          hist_im[bin_im]++;
+
+          into += 2;
+          from += 2;
+        }
+      }
+      else if (nbit == 8 && ndim == 1)
+      {
+        // direct unpack from TF to TF
+        const unsigned char* from = input->get_rawptr();
+        float* into = output->get_dattfp();
+        const uint64_t nfloat = npol * nchan * ndat;
+        unsigned long* hist = get_histogram (0);
+
+        for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+        {
+          into[ifloat] = lookup[ from[ifloat] ];
+        }
+      }
+      else
+      {
+        throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "unsupported unpacking bit width input=TF, output=TFP");
+      }
+    }
+    else
+    {
+      throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "output order not suitable for input == TF");
+    }
+  }
+  else if (input_order == FT)
+  {
+    if (output->get_order() == TimeSeries::OrderFPT)
+    {
+      if (nbit == 32)
+      {
+        const unsigned nfloat = input_resolution * ndim;
+        cerr << "dsp::MOPSRUnpacker::unpack ndat="<<ndat << " nfloat=" << nfloat <<
+                " nchan=" << nchan << " output FPT input_resolution=" << input_resolution << endl;
+        float * from = (float *) input->get_rawptr();
+        unsigned nblock = ndat / input_resolution;
+        cerr << "dsp::MOPSRUnpacker::unpack nblock=" << nblock << endl;
+        if (ndat % input_resolution != 0)
+          cerr << "input block size error" << endl;
+
+        for (unsigned iblock=0; iblock<nblock; iblock++)
+        {
+          for (unsigned ichan=0; ichan<nchan; ichan++)
+          {
+            float* into = output->get_datptr (ichan, 0) + iblock * nfloat;  // nfloat floats per block and channel
+            for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+              into[ifloat] = from[ifloat];
+            from += nfloat;
+          }
+        }
+      }
+      else
+      {
+        const unsigned nval = ndat * ndim;
+        int8_t * from = (int8_t *) input->get_rawptr();
+        for (unsigned ichan=0; ichan<nchan; ichan++)
+        {
+          unsigned long* hist_re = get_histogram (2*ichan);
+          unsigned long* hist_im = get_histogram (2*ichan+1);
+          float* into = output->get_datptr (ichan, 0);
+          for (uint64_t ival=0; ival < nval; ival++)
+            into[ival] = float (from[ival]);
+          from += nval;
+        }
+      }
+    }
+    else if (output->get_order() == TimeSeries::OrderTFP)
+    {
+      if (nbit == 32)
+      {
+        // transpose from FT to TF
+        const unsigned nchandim = nchan * ndim;
+        float * from = (float *) input->get_rawptr();
+        float * into = output->get_dattfp ();
+        cerr << "dsp::MOPSRUnpacker::unpack ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim << " output TFP" << endl;
+        for (unsigned ichan=0; ichan<nchan; ichan++)
+        {
+          for (uint64_t idat=0; idat < ndat; idat++)
+          {
+            for (unsigned idim=0; idim<ndim; idim++)
+            {
+              into[idat*nchandim + ichan*ndim + idim] = from[ndim*idat + idim];
+            }
+          }
+          from += ndat * ndim;
+        }
+      }
+      else
+        throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "unsupported unpacking bit width input=FT, output=TFP");
+    }
+    else
+    {
+      throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "output order not suitable for input == FT");
+    }
+  }
+  // input data is a single time series of complex value samples
+  else if (input_order == T)
+  {
+    if (nchan != 1)
+      throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "input order == T, but nchan=%u", nchan);
+
+    // simple unpack of single chan/ant time series to either FPT or TFP
+    const int8_t * from8 = (int8_t *) input->get_rawptr();
+    const float * from32 = (float *) input->get_rawptr();
+    float * into;
+
+    if (output->get_order() == TimeSeries::OrderFPT)
+      into = output->get_datptr (0, 0);
+    else if (output->get_order() == TimeSeries::OrderTFP)
+      into = output->get_dattfp();
+    else
+      throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "output order not suitable for input == T");
+
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::unpack ndim=" << ndim << endl;
+    unsigned long* hist_re = get_histogram (0);
+    if (ndim > 1)
+      unsigned long* hist_im = get_histogram (1);
+    const uint64_t nfloat = ndat * ndim;
+
+    if (nbit == 32)
+      for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+        into[ifloat] = from32[ifloat];
+    else
+      for (uint64_t ifloat=0; ifloat < nfloat; ifloat++)
+        into[ifloat] = lookup[ (unsigned char) from8[ifloat] ];  // index by raw byte: a signed index would be negative
+  }
+  debugd = 0;
+}
+//! Always returns 1024
+unsigned dsp::MOPSRUnpacker::get_resolution () const { return 1024; }
+
+#if HAVE_CUDA
+//! Copy the packed input into the staging buffer, then launch the FPT or TFP unpacking kernel on gpu_stream
+void dsp::MOPSRUnpacker::unpack_on_gpu ()
+{
+  const uint64_t ndat  = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim  = input->get_ndim();
+  const unsigned npol  = input->get_npol();
+  // NOTE(review): used below as a byte count, i.e. one byte per value; matches() also accepts nbit == 32 - confirm
+  const uint64_t to_copy = ndat * nchan * ndim * npol;
+
+  staging.Observation::operator=( *input );
+  staging.resize(ndat);
+
+  // staging buffer on the GPU for packed data
+  int8_t * d_staging = (int8_t *) staging.get_rawptr();
+  const unsigned char * from = input->get_rawptr();
+  float * into;
+
+  if (ndat == 0)
+  {
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::unpack_on_gpu ndat == 0" << endl;
+    return;
+  }
+
+  switch ( output->get_order() )
+  {
+    case TimeSeries::OrderFPT:
+    {
+      into = output->get_datptr(0,0);
+      break;
+    }
+
+    case TimeSeries::OrderTFP:
+    {
+      into = output->get_dattfp();
+      break;
+    }
+
+    default:
+    {
+      throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack_on_gpu", "unrecognized order");
+    }
+    break;
+  }
+
+  cudaStream_t stream = (cudaStream_t) gpu_stream;
+  if (verbose)
+    cerr << "dsp::MOPSRUnpacker::unpack_on_gpu stream=" << stream << endl;
+
+  cudaError error;
+  if (stream)
+  {
+    error = cudaMemcpyAsync (d_staging, from, to_copy, cudaMemcpyHostToDevice, stream);
+    CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu cudaMemcpyAsync", stream);
+  }
+  else
+  {
+    error = cudaMemcpy (d_staging, from, to_copy, cudaMemcpyHostToDevice);
+    CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu cudaMemcpy");
+  }
+  // the CHECK_ERROR* macros expand to nothing unless _DEBUG is defined; the copy status is tested below
+
+#ifdef USE_TEXTURE_MEMORY
+  if (verbose)
+    cerr << "dsp::MOPSRUnpacker::unpack_on_gpu binding TextureMemory" << endl;
+  CUDA::TextureMemory * gpu_mem = dynamic_cast< CUDA::TextureMemory*>( staging.get_memory() );
+  cerr << "dsp::MOPSRUnpacker::unpack_on_gpu textureMemory stream=" << stream << " gpu_mem->get_tex()= " << gpu_mem->get_tex() << endl;
+#endif
+
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "MOPSRUnpacker::unpack_on_gpu",
+                 "cudaMemcpy%s %s", stream?"Async":"", 
+                 cudaGetErrorString (error));
+
+#ifdef USE_TEXTURE_MEMORY
+  mopsr_unpack (stream, ndat, d_staging, into, gpu_mem->get_tex());
+#else
+  if (output->get_order() == TimeSeries::OrderFPT)
+  {
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt ndat=" << ndat 
+           << " d_staging=" << (void *) d_staging << " into=" << (void *) into << endl;
+    mopsr_unpack_fpt (stream, ndat, nchan, table->get_scale(), d_staging, into);
+    if (dsp::Operation::record_time || dsp::Operation::verbose)
+      if (stream)
+        CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt", stream);
+      else
+        CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt");
+  }
+  else if (output->get_order() == TimeSeries::OrderTFP)
+  {
+    if (verbose)
+      cerr << "dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp ndat=" << ndat << endl;
+    mopsr_unpack_tfp (stream, ndat, nchan, table->get_scale(), d_staging, into);
+    if (dsp::Operation::record_time || dsp::Operation::verbose)
+      if (stream)
+        CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp", stream);
+      else
+        CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp");
+  }
+  else
+    throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack_on_gpu", "unrecognized order");
+#endif
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,219 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2013 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include <stdio.h>
+
+#include "dsp/MOPSRUnpackerCUDA.h"
+#include "dsp/Operation.h"
+#include "debug.h"
+#include "Error.h"
+
+// threads per block - C1060=256 [TODO CHECK below if changing]
+#define __MOPSR_UNPACK_TPB 1024
+
+#define __MOPSR_SAMPLES_PER_THREAD 4
+
+#define WARP_SIZE 32
+
+// global static texture declaration for MOPSR gpu unpacker
+//texture<char2, cudaTextureType1D, cudaReadModeNormalizedFloat> mopsr_tex1dfloat2;
+
+// real texture version
+//texture<int8_t, cudaTextureType1D, cudaReadModeElementType> mopsr_tex1dfloat;
+
+using namespace std;
+
+//void check_error (const char*);
+void check_error_stream (const char*, cudaStream_t);
+
+__device__ __constant__ float mopsr_unpacker_scale;
+
+#ifdef USE_TEXTURE_MEMORY
+__global__ void mopsr_unpack_complex_1 (float2 * output, cudaTextureObject_t tex)
+{
+  const int ielement = threadIdx.x + blockDim.x*blockIdx.x;
+  output[ielement] = tex1Dfetch<float2>(tex, ielement);
+}
+#else
+// Unpack TF-ordered complex 8-bit samples into floats with one contiguous run per channel.
+// Each thread stages one complex sample in shared memory (zero beyond nval), then converts
+// the sample for its (ichan, isamp) pair and writes it at ichan * chan_stride.
+__global__ void mopsr_unpack_fpt_complex_1 (const int8_t * stagingBufGPU,
+                                            float * output,
+                                            const unsigned nval,
+                                            const unsigned nchan, 
+                                            const unsigned nsamp_per_block,
+                                            const unsigned chan_stride)
+{
+  extern __shared__ int8_t sdata[];
+  const unsigned ndim = 2;
+
+  // input index
+  const unsigned idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  const unsigned in_idx = idx * ndim;
+
+  // shared memory index
+  const unsigned sin_idx = threadIdx.x * ndim;
+
+  // pad the tail of the final block with zeros
+  if (idx >= nval)
+  {
+    sdata[sin_idx] = 0;
+    sdata[sin_idx+1] = 0;
+  }
+  else
+  {
+    sdata[sin_idx]   = stagingBufGPU[in_idx];
+    sdata[sin_idx+1] = stagingBufGPU[in_idx+1];
+  }
+
+  // synchronize all threads in the block
+  __syncthreads(); 
+
+  // sdata now holds this block's blockDim.x consecutive (complex) TF samples as int8_t
+
+  // determine the output index for this thread
+  const unsigned ichan    = threadIdx.x / nsamp_per_block;
+  const unsigned isamp    = threadIdx.x % nsamp_per_block;
+
+  // determine which shared memory index for this output ichan and isamp
+  const unsigned sout_idx = ((isamp * nchan) + ichan) * ndim;
+
+  // convert to float
+  float re = (float) sdata[sout_idx];
+  float im = (float) sdata[sout_idx+1];
+
+  // + 0.5 since scale is -128 to 127
+  re += 0.5;
+  im += 0.5;
+
+  // optimal scaling from bit table
+  re *= mopsr_unpacker_scale;
+  im *= mopsr_unpacker_scale;
+
+  // finally determine the output index for this thread
+  const unsigned ou_idx = (ichan * chan_stride) + (blockIdx.x * nsamp_per_block * ndim) + (isamp * ndim);
+
+  // only time samples that exist are written: padded tail threads would land in the next channel or past the end
+  if ((blockIdx.x * nsamp_per_block + isamp) * nchan < nval)
+  {
+#if _KDEBUG
+    if (blockIdx.x == 0)
+      printf ("threadIdx.x=%d sin_idx=%d, ichan=%d, isamp=%d, sout_idx=%d, ou_idx=%d\n", threadIdx.x, sin_idx, ichan, isamp, sout_idx, ou_idx);
+#endif
+
+    output[ou_idx]   = re;
+    output[ou_idx+1] = im;
+  }
+
+#if _KDEBUG
+  if (blockIdx.x ==0 && threadIdx.x == 0)
+    printf ("=========================\n");
+#endif
+}
+#endif
+// Convert one complex 8-bit sample per thread to a scaled float2; blockIdx.y selects the channel
+__global__ void mopsr_unpack_tfp_complex_1 (const int8_t * stagingBufGPU,
+                                            float2* output,
+                                            const unsigned nchan)
+{
+  const unsigned isamp = blockIdx.x * blockDim.x + threadIdx.x;
+  const unsigned ichan = blockIdx.y;
+  const unsigned ndim = 2;
+
+  // input and output will be in TFP order
+  const int8_t* from = reinterpret_cast<const int8_t*>( stagingBufGPU ) + (isamp * nchan * ndim) + (ichan * ndim);
+
+  // per-thread value: a __shared__ float2 here would be written by every thread in the block at once
+  float2 out;
+
+  out.x = (float) from[0];
+  out.y = (float) from[1];
+
+  // + 0.5 since scale is -128 to 127
+  out.x += 0.5;
+  out.y += 0.5;
+
+  // optimal scaling from bit table
+  out.x *= mopsr_unpacker_scale;
+  out.y *= mopsr_unpacker_scale;
+
+  output[(isamp * nchan) + ichan] = out;
+}
+// Copy scale to the device constant mopsr_unpacker_scale on the given stream; throws Error (FailedCall) on failure
+void mopsr_unpack_prepare (cudaStream_t stream, const float scale)
+{
+  cudaError_t error = cudaMemcpyToSymbolAsync ( mopsr_unpacker_scale, &scale, sizeof(scale), 0, cudaMemcpyHostToDevice, stream);
+  if (error != cudaSuccess) throw Error (FailedCall, "mopsr_unpack_prepare", "cudaMemcpyToSymbolAsync %s", cudaGetErrorString (error));
+}
+// Launch mopsr_unpack_tfp_complex_1 on stream with a (ndat / TPB) x nchan grid; the scale argument is not used here
+void mopsr_unpack_tfp (cudaStream_t stream, const uint64_t ndat, const unsigned nchan,
+                       float scale, int8_t const * input, float * output)
+{
+  int nthread = __MOPSR_UNPACK_TPB;
+  int nblocks = ndat / nthread;
+  // NOTE(review): integer division - samples beyond the last full group of nthread get no thread; confirm ndat is always a multiple
+  // each thread will unpack 1 complex time sample from 1 channel
+  dim3 blocks (nblocks, nchan);
+
+  float2 * complex_output = (float2 *) output;
+
+  mopsr_unpack_tfp_complex_1<<<blocks,nthread,0,stream>>>(input, complex_output, nchan);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("mopsr_unpack_tfp", stream);
+}
+// Launch the unpacking kernel with a thread count that is a multiple of nchan, then synchronize the stream
+#ifdef USE_TEXTURE_MEMORY
+void mopsr_unpack (cudaStream_t stream, const uint64_t ndat,
+                   unsigned char const* input, float * output,
+                   cudaTextureObject_t * tex)
+#else
+void mopsr_unpack_fpt (cudaStream_t stream, const uint64_t ndat, const unsigned nchan,
+                       float scale, int8_t const * input, float * output)
+#endif
+{
+  const unsigned npol = 1;
+  const unsigned ndim = 2;
+  const unsigned nval = ndat * nchan;
+
+  // we want the number of threads to be a multiple of nchan
+  int nthread = (__MOPSR_UNPACK_TPB / nchan) * nchan;
+  int nblocks = nval / nthread;
+  if (nval % nthread)
+    nblocks++;
+  // NOTE(review): nthread is 0 when nchan > __MOPSR_UNPACK_TPB, so the divisions above would divide by zero
+  // each thread will unpack 1 complex time sample from 1 channel
+  size_t sdata_bytes = nthread * ndim;
+  const unsigned nsamp_per_block = nthread/nchan;
+  const unsigned chan_stride = ndat * npol * ndim;
+
+  if (dsp::Operation::verbose)
+    cerr << "mopsr_unpack_fpt nval=" << nval << " ndat=" << ndat << " nchan=" << nchan
+         << " input=" << (void*) input << " output=" << (void *) output 
+         << " nblocks=" << nblocks << " nthread=" << nthread
+         << " sdata_bytes=" << sdata_bytes << " nsamp_per_block=" << nsamp_per_block
+         << " chan_stride=" << chan_stride << endl;
+
+#ifdef  USE_TEXTURE_MEMORY
+  //mopsr_unpack_complex_1<<<nblock,nthread,0,stream>>>(complex_output, *tex);
+#else
+  mopsr_unpack_fpt_complex_1<<<nblocks,nthread,sdata_bytes,stream>>>(input, output, nval, nchan, nsamp_per_block, chan_stride);
+#endif
+
+  // AJ's theory...
+  // If there are no stream synchronizes on the input then the CPU pinned memory load from the
+  // input class might be able to get ahead of a whole sequence of GPU operations, and even exceed
+  // one I/O loop. Therefore it should be a requirement to have a stream synchronize some time
+  // after the data are loaded from pinned memory to GPU ram and before the next Input copy to pinned memory
+
+  // put it here for now
+  cudaStreamSynchronize(stream);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("mopsr_unpack_fpt", stream);
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/dsp/EDAFourBit.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/dsp/EDAFourBit.h
--- bl-dspsr-0+git20160405/Kernel/Formats/mwa/dsp/EDAFourBit.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/dsp/EDAFourBit.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,36 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2011 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __EDAFourBit_h
+#define __EDAFourBit_h
+
+#include "dsp/FourBitUnpacker.h"
+
+namespace dsp
+{
+  //! Converts single-dish EDA data from 4-bit to floating point values
+  class EDAFourBit: public FourBitUnpacker
+  {
+  public:
+
+    //! Constructor installs a 4-bit offset-binary lookup table
+    EDAFourBit ();
+
+    //! Return true for 4-bit, two-polarization, real-valued data from machine "EDA"
+    bool matches (const Observation*);
+
+    //! Over-ride BitUnpacker::unpack: each input byte yields one sample per polarization
+    void unpack ();
+
+    //! Over-ride FourBitUnpacker::get_histogram: 16-state histogram of digitizer idig (0 or 1)
+    void get_histogram (std::vector<unsigned long>&, unsigned idig) const;
+
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/EDAFourBit.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/EDAFourBit.C
--- bl-dspsr-0+git20160405/Kernel/Formats/mwa/EDAFourBit.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/EDAFourBit.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,94 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2017 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/EDAFourBit.h"
+#include "dsp/BitTable.h"
+
+#include <assert.h>
+#include <iostream>
+using namespace std;
+
+// Install a 4-bit offset-binary lookup table, ordered least-to-most significant
+dsp::EDAFourBit::EDAFourBit () : FourBitUnpacker ("EDAFourBit")
+{
+  BitTable* lut = new BitTable (4, BitTable::OffsetBinary);
+  lut->set_order (BitTable::LeastToMost);
+  set_table (lut);
+}
+
+// Accept only 4-bit, two-polarization, real-valued (ndim == 1) data from machine "EDA"
+bool dsp::EDAFourBit::matches (const Observation* observation)
+{
+  if (verbose)
+    cerr << "dsp::EDAFourBit::matches"
+      " machine=" << observation->get_machine() <<
+      " nbit=" << observation->get_nbit() << endl;
+  return observation->get_machine() == "EDA"
+    && observation->get_nbit() == 4
+    && observation->get_npol() == 2
+    && observation->get_ndim() == 1;
+}
+
+// Unpack one byte per time sample into one float for each of the two polarizations
+void dsp::EDAFourBit::unpack ()
+{
+  if (verbose)
+    cerr << "dsp::EDAFourBit::unpack" << endl;
+  const uint64_t ndat  = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+  const unsigned ndim  = input->get_ndim();
+
+  assert (nchan == 1);
+  assert (npol == 2);
+  assert (ndim == 1);
+  // each input byte selects a pair of consecutive lookup values, one per polarization
+  const unsigned char* from = input->get_rawptr();
+
+  float* pol0 = output->get_datptr (0,0);
+  float* pol1 = output->get_datptr (0,1);
+
+  unsigned long* hist = BitUnpacker::get_histogram (0);
+
+  const float* lookup = table->get_values ();
+
+  for (uint64_t idat = 0; idat < ndat; idat++)
+  {
+    pol0[idat] = lookup[ from[idat] * 2 ];
+    pol1[idat] = lookup[ from[idat] * 2 + 1 ];
+    // count occurrences of each raw byte value (both 4-bit samples together)
+    hist[ from[idat] ] ++;
+  }
+}
+
+// Collapse the 256-bin histogram of raw bytes into the 16-bin histogram of a single
+// digitizer: idig 0 is taken from the low nibble, any other idig from the high nibble
+void dsp::EDAFourBit::get_histogram (std::vector<unsigned long>& hist,
+				     unsigned idig) const
+{
+  assert( get_nstate() == 16 );
+  assert( get_nstate_internal() == 256 );
+  assert( get_ndig() == 2 );
+  assert( idig < 2 );
+
+  // start from an all-zero histogram with one bin per output state
+  hist.assign( get_nstate(), 0UL );
+
+  // which 4 bits of the raw byte value are counted for this digitizer
+  const unsigned nibble_mask = 0x0f;
+  const unsigned nibble_shift = (idig == 0) ? 0 : 4;
+
+  const unsigned long* byte_counts = HistUnpacker::get_histogram (0);
+
+  for (unsigned ibyte=0; ibyte<get_nstate_internal(); ibyte++)
+  {
+    // every byte value contributes its count to the state held in the chosen nibble
+    const unsigned state = (ibyte >> nibble_shift) & nibble_mask;
+
+    hist[state] += byte_counts[ibyte];
+  }
+}
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/mwa/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -1,9 +1,9 @@
 
 noinst_LTLIBRARIES = libmwa.la
 
-nobase_include_HEADERS =  dsp/MWAFile.h
+nobase_include_HEADERS = dsp/MWAFile.h dsp/EDAFourBit.h
 
-libmwa_la_SOURCES =  MWAFile.C
+libmwa_la_SOURCES = MWAFile.C EDAFourBit.C
 
 #############################################################################
 #
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma/dsp/PuMaFile.h,v $
-   $Revision: 1.7 $
-   $Date: 2008/05/28 21:12:43 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/puma/dsp/PuMaFile.h
 
 
 #ifndef __dsp_PuMaFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:08 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h
 
 #ifndef __PuMaTwoBitCorrection_h
 #define __PuMaTwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2File.h,v $
-   $Revision: 1.4 $
-   $Date: 2008/05/28 21:12:43 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/puma2/dsp/PuMa2File.h
 
 
 #ifndef __PuMa2File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2_Observation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2_Observation.h
--- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2_Observation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2_Observation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2_Observation.h,v $
-   $Revision: 1.3 $
-   $Date: 2007/11/14 03:11:02 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/puma2/dsp/PuMa2_Observation.h
 
 #ifndef __PuMa2_Observation_h
 #define __PuMa2_Observation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h,v $
-   $Revision: 1.4 $
-   $Date: 2008/03/12 14:07:48 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h
 
 #ifndef __PuMa2Unpacker_h
 #define __PuMa2Unpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2File.h,v $
-   $Revision: 1.16 $
-   $Date: 2008/05/28 21:12:43 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/s2/dsp/S2File.h
 
 
 #ifndef __S2File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h
--- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h,v $
-   $Revision: 1.11 $
-   $Date: 2008/05/29 07:34:58 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h
 
 #ifndef __S2TwoBitCorrection_h
 #define __S2TwoBitCorrection_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitTable.h
--- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitTable.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitTable.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2TwoBitTable.h,v $
-   $Revision: 1.4 $
-   $Date: 2006/07/09 13:27:09 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/s2/dsp/S2TwoBitTable.h
 
 
 #ifndef __S2TwoBitTable_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.c
--- bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.c	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.c	2018-03-12 23:02:35.000000000 +0000
@@ -356,10 +356,9 @@
 {
   int i;
   
+  memset(header, ' ', sizeof(*header));
   header->hdr_size = 0;
   header->hdr_drate = 0;
-  sprintf (header->hdr_time, "%-*.*s", (TCI_HEADER_BASE_SIZE-6),
-	   (TCI_HEADER_BASE_SIZE-6), " ");
   header->hdr_time[0] = '\0';
   header->hdr_s2mode[0] = '\0';
   header->hdr_tapeid[0] = '\0';
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.h
--- bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.h	2018-03-12 23:02:35.000000000 +0000
@@ -7,7 +7,7 @@
 
 /*
 $Id: tci_file.h,v 1.3 2009/05/04 23:17:13 straten Exp $
-$Log: tci_file.h,v $
+$Log: tci_file.h
 Revision 1.3  2009/05/04 23:17:13  straten
 verbosity mods
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h,v $
-   $Revision: 1.3 $
-   $Date: 2011/07/19 14:59:41 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h
 
 #ifndef __SigProcDigitizer_h
 #define __SigProcDigitizer_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcFile.h,v $
-   $Revision: 1.1 $
-   $Date: 2008/10/31 05:59:55 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/sigproc/dsp/SigProcFile.h
 
 
 #ifndef __SigProcFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcObservation.h
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcObservation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcObservation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcObservation.h,v $
-   $Revision: 1.2 $
-   $Date: 2008/10/31 06:00:50 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/sigproc/dsp/SigProcObservation.h
 
 #ifndef __SigProcObservation_h
 #define __SigProcObservation_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/09/19 01:56:42 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h
 
 
 #ifndef __SigProcOutputFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h,v $
-   $Revision: 1.1 $
-   $Date: 2010/05/04 15:30:40 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h
 
 #ifndef __SigProcUnpacker_h
 #define __SigProcUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/send_stuff.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/send_stuff.c
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/send_stuff.c	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/send_stuff.c	2018-03-12 23:02:35.000000000 +0000
@@ -2,8 +2,8 @@
 #include <string.h>
 
 #include "sigproc.h"
-FILE *input, *output;
-int swapout;
+extern FILE *input, *output;
+extern int swapout;
 void send_string(char *string) /* includefile */
 {
   int len;
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/SigProcObservation.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/SigProcObservation.C
--- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/SigProcObservation.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/SigProcObservation.C	2018-03-12 23:02:35.000000000 +0000
@@ -90,6 +90,8 @@
       return "Effelsberg";
     case 11:
       return "LOFAR";
+    case 12:
+      return "VLA";
     default:
       return "unknown";
       break;
@@ -124,6 +126,7 @@
     else if (itoa == "GM") return 7;
     else if (itoa == "EF") return 8;
     else if (itoa == "LF") return 11;
+    else if (itoa == "VL") return 12;
     else return 0;
   }
   catch (Error &error)
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h
--- bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,57 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2014 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_SKA1Unpacker_h
+#define __baseband_cuda_SKA1Unpacker_h
+
+#include <cuda_runtime.h>
+
+//#include "dsp/SKA1Unpacker.h"
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+namespace CUDA
+{
+  class SKA1UnpackerEngine : public dsp::SKA1Unpacker::Engine
+  {
+  public:
+
+    //! Construct an engine that issues its work on the given CUDA stream
+    SKA1UnpackerEngine (cudaStream_t stream);
+    //! Query the properties of the current CUDA device
+    void setup ();
+    //! Return true if memory is CUDA device memory
+    bool get_device_supported (dsp::Memory* memory) const;
+    //! Hand the CUDA device memory manager to the staging buffer
+    void set_device (dsp::Memory* memory);
+    //! Copy input to the device staging buffer and unpack it into output
+    void unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output);
+
+  protected:
+    //! Stream on which copies and kernels are queued
+    cudaStream_t stream;
+    //! Device properties, filled in by setup
+    struct cudaDeviceProp gpu;
+    //! Device-side copy of the input BitSeries
+    dsp::BitSeries staging;
+
+  };
+}
+#else
+
+#include <inttypes.h>
+
+void ska1_unpack_tfp (cudaStream_t stream, uint64_t nval, float scale,
+                      float * into, void * staged,
+                      unsigned  nchan, unsigned npol, unsigned ndim,
+                      size_t pol_span);
+
+void ska1_unpack_fpt (cudaStream_t stream, uint64_t ndat, float scale,
+                      float * into, void * staged, unsigned  nchan,
+                      size_t pol_span);
+#endif
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1Unpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1Unpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1Unpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,88 @@
+/*
+
+ */
+
+#ifndef __dsp_SKA1Unpacker_h
+#define __dsp_SKA1Unpacker_h
+
+//#define SKA1_ENGINE_IMPLEMENTATION
+
+#include "dsp/EightBitUnpacker.h"
+#include "ThreadContext.h"
+
+namespace dsp {
+  
+  class SKA1Unpacker : public HistUnpacker
+  {
+  public:
+
+    //! Constructor
+    SKA1Unpacker (const char* name = "SKA1Unpacker");
+
+    //! Destructor
+    ~SKA1Unpacker ();
+
+    //! Cloner (calls new)
+    virtual SKA1Unpacker * clone () const;
+
+    //! Return true if the unpacker can operate on the specified device
+    bool get_device_supported (Memory*) const;
+
+    //! Set the device on which the unpacker will operate
+    void set_device (Memory*);
+
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+    //! Engine used to perform the unpacking (e.g. on a GPU)
+    class Engine;
+    void set_engine (Engine*);
+#else
+    void unpack_on_gpu (); //!< Unpack through ska1_unpack_fpt using gpu_stream
+#endif
+
+  protected:
+    
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
+#else
+    void * gpu_stream; //!< CUDA stream stored as void*, or (void*) -1 when no GPU is in use
+#endif
+    //! Lookup table built by the constructor for 8-bit two's complement input
+    Reference::To<BitTable> table;
+
+    //! Return true if we can convert the Observation
+    bool matches (const Observation* observation);
+    //! Unpack input into output, on the GPU or engine when one has been configured
+    void unpack ();
+
+    //BitSeries staging;
+    //unsigned get_resolution () const ;
+
+  private:
+    //! Dimensions per sample; the constructor sets 2
+    unsigned ndim;
+    //! Number of polarizations; the constructor sets 2
+    unsigned npol;
+    //! Set to true at the end of set_device
+    bool device_prepared;
+
+  };
+
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+
+  class SKA1Unpacker::Engine : public Reference::Able
+  {
+  public:
+    //! Unpack input into output using the given scale factor
+    virtual void unpack(float scale, const BitSeries * input, TimeSeries * output) = 0;
+    //! Return true if the engine can operate on the specified memory
+    virtual bool get_device_supported (Memory* memory) const = 0;
+    //! Set the memory on which the engine will operate
+    virtual void set_device (Memory* memory) = 0;
+  };
+
+#endif
+
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/ska1/Makefile.am	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,21 @@
+
+noinst_LTLIBRARIES = libska1.la
+
+nobase_include_HEADERS =  dsp/SKA1Unpacker.h
+
+libska1_la_SOURCES = SKA1Unpacker.C
+
+if HAVE_CUDA
+
+nobase_include_HEADERS += dsp/SKA1UnpackerCUDA.h
+libska1_la_SOURCES += SKA1UnpackerCUDA.cu
+
+endif
+
+#############################################################################
+#
+
+include $(top_srcdir)/config/Makefile.include
+include $(top_srcdir)/config/Makefile.cuda
+
+AM_CPPFLAGS += @CUDA_CFLAGS@
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1Unpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1Unpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1Unpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1Unpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,195 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2014
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "dsp/SKA1Unpacker.h"
+#include "dsp/BitTable.h"
+
+#include "Error.h"
+
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/SKA1UnpackerCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
+#include <errno.h>
+
+using namespace std;
+
+static void* const undefined_stream = (void *) -1;
+
+dsp::SKA1Unpacker::SKA1Unpacker (const char* _name) : HistUnpacker (_name), device_prepared (false)
+{ // device_prepared starts false: unpack() then calls set_device() itself if the caller never did
+  if (verbose)
+    cerr << "dsp::SKA1Unpacker ctor" << endl;
+  set_nstate (256);
+  table = new BitTable (8, BitTable::TwosComplement);
+  npol = ndim = 2;
+#ifndef SKA1_ENGINE_IMPLEMENTATION
+  gpu_stream = undefined_stream;  // unpack() compares against this sentinel before set_device() may run
+#endif
+}
+
+dsp::SKA1Unpacker::~SKA1Unpacker ()
+{ // nothing is released explicitly here
+}
+
+dsp::SKA1Unpacker * dsp::SKA1Unpacker::clone () const
+{ // returns a heap-allocated copy made with the copy constructor
+  return new SKA1Unpacker (*this);
+}
+
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+void dsp::SKA1Unpacker::set_engine (Engine* _engine)
+{ // install the engine to which unpack, set_device and get_device_supported delegate
+  engine = _engine;
+}
+#endif
+
+//! Return true if the unpacker can operate on the specified device
+bool dsp::SKA1Unpacker::get_device_supported (Memory* memory) const
+{
+  if (verbose)
+    cerr << "dsp::SKA1Unpacker::get_device_supported" << endl;
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+  if (engine)
+    return engine->get_device_supported (memory); // the engine decides
+  else
+    return false; // no engine installed
+#else
+#if HAVE_CUDA
+  if (verbose)
+    cerr << "dsp::SKA1Unpacker::get_device_supported HAVE_CUDA" << endl;
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory ); // true only when memory is CUDA device memory
+#else
+  return false; // built without CUDA
+#endif // HAVE_CUDA
+
+#endif // SKA1_ENGINE_IMPLEMENTATION
+}
+
+//! Set the device on which the unpacker will operate
+void dsp::SKA1Unpacker::set_device (Memory* memory)
+{
+  if (verbose)
+    cerr << "dsp::SKA1Unpacker::set_device" << endl;
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+  if (engine)
+    engine->set_device(memory);
+  else
+    Unpacker::set_device (memory);
+#else
+#if HAVE_CUDA // NOTE(review): this branch never calls Unpacker::set_device - confirm that is intended
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (gpu_mem)
+  {
+    //cerr << "dsp::SKA1Unpacker::set_device activating GPU" << endl;
+    gpu_stream = (void *) gpu_mem->get_stream(); // unpack() then dispatches to unpack_on_gpu()
+    //staging.set_memory( gpu_mem );
+  }
+  else
+    gpu_stream = undefined_stream; // not device memory: unpack() stays on the host path
+#else
+  Unpacker::set_device (memory);
+#endif // HAVE_CUDA
+#endif // SKA1_ENGINE_IMPLEMENTATION
+  device_prepared = true; // lets unpack() skip its own set_device() call
+}
+
+bool dsp::SKA1Unpacker::matches (const Observation* observation)
+{
+  // accept only machine "SKA1" with ndim == 2, npol == 2 and 8 bits per value
+  const bool is_ska1 = observation->get_machine()== "SKA1";
+  return is_ska1 && observation->get_ndim() == 2
+    && observation->get_npol() == 2 && observation->get_nbit() == 8;
+}
+
+void dsp::SKA1Unpacker::unpack ()
+{ // convert the 8-bit input BitSeries into the float output TimeSeries
+  if (verbose)
+    cerr << "dsp::SKA1Unpacker::unpack()" << endl;
+  // hand over to the engine or the GPU path when one has been configured
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+  if (engine)
+  {
+    if (verbose)
+      cerr << "dsp::SKA1Unpacker::unpack using Engine" << endl;
+    engine->unpack(table->get_scale(), input, output);
+    return;
+  }
+#else
+#if HAVE_CUDA
+  if (gpu_stream != undefined_stream)
+  {
+    unpack_on_gpu ();
+    return;
+  }
+#endif
+#endif
+
+  // some programs (digifil) do not call set_device
+  if ( ! device_prepared )
+    set_device ( Memory::get_manager ());
+
+  // Data format is TFP
+  // the output is written one (channel, polarization) series at a time
+  const uint64_t ndat   = input->get_ndat();
+  const unsigned nchan  = input->get_nchan();
+
+  unsigned in_offset         = 0; // byte offset of the current (ichan, ipol) within one time sample
+  const unsigned into_stride = ndim; // floats written per time sample
+  const unsigned from_stride = nchan * ndim * npol; // bytes between consecutive time samples
+  const float * lookup = table->get_values ();
+
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      float * into = output->get_datptr (ichan, ipol);
+      const unsigned char * from = input->get_rawptr() + in_offset;
+      // each input byte is converted to a float through the lookup table
+      for (uint64_t idat=0; idat<ndat; idat++)
+      {
+        into[0] = lookup[ from[0] ];
+        into[1] = lookup[ from[1] ];
+        into += into_stride;
+        from += from_stride;
+      }
+      in_offset += ndim;
+    }
+  }
+}
+
+#ifndef SKA1_ENGINE_IMPLEMENTATION
+#if HAVE_CUDA
+// Unpack the input on the GPU by handing the raw input and output pointers to
+// ska1_unpack_fpt, together with the stream recorded by set_device
+void dsp::SKA1Unpacker::unpack_on_gpu ()
+{
+  const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+
+  // nothing to do for an empty block
+  if (ndat == 0)
+    return;
+
+  void * from = (void *) input->get_rawptr();
+  cudaStream_t stream = (cudaStream_t) gpu_stream;
+
+  float * into    = (float *) output->get_datptr(0,0);
+  // distance, in floats, between the polarization 0 and 1 series of channel 0
+  size_t pol_span = output->get_datptr(0, 1) - output->get_datptr(0,0);
+
+  ska1_unpack_fpt (stream, ndat, table->get_scale(), into, from, nchan, pol_span);
+}
+#endif
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu
--- bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,279 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2010 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+#include "dsp/SKA1UnpackerCUDA.h"
+#include "dsp/Operation.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "Error.h"
+#define WARP_SIZE 32
+#define BLOCK_SIZE 1024
+//#define _GDEBUG
+
+using namespace std;
+
+void check_error_stream (const char*, cudaStream_t);
+
+// Each thread unpacks one char2 sample of the channel/pol chosen by blockIdx.y/z into a float2
+__global__ void k_unpack_fpt (float2 * to, const char2 * from,
+                              uint64_t ndat, uint64_t ostride,
+                              float scale)
+{
+  // 64-bit arithmetic so that the sample and input indices cannot wrap at 32 bits
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= ndat)
+    return;
+
+  const unsigned ichanpol = blockIdx.y * gridDim.z + blockIdx.z;
+  const uint64_t blk_stride = (uint64_t) gridDim.y * gridDim.z * BLOCK_SIZE;
+
+  //                   iblk                      ichanpol              isamp
+  const uint64_t idx = (blockIdx.x*blk_stride) + ichanpol*BLOCK_SIZE + threadIdx.x;
+  const uint64_t odx = (ichanpol * ostride) + idat;
+  char2 packed = from[idx];
+  float2 unpacked;
+  // the 0.5f literal keeps the conversion in single precision
+  unpacked.x = ((float) packed.x + 0.5f) * scale;
+  unpacked.y = ((float) packed.y + 0.5f) * scale;
+  to[odx] = unpacked;
+}
+
+// Unpack TFP-ordered 8-bit complex samples: each block stages nval_per_block values in
+// shared memory, then each warp writes WARP_SIZE time samples per channel/pol to "to"
+__global__ void k_unpack_tfp (uint64_t nval, float scale,
+                              float2 * to, const int16_t * from,
+                              const unsigned nchan, const unsigned npol,
+                              size_t pol_span, unsigned nval_per_thread,
+                              unsigned nval_per_block)
+{
+  extern __shared__ int16_t sdata[];
+  const unsigned ndim = 2;
+  const unsigned warp_num = threadIdx.x / WARP_SIZE;
+  const unsigned warp_idx = threadIdx.x % WARP_SIZE;
+  const unsigned block_offset = blockIdx.x * nval_per_block;
+  unsigned idx = (warp_num * (WARP_SIZE * nval_per_thread)) + warp_idx;
+
+  // read input data as 2 x int8_t pairs into shm
+  unsigned ival;
+  for (ival=0; ival<nval_per_thread; ival++)
+  {
+    //if (blockIdx.x == 0)
+    //  printf ("[%d] sdata[%d]=from[%d]\n", threadIdx.x, idx, (block_offset + idx));
+    if (idx < nval_per_block && (block_offset + idx) < nval)
+      sdata[idx] = from[block_offset + idx];
+    //else
+    //  sdata[idx] = 0;
+
+    idx += WARP_SIZE;
+  }
+
+  __syncthreads();
+
+  // for use in access each 8bit value
+  int8_t * sdata8 = (int8_t *) sdata;
+
+  // determine which channel and polarisation this warp should be writing out (coalesced)
+  const unsigned nchanpol = nchan * npol;
+  const unsigned ichanpol_block = warp_num * nval_per_thread;
+  unsigned ichunk   = ichanpol_block / nchanpol;
+  unsigned ichanpol = ichanpol_block % nchanpol;
+  unsigned isamp    = (ichunk * WARP_SIZE) + warp_idx;
+
+  unsigned nsamp_per_block = nval_per_block / (nchan * ndim);
+  const unsigned out_block_offset = blockIdx.x * nsamp_per_block;
+  unsigned out_idx  = (pol_span * ichanpol) + out_block_offset + isamp;
+  unsigned sout_idx = isamp * nchanpol + ichanpol;
+  // NOTE(review): writes are not checked against the number of time samples in "to"
+  float2 val;
+  for (ival=0; ival<nval_per_thread; ival++)
+  {
+    // sout_idx counts staged 16-bit values, of which this block holds nval_per_block
+    if (sout_idx < nval_per_block)
+    {
+      // apply the half-level offset before scaling, matching k_unpack_fpt
+      val.x = (float(sdata8[2*sout_idx+0]) + 0.5f) * scale;
+      val.y = (float(sdata8[2*sout_idx+1]) + 0.5f) * scale;
+      to[out_idx] = val;
+      // increment the output ichan/pol
+      ichanpol++;
+      if (ichanpol >= nchanpol)
+      {
+        ichanpol = 0;
+        ichunk++;
+        isamp += WARP_SIZE;
+        out_idx = out_block_offset + isamp;
+      }
+      else
+        out_idx += pol_span;
+      sout_idx = isamp * nchanpol + ichanpol; // keep the staged read index in step with the output
+    }
+  }
+}
+
+#ifdef SKA1_ENGINE_IMPLEMENTATION
+
+// Remember the CUDA stream that this engine is given
+CUDA::SKA1UnpackerEngine::SKA1UnpackerEngine (cudaStream_t _stream)
+  : stream (_stream)
+{ }
+
+void CUDA::SKA1UnpackerEngine::setup ()
+{
+  // determine cuda device properties for block & grid size
+  int device;
+  cudaGetDevice(&device); // NOTE(review): the return codes of both CUDA calls are ignored
+  cudaGetDeviceProperties (&gpu, device);
+}
+
+bool CUDA::SKA1UnpackerEngine::get_device_supported (dsp::Memory* memory) const
+{ // true only when memory is CUDA device memory (non-null cast result)
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory );
+}
+
+void CUDA::SKA1UnpackerEngine::set_device (dsp::Memory* memory)
+{ // NOTE(review): gpu_mem is null when memory is not CUDA device memory - confirm set_memory accepts that
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  staging.set_memory( gpu_mem);
+}
+
+
+// Copy the input block to the device staging buffer, then unpack it into output on the stream
+void CUDA::SKA1UnpackerEngine::unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output)
+{
+  const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+  const unsigned npol = input->get_npol();
+
+  // gpu staging buffer for input Bitseries Block
+  staging.Observation::operator=( *input );
+  staging.resize(ndat);
+
+  // copy from CPU Bitseries to GPU staging Bitseries
+  void * from   = (void *) input->get_rawptr();
+  void * staged = (void *) staging.get_rawptr();
+  uint64_t nval = ndat * nchan * npol;
+  uint64_t nbytes = nval * ndim;
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKA1UnpackerEngine::unpack from=" << from
+         << " to=" << staged << " nbytes=" << nbytes << endl;
+
+  // ensure no GPU related operations are pending on this stream
+  cudaStreamSynchronize (stream);
+
+  cudaError_t error = cudaMemcpyAsync (staged, from, nbytes, cudaMemcpyHostToDevice, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::SKA1Unpacker::unpack",
+                     "cudaMemcpyAsync %s", cudaGetErrorString (error));
+
+  float * into    = (float *) output->internal_get_buffer();
+  size_t pol_span = output->get_datptr(0, 1) - output->get_datptr(0,0);
+  // NOTE(review): gpu is only filled in by setup(); confirm it is called before unpack
+  unsigned chunk_size = gpu.warpSize;
+  unsigned nchunk_per_block = gpu.sharedMemPerBlock / (chunk_size * nchan * npol * ndim);
+  unsigned nval_per_block = nchunk_per_block * chunk_size * nchan * npol;
+
+  unsigned nthreads = gpu.maxThreadsPerBlock;
+  unsigned nblocks = nval / nval_per_block;
+  if (nval % nval_per_block > 0)
+    nblocks++;
+
+  unsigned nval_per_thread = nval_per_block / nthreads;
+  if (nval_per_block % nthreads)
+    nval_per_thread++;
+
+  size_t sbytes = nval_per_block * ndim;
+
+  // unpack dem bits (k_unpack_tfp is the kernel in this file that takes these arguments)
+  k_unpack_tfp<<<nblocks,nthreads,sbytes,stream>>> (nval, scale, (float2 *) into, (int16_t *) staged, nchan, npol, pol_span, nval_per_thread, nval_per_block);
+
+  cudaStreamSynchronize(stream);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("CUDA::SKA1UnpackerEngine::unpack", stream);
+}
+#else
+//! Launch k_unpack_tfp on stream with fixed launch limits; throws Error if one chunk cannot fit in shared memory
+void ska1_unpack_tfp (cudaStream_t stream, uint64_t nval, float scale, 
+                  float * into, void * staged, 
+                  unsigned  nchan, unsigned npol, unsigned ndim, 
+                  size_t pol_span)
+{
+  const unsigned warpSize = 32;
+  const unsigned sharedMemPerBlock = 49152;
+  const unsigned maxThreadsPerBlock = 1024;
+
+  // an empty chunk, or one larger than shared memory, would make the divisors below zero
+  unsigned chunk_size = warpSize;
+  unsigned chunk_bytes = chunk_size * nchan * npol * ndim;
+  if (chunk_bytes == 0 || chunk_bytes > sharedMemPerBlock)
+    throw Error (InvalidState, "ska1_unpack_tfp",
+                 "chunk of %u bytes does not fit in shared memory", chunk_bytes);
+  unsigned nchunk_per_block = sharedMemPerBlock / chunk_bytes;
+  unsigned nval_per_block = nchunk_per_block * chunk_size * nchan * npol;
+
+  unsigned nthreads = maxThreadsPerBlock;
+  unsigned nblocks = nval / nval_per_block;
+  if (nval % nval_per_block > 0)
+    nblocks++;
+  unsigned nval_per_thread = nval_per_block / nthreads;
+  if (nval_per_block % nthreads)
+    nval_per_thread++;
+
+  size_t sbytes = nval_per_block * ndim;
+#ifdef _GDEBUG
+  cerr << "nval=" << nval << " scale=" << scale << " nchan=" << nchan << " npol=" << npol << " pol_span=" << pol_span << endl;
+  cerr << "into=" << (void *) into << " staged = " << staged << endl;
+  cerr << "nblocks=" << nblocks << " nthreads=" << nthreads << " sbytes=" << sbytes << endl;
+  cerr << "nval_per_thread=" << nval_per_thread << " nval_per_block=" << nval_per_block << endl;
+#endif
+  // unpack dem bits, with sbytes of dynamic shared memory per block
+  k_unpack_tfp<<<nblocks,nthreads,sbytes,stream>>> (nval, scale, (float2 *) into, (int16_t *) staged, nchan, npol, pol_span, nval_per_thread, nval_per_block);
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("CUDA::SKA1UnpackerEngine::unpack", stream);
+}
+
+//! Launch k_unpack_fpt on stream over a (ceil(ndat / nthreads), nchan, npol) grid
+void ska1_unpack_fpt (cudaStream_t stream, uint64_t ndat, float scale,
+                      float * into, void * from, unsigned nchan,
+                      size_t pol_span)
+{
+  const unsigned nthreads = 1024;
+  const unsigned npol = 2;
+  const unsigned ndim = 2;
+
+  dim3 blocks (ndat / nthreads,  nchan, npol);
+  if (ndat % nthreads)
+    blocks.x++;
+
+  // output pol stride in units of float2
+  const uint64_t pol_stride = (uint64_t) pol_span / ndim;
+
+#ifdef _GDEBUG
+  cerr << "ndat=" << ndat << " nchan=" << nchan << " pol_span=" << pol_span << endl;
+  cerr << "pol_stride=" << pol_stride << endl;
+  cerr << "into=" << (void *) into << " from=" << from << endl;
+  cerr << "nblocks=" << blocks.x << " nthreads=" << nthreads << endl;
+#endif
+
+  // pol_span is divided by ndim because the kernel receives into as float2,
+  // while from is passed as char2
+
+  // no dynamic shared memory is requested for this launch
+  k_unpack_fpt<<<blocks,nthreads,0,stream>>> ((float2 *) into, (char2 *) from, ndat, pol_stride, scale);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream ("CUDA::SKA1UnpackerEngine::k_unpack_fpt", stream);
+
+  return;
+}
+#endif
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_File.h
--- bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_File.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_File.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spda1k/dsp/spda1k_File.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/12/01 07:55:12 $
-   $Author: ahotan $ */
+// dspsr/Kernel/Formats/spda1k/dsp/spda1k_File.h
 
 #ifndef __SPDA1K_File_h
 #define __SPDA1K_File_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/12/01 07:55:12 $
-   $Author: ahotan $ */
+// dspsr/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h
 
 #ifndef __SPDA1K_Unpacker_h
 #define __SPDA1K_Unpacker_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/ACFUnpack.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/ACFUnpack.h
--- bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/ACFUnpack.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/ACFUnpack.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spigot/dsp/ACFUnpack.h,v $
-   $Revision: 1.3 $
-   $Date: 2006/07/09 13:27:09 $
-   $Author: wvanstra $ */
+// dspsr/Kernel/Formats/spigot/dsp/ACFUnpack.h
 
 #ifndef __ACFUnpack_h
 #define __ACFUnpack_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/SpigotFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/SpigotFile.h
--- bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/SpigotFile.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/SpigotFile.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spigot/dsp/SpigotFile.h,v $
-   $Revision: 1.4 $
-   $Date: 2008/05/28 21:12:43 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/spigot/dsp/SpigotFile.h
 
 
 #ifndef __dsp_SpigotFile_h
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/Unpacker_registry.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Unpacker_registry.C
--- bl-dspsr-0+git20160405/Kernel/Formats/Unpacker_registry.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Unpacker_registry.C	2018-03-12 23:02:35.000000000 +0000
@@ -60,6 +60,11 @@
 static dsp::Unpacker::Register::Enter<dsp::CASPSRUnpacker> caspsr;
 #endif
 
+#if HAVE_ska1
+#include "dsp/SKA1Unpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::SKA1Unpacker> ska1;
+#endif
+
 #if HAVE_cpsr
 #include "dsp/CPSRTwoBitCorrection.h"
 static dsp::Unpacker::Register::Enter<dsp::CPSRTwoBitCorrection> cpsr;
@@ -104,7 +109,9 @@
 
 #if HAVE_kat
 #include "dsp/KAT7Unpacker.h"
+#include "dsp/MeerKATUnpacker.h"
 static dsp::Unpacker::Register::Enter<dsp::KAT7Unpacker> kat7;
+static dsp::Unpacker::Register::Enter<dsp::MeerKATUnpacker> meerkat;
 #endif
 
 #if HAVE_lofar_dal
@@ -149,6 +156,11 @@
 static dsp::Unpacker::Register::Enter<dsp::Mark5TwoBitCorrection> mark5;
 #endif
 
+#if HAVE_mark5b
+#include "dsp/Mark5bUnpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::Mark5bUnpacker> mark5b;
+#endif
+
 #if HAVE_maxim
 #include "dsp/MaximUnpacker.h"
 static dsp::Unpacker::Register::Enter<dsp::MaximUnpacker> maxim;
@@ -159,8 +171,14 @@
 static dsp::Unpacker::Register::Enter<dsp::MiniUnpack> miniunpack;
 #endif
 
+#if HAVE_mopsr
+#include "dsp/MOPSRUnpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::MOPSRUnpacker> mopsr;
+#endif
+
 #if HAVE_mwa
-// There is no MWA unpacker checked into the repository
+#include "dsp/EDAFourBit.h"
+static dsp::Unpacker::Register::Enter<dsp::EDAFourBit> eda4bit;
 #endif
 
 #if HAVE_pmdaq
@@ -200,11 +218,19 @@
 static dsp::Unpacker::Register::Enter<dsp::FITSUnpacker> fits;
 #endif
 
+
+#if HAVE_emerlin
+#include "dsp/EmerlinUnpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::EmerlinUnpacker> emerlin;
+#endif
+
 #if HAVE_vdif
 #include "dsp/VDIFTwoBitCorrection.h"
 static dsp::Unpacker::Register::Enter<dsp::VDIFTwoBitCorrection> vdif;
 #include "dsp/VDIFTwoBitCorrectionMulti.h"
 static dsp::Unpacker::Register::Enter<dsp::VDIFTwoBitCorrectionMulti> vdif_multi;
+#include "dsp/VDIFFourBitUnpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::VDIFFourBitUnpacker> vdif4;
 #include "dsp/VDIFEightBitUnpacker.h"
 static dsp::Unpacker::Register::Enter<dsp::VDIFEightBitUnpacker> vdif8;
 #endif
@@ -253,9 +279,17 @@
 static dsp::Unpacker::Register::Enter<dsp::GenericEightBitUnpacker> gen8bit;
 
 /*
+  Generic four-bit unpacker is used if no other four-bit unpacker steps up
+*/
+
+#include "dsp/GenericFourBitUnpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::GenericFourBitUnpacker> gen4bit;
+
+/*
   get_registry is defined here to ensure that this file is linked
 */
 dsp::Unpacker::Register& dsp::Unpacker::get_register()
 {
   return Register::get_registry();
 }
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,33 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Paul Demorest
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __VDIFFourBitUnpacker_h
+#define __VDIFFourBitUnpacker_h
+
+#include "dsp/FourBitUnpacker.h"
+
+namespace dsp {
+
+  //! Unpack 4-bit, single-pol VDIF data
+  class VDIFFourBitUnpacker : public FourBitUnpacker {
+
+  public:
+    
+    //! Constructor; name defaults to "VDIFFourBitUnpacker"
+    VDIFFourBitUnpacker (const char* name = "VDIFFourBitUnpacker");
+
+   protected:
+    
+    //! Return true if we can convert the Observation
+    virtual bool matches (const Observation* observation);
+
+  };
+
+}
+
+#endif // !defined(__VDIFFourBitUnpacker_h)
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/Makefile.am
--- bl-dspsr-0+git20160405/Kernel/Formats/vdif/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -5,12 +5,14 @@
 			  dsp/VDIFTwoBitCorrection.h \
 			  dsp/VDIFTwoBitCorrectionMulti.h \
 			  dsp/VDIFTwoBitTable.h \
+			  dsp/VDIFFourBitUnpacker.h \
 			  dsp/VDIFEightBitUnpacker.h 
 
 libvdif_la_SOURCES = VDIFFile.C \
 		     VDIFTwoBitCorrection.C \
 		     VDIFTwoBitCorrectionMulti.C \
 		     VDIFTwoBitTable.C \
+		     VDIFFourBitUnpacker.C \
 		     VDIFEightBitUnpacker.C \
 		     vdifio.c vdifio.h
 
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFile.C
--- bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFile.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFile.C	2018-03-12 23:02:35.000000000 +0000
@@ -214,7 +214,7 @@
         "Read vdif_nchan=%d, this is currently not supported", vdif_nchan);
   get_info()->set_npol( vdif_nchan );
   get_info()->set_nchan( 1 );
-  get_info()->set_rate( (double) get_info()->get_bandwidth() * 1e6 
+  get_info()->set_rate( fabs((double) get_info()->get_bandwidth()) * 1e6 
       / (double) get_info()->get_nchan() 
       * (get_info()->get_state() == Signal::Nyquist ? 2.0 : 1.0));
   if (verbose) cerr << "VDIFFile::open_file rate = " << get_info()->get_rate() << endl;
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFourBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFourBitUnpacker.C
--- bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFourBitUnpacker.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFourBitUnpacker.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,33 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2008 by Jayanta Roy and Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/VDIFFourBitUnpacker.h"
+#include "dsp/BitTable.h"
+
+using namespace std;
+
+//! Constructor: installs a 4-bit BitTable (OffsetBinary, LeastToMost order)
+dsp::VDIFFourBitUnpacker::VDIFFourBitUnpacker (const char* name)
+  : FourBitUnpacker ("VDIFFourBit")  // NOTE(review): name is ignored here; confirm the fixed label is intended
+{
+  BitTable* table = new BitTable (4, BitTable::OffsetBinary);
+  table->set_order( BitTable::LeastToMost );
+  set_table( table );
+}
+
+// Accept only observations whose machine is "VDIF" with nbit == 4 and npol == 1
+bool dsp::VDIFFourBitUnpacker::matches (const Observation* observation)
+{
+  if (verbose)
+  {
+    cerr << "dsp::VDIFFourBitUnpacker::matches machine=" << observation->get_machine()
+         << " nbit=" << observation->get_nbit() << endl;
+  }
+  const bool vdif_machine = observation->get_machine() == "VDIF";
+  return vdif_machine && observation->get_nbit() == 4 && observation->get_npol() == 1;
+}
+
diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/wapp/dsp/WAPPUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/wapp/dsp/WAPPUnpacker.h
--- bl-dspsr-0+git20160405/Kernel/Formats/wapp/dsp/WAPPUnpacker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/wapp/dsp/WAPPUnpacker.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/wapp/dsp/WAPPUnpacker.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/11/20 16:06:20 $
-   $Author: straten $ */
+// dspsr/Kernel/Formats/wapp/dsp/WAPPUnpacker.h
 
 #ifndef __WAPPUnpacker_h
 #define __WAPPUnpacker_h
diff -Nru bl-dspsr-0+git20160405/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Makefile.am
--- bl-dspsr-0+git20160405/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -18,3 +18,9 @@
 backends.list:
 	cp $(top_srcdir)/config/backends.default backends.list
 
+# make clean deletes the local_include directory (and any out-of-date headers)
+clean-local: clean-local-include
+.PHONY: clean-local-include
+clean-local-include:
+	-rm -rf local_include
+
diff -Nru bl-dspsr-0+git20160405/More/Makefile.am bl-dspsr-0.0~git20180312.50ea209/More/Makefile.am
--- bl-dspsr-0+git20160405/More/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/More/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -1,15 +1,12 @@
-SUBDIRS =
-
-lib_LTLIBRARIES = libdspsrmore.la
-libdspsrmore_la_SOURCES = 
-libdspsrmore_la_LIBADD =
 
 if HAVE_PGPLOT
-SUBDIRS += Plotting
 
-libdspsrmore_la_LIBADD += Plotting/libPlotting.la
+SUBDIRS = Plotting Applications
+
+lib_LTLIBRARIES = libdspsrmore.la
+libdspsrmore_la_SOURCES = $(top_srcdir)/Signal/General/
+libdspsrmore_la_LIBADD = Plotting/libPlotting.la
 
-SUBDIRS += Applications
 endif
 
 include $(top_srcdir)/config/Makefile.include
diff -Nru bl-dspsr-0+git20160405/More/Plotting/dsp/FluxPlot.h bl-dspsr-0.0~git20180312.50ea209/More/Plotting/dsp/FluxPlot.h
--- bl-dspsr-0+git20160405/More/Plotting/dsp/FluxPlot.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/More/Plotting/dsp/FluxPlot.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/psrchive/psrchive/More/Plotting/Pulsar/FluxPlot.h,v $
-   $Revision: 1.32 $
-   $Date: 2009/02/13 12:06:52 $
-   $Author: straten $ */
+// psrchive/More/Plotting/Pulsar/FluxPlot.h
 
 #ifndef __Pulsar_FluxPlot_h
 #define __Pulsar_FluxPlot_h
diff -Nru bl-dspsr-0+git20160405/python/Makefile.am bl-dspsr-0.0~git20180312.50ea209/python/Makefile.am
--- bl-dspsr-0+git20160405/python/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/python/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -26,5 +26,5 @@
 #############################################################################
 #
 
-INCLUDES = @SWIG_PYTHON_CPPFLAGS@ -I$(top_builddir)/local_include @PSRCHIVE_CFLAGS@
+AM_CPPFLAGS = @SWIG_PYTHON_CPPFLAGS@ -I$(top_builddir)/local_include @PSRCHIVE_CFLAGS@
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/bench_oversample.csh bl-dspsr-0.0~git20180312.50ea209/Signal/General/bench_oversample.csh
--- bl-dspsr-0+git20160405/Signal/General/bench_oversample.csh	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/bench_oversample.csh	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,30 @@
+#!/bin/tcsh
+# Run undersampling_speed -cuda twice per size: with -f equal to -b, and with -f:-b at 6:5
+set npt = 8
+set done = 0
+# npt doubles each pass; stop at 5000000 or once a run reports a non-zero status
+while ( ( $npt < 5000000 ) && ( $done ==  0 ) )
+  # first run: -f and -b are both 4 * npt
+  set npt_pow2 = `echo $npt | awk '{print ($1 * 4)}'`
+  # the gflops value is the text after the last '=' on the final output line
+  set result = `./undersampling_speed -cuda -f $npt_pow2 -b $npt_pow2 -c 1 -p 1 -t 64|& tail -n 1`
+  if ( $? == 0) then
+    set gflops_pow2  = `echo $result | awk -F= '{print $NF}'`
+  else
+    set done = 1
+  endif
+  # second run: -f is 6 * npt and -b is 5 * npt
+  set npt_fwd = `echo $npt | awk '{print ($1 * 6)}'`
+  set npt_bwd = `echo $npt | awk '{print ($1 * 5)}'`
+  set result = `./undersampling_speed -cuda -f $npt_fwd -b $npt_bwd -c 1 -p 1 -t 64 |& tail -n 1`
+  if ( $? == 0) then
+    set gflops_over  = `echo $result | awk -F= '{print $NF}'`
+  else
+    set done = 1
+  endif
+
+  echo "npts pow2=$npt_pow2 -> gflops=$gflops_pow2  npt_fwd=$npt_fwd -> gflops=$gflops_over"
+
+   @ npt = $npt * 2
+
+end
diff -Nru bl-dspsr-0+git20160405/Signal/General/Convolution.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Convolution.C
--- bl-dspsr-0+git20160405/Signal/General/Convolution.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Convolution.C	2018-03-12 23:02:35.000000000 +0000
@@ -5,6 +5,10 @@
  *
  ***************************************************************************/
 
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
 #include "dsp/Convolution.h"
 #include "dsp/WeightedTimeSeries.h"
 #include "dsp/Apodization.h"
@@ -14,6 +18,10 @@
 #include "dsp/Dedispersion.h"
 #include "dsp/Scratch.h"
 
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#endif
+
 #include "FTransform.h"
 
 //#define _DEBUG 1
@@ -33,6 +41,28 @@
 {
 }
 
+//! Set the memory manager; when it is CUDA device memory, also install a
+//! Scratch that is bound to that device memory
+void dsp::Convolution::set_device (Memory* mem)
+{
+  memory = mem;
+
+#if HAVE_CUDA
+  // the cast yields null for any other kind of memory, and the current Scratch is kept
+  if (CUDA::DeviceMemory* on_gpu = dynamic_cast< CUDA::DeviceMemory*> (mem))
+  {
+    Scratch* device_scratch = new Scratch;
+    device_scratch->set_memory (on_gpu);
+    set_scratch (device_scratch);
+  }
+#endif
+}
+
+void dsp::Convolution::set_engine (Engine * _engine)
+{
+  engine = _engine;  // when set, engine->prepare and engine->perform are used in place of the FTransform code path
+}
+
 //! Set the frequency response function
 void dsp::Convolution::set_response (Response* _response)
 {
@@ -76,11 +106,11 @@
 {
   if (!response)
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "no frequency response");
+                 "no frequency response");
 
   if (input->get_detected())
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "input data are detected");
+                 "input data are detected");
 
   response->match (input);
 
@@ -90,7 +120,7 @@
   // response must have at least two points in it
   if (response->get_ndat() < 2)
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "invalid response size");
+                 "invalid response size");
 
   // if the response has 8 dimensions, then perform matrix convolution
   matrix_convolution = response->get_ndim() == 8;
@@ -102,13 +132,13 @@
   // if matrix convolution, then there must be two polns
   if (matrix_convolution && npol != 2)
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "matrix response and input.npol != 2");
+                 "matrix response and input.npol != 2");
 
   // response must contain a unique kernel for each channel
   if (response->get_nchan() != nchan)
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "invalid response nsub=%d != nchan=%d",
-		 response->get_nchan(), nchan);
+                 "invalid response nsub=%d != nchan=%d",
+                 response->get_nchan(), nchan);
 
   // number of points after first fft
   n_fft = response->get_ndat();
@@ -123,7 +153,7 @@
 
   if (verbose)
     cerr << "Convolution::prepare filt=" << n_fft 
-	 << " smear=" << nfilt_tot << endl;
+         << " smear=" << nfilt_tot << endl;
 
   // 2 arrays needed: one for each of the forward and backward FFT results
   // 2 floats per complex number
@@ -150,25 +180,34 @@
   }
   else
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "Cannot transform Signal::State="
-		 + tostring(input->get_state()));
+                 "Cannot transform Signal::State="
+                 + tostring(input->get_state()));
 
   // the FFT size must be greater than the number of discarded points
   if (nsamp_fft < nsamp_overlap)
     throw Error (InvalidState, "dsp::Convolution::prepare",
-		 "error nfft=%d < nfilt=%d", nsamp_fft, nsamp_overlap);
+                 "error nfft=%d < nfilt=%d", nsamp_fft, nsamp_overlap);
 
   if (has_buffering_policy())
   {
     if (verbose)
       cerr << "dsp::Convolution::prepare"
-	" reserve=" << nsamp_fft << endl;
+        " reserve=" << nsamp_fft << endl;
 
     get_buffering_policy()->set_minimum_samples (nsamp_fft);
   }
 
   prepare_output ();
 
+  if (engine)
+  {
+    if (verbose)
+      cerr << "dsp::Convolution::make_preparations setup engine" << endl;
+    engine->prepare (this);
+    prepared = true;
+    return;
+  }
+
   using namespace FTransform;
 
   if (state == Signal::Nyquist)
@@ -191,13 +230,19 @@
 
   if (verbose)
     cerr << "dsp::Convolution::prepare_output nsamp fft=" << nsamp_fft
-	 << " overlap=" << nsamp_overlap << " step=" << nsamp_step << endl;
+         << " overlap=" << nsamp_overlap << " step=" << nsamp_step << endl;
 
   // number of FFTs for this data block
   npart = 0;
   if (ndat >= nsamp_fft)
     npart = (ndat-nsamp_overlap)/nsamp_step;
 
+  if (engine)
+  {
+    //scratch_needed = npart * n_fft * 2;
+    scratch_needed = n_fft * 2 * 2;
+  }
+
   /*
     The input must be buffered before the output is modified
     because the transformation may be inplace
@@ -220,6 +265,14 @@
 
   if ( state == Signal::Nyquist )
     output->set_rate( 0.5*get_input()->get_rate() );
+
+  // set the input sample
+  uint64_t output_ndat = output->get_ndat();
+  int64_t input_sample = input->get_input_sample();
+  if (output_ndat == 0)
+    output->set_input_sample (0);
+  else if (input_sample >= 0)
+    output->set_input_sample ((input_sample / nsamp_step) * nsamp_step);
 }
 
 //! Reserve the maximum amount of output space required
@@ -232,17 +285,17 @@
 
   if (verbose)
     cerr << "Convolution::reserve ndat=" << ndat << " nfft=" << nsamp_fft
-	 << " npart=" << npart << endl;
+         << " npart=" << npart << endl;
 
   uint64_t output_ndat = npart * nsamp_step;
   if ( state == Signal::Nyquist )
     output_ndat /= 2;
-    
+
   if( input != output )
     output->resize (output_ndat);
   else
     output->set_ndat (output_ndat);
-    
+
   // nfilt_pos complex points are dropped from the start of the first FFT
   output->change_start_time (nfilt_pos);
 
@@ -304,6 +357,12 @@
   if (matrix_convolution)
     spectrum[1] += n_fft * 2;
 
+  if (engine)
+  {
+    engine->set_scratch (spectrum[0]);
+    engine->perform (input, output, npart);
+    return;
+  }
   float* complex_time  = spectrum[1] + n_fft * 2;
 
   // although only two extra points are required, adding 4 ensures that
@@ -315,7 +374,7 @@
 
   if (verbose)
     cerr << "dsp::Convolution::transformation step nsamp=" << nsamp_step
-	 << " bytes=" << nbytes_step << " ndim=" << ndim << endl;
+         << " bytes=" << nbytes_step << " ndim=" << ndim << endl;
  
   const unsigned cross_pol = matrix_convolution ? 2 : 1;
  
@@ -331,71 +390,71 @@
     for (unsigned ipol=0; ipol < npol; ipol++)
       for (uint64_t ipart=0; ipart < npart; ipart++)
       {
-	offset = ipart * step;
-		
-	for (jpol=0; jpol<cross_pol; jpol++)
-	{
-	  if (matrix_convolution)
-	    ipol = jpol;
-	  
-	  ptr = const_cast<float*>(input->get_datptr (ichan, ipol)) + offset;
-	  
-	  if (apodization)
-	  {
-	    apodization -> operate (ptr, complex_time);
-	    ptr = complex_time;
-	  }
-
-	  DEBUG("FORWARD: nfft=" << nsamp_fft << " in=" << ptr \
-		<< " out=" << spectrum[ipol]);
-
-	  if (state == Signal::Nyquist)
-	    forward->frc1d (nsamp_fft, spectrum[ipol], ptr);
-
-	  else if (state == Signal::Analytic)
-	    forward->fcc1d (nsamp_fft, spectrum[ipol], ptr);
-	  
-	}
-	
-	if (matrix_convolution) {
-
-	  response->operate (spectrum[0], spectrum[1], ichan);
-
-	  if (passband)
-	    passband->integrate (spectrum[0], spectrum[1], ichan);
-
-	}
-	
-	else {
-
-	  response->operate (spectrum[ipol], ipol, ichan);
-
-	  if (passband)
-	    passband->integrate (spectrum[ipol], ipol, ichan);
-
-	}
-	
-	for (jpol=0; jpol<cross_pol; jpol++)
-	{
-	  if (matrix_convolution)
-	    ipol = jpol;
-	  
-	  DEBUG("BACKWARD: nfft=" << n_fft << " in=" << spectrum[ipol] \
-		<< " out=" << complex_time);
-
-	  // fft back to the complex time domain
-	  backward->bcc1d (n_fft, complex_time, spectrum[ipol]);
-	  
-	  // copy the good (complex) data back into the time stream
-	  ptr = output -> get_datptr (ichan, ipol) + offset;
-
-	  DEBUG("memcpy: nbytes=" << nbytes_step \
-		<< " in=" << complex_time + nfilt_pos*2 \
-		<< " out=" << ptr << " offset=" << offset);
+        offset = ipart * step;
+                
+        for (jpol=0; jpol<cross_pol; jpol++)
+        {
+          if (matrix_convolution)
+            ipol = jpol;
+          
+          ptr = const_cast<float*>(input->get_datptr (ichan, ipol)) + offset;
+          
+          if (apodization)
+          {
+            apodization -> operate (ptr, complex_time);
+            ptr = complex_time;
+          }
+
+          DEBUG("FORWARD: nfft=" << nsamp_fft << " in=" << ptr \
+                << " out=" << spectrum[ipol]);
+
+          if (state == Signal::Nyquist)
+            forward->frc1d (nsamp_fft, spectrum[ipol], ptr);
+
+          else if (state == Signal::Analytic)
+            forward->fcc1d (nsamp_fft, spectrum[ipol], ptr);
+          
+        }
+        
+        if (matrix_convolution) {
+
+          response->operate (spectrum[0], spectrum[1], ichan);
+
+          if (passband)
+            passband->integrate (spectrum[0], spectrum[1], ichan);
+
+        }
+        
+        else {
+
+          response->operate (spectrum[ipol], ipol, ichan);
+
+          if (passband)
+            passband->integrate (spectrum[ipol], ipol, ichan);
+
+        }
+        
+        for (jpol=0; jpol<cross_pol; jpol++)
+        {
+          if (matrix_convolution)
+            ipol = jpol;
+          
+          DEBUG("BACKWARD: nfft=" << n_fft << " in=" << spectrum[ipol] \
+                << " out=" << complex_time);
+
+          // fft back to the complex time domain
+          backward->bcc1d (n_fft, complex_time, spectrum[ipol]);
+          
+          // copy the good (complex) data back into the time stream
+          ptr = output -> get_datptr (ichan, ipol) + offset;
+
+          DEBUG("memcpy: nbytes=" << nbytes_step \
+                << " in=" << complex_time + nfilt_pos*2 \
+                << " out=" << ptr << " offset=" << offset);
 
-	  memcpy (ptr, complex_time + nfilt_pos*2, nbytes_step);
+          memcpy (ptr, complex_time + nfilt_pos*2, nbytes_step);
 
-	}  // for each poln, if matrix convolution
+        }  // for each poln, if matrix convolution
       }  // for each part of the time series
   // for each poln
   // for each channel
diff -Nru bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDACallbacks.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDACallbacks.cu
--- bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDACallbacks.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDACallbacks.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,259 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/ConvolutionCUDACallbacks.h"
+#include "CUFFTError.h"
+#include "debug.h"
+
+#if HAVE_CUFFT_CALLBACKS
+#include <cufftXt.h>
+#endif
+
+using namespace std;
+
+#if HAVE_CUFFT_CALLBACKS
+
+// [0] channel offset ( ichan * npt)
+// [1] npt
+// [2] first_ipt ( nfilt_pos )
+// [3] last_ipt ( npt - nfilt_neg )
+// [4] nfilt_tot 
+__device__ __constant__ unsigned conv_params[5];
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with multiplication by dedispersion kernel [no FFT batching]
+//
+__device__ void CB_convolve_and_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // callerInfo is the kernel array; conv_params[0] is the offset of the current channel
+  const cufftComplex * kernels = (const cufftComplex *) callerInfo;
+  ((cufftComplex *) dataOut)[offset] = cuCmulf (d, kernels[conv_params[0] + offset]);
+}
+
+__device__ void CB_convolve_and_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // position of this element within its own FFT of the batch (conv_params[1] is npt)
+  const unsigned ipt = offset % conv_params[1];
+  // kernel value for that position, after applying the channel offset
+  const cufftComplex * kernels = (const cufftComplex *) callerInfo;
+  ((cufftComplex *) dataOut)[offset] = cuCmulf (d, kernels[conv_params[0] + ipt]);
+}
+__device__ cufftCallbackStoreC d_store_fwd        = CB_convolve_and_store;
+__device__ cufftCallbackStoreC d_store_fwd_batch = CB_convolve_and_store_batch;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with output filtering on
+//
+__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // only points in the half-open range [first_ipt, last_ipt) are stored,
+  // i.e. from nfilt_pos up to, but not including, npt - nfilt_neg;
+  // every other point is discarded
+  const unsigned first_ipt = conv_params[2];
+  const unsigned last_ipt = conv_params[3];
+
+  // the stored points are shifted down so that first_ipt lands at index 0
+  if (offset >= first_ipt && offset < last_ipt)
+    ((cufftComplex *) dataOut)[offset - first_ipt] = d;
+}
+
+__device__ void CB_filtered_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // split the flat offset into a batch index and a point within that FFT
+  const unsigned npt = conv_params[1];
+  const unsigned ibatch = offset / npt;
+  const unsigned ipt = offset - (ibatch * npt);
+
+  // discard points outside the half-open range [nfilt_pos, npt - nfilt_neg)
+  const bool keep = (ipt >= conv_params[2]) && (ipt < conv_params[3]);
+  if (!keep)
+    return;
+
+  // subtract the points discarded so far: nfilt_tot for each preceding
+  // batch, plus nfilt_pos at the start of the current batch
+  const size_t out_offset = offset - ((ibatch * conv_params[4]) + conv_params[2]);
+
+  ((cufftComplex *) dataOut)[out_offset] = d;
+}
+
+__device__ cufftCallbackStoreC d_store_bwd       = CB_filtered_store;
+__device__ cufftCallbackStoreC d_store_bwd_batch = CB_filtered_store_batch;
+
+void setup_callbacks_ConvolutionCUDA (cufftHandle plan_fwd, cufftHandle plan_bwd, 
+                                      cufftHandle plan_fwd_batched, cufftHandle plan_bwd_batched,
+                                      cufftComplex * d_kernels, int nbatch, cudaStream_t stream)
+{
+  cudaError_t error;
+  cufftResult_t result;
+  // host-side copies of the device callback pointers d_store_*
+  cufftCallbackStoreC h_store_fwd;
+  cufftCallbackStoreC h_store_bwd;
+  cufftCallbackStoreC h_store_fwd_batch;
+  cufftCallbackStoreC h_store_bwd_batch;
+  // read each device callback pointer into its host-side copy; a failure is thrown as FailedCall
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd, d_store_fwd, 
+                                    sizeof(h_store_fwd), 0, 
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd, d_store_bwd,
+                                    sizeof(h_store_bwd), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd_batch, d_store_fwd_batch,
+                                    sizeof(h_store_fwd_batch), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd_batch");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd_batch, d_store_bwd_batch,
+                                    sizeof(h_store_bwd_batch), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd_batch");
+  // attach the store callbacks: forward plans get d_kernels as callerInfo, backward plans get none
+  result = cufftXtSetCallback (plan_fwd, (void **)&h_store_fwd,
+                               CUFFT_CB_ST_COMPLEX, (void **)&d_kernels);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+      "cufftXtSetCallback (plan_fwd, h_store_fwd)");
+
+  result = cufftXtSetCallback (plan_bwd, (void **)&h_store_bwd,
+                               CUFFT_CB_ST_COMPLEX, 0);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+      "cufftXtSetCallback (plan_bwd, h_store_bwd)");
+  // the batched plans are only given callbacks when nbatch > 0
+  if (nbatch > 0)
+  {
+    result = cufftXtSetCallback (plan_fwd_batched, (void **)&h_store_fwd_batch,
+                                 CUFFT_CB_ST_COMPLEX, (void **)&d_kernels);
+    if (result != CUFFT_SUCCESS)
+      throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+        "cufftXtSetCallback (plan_fwd_batched, h_store_fwd_batch)");
+
+    result = cufftXtSetCallback (plan_bwd_batched, (void **)&h_store_bwd_batch,
+                                 CUFFT_CB_ST_COMPLEX, 0);
+    if (result != CUFFT_SUCCESS)
+      throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+        "cufftXtSetCallback (plan_bwd_batched, h_store_bwd_batch)");
+  }
+}
+
+void setup_callbacks_conv_params (unsigned * h_ptr, unsigned h_size, cudaStream_t stream)
+{
+  // copy h_size bytes from h_ptr to the start of conv_params, on the given stream
+  const cudaError_t status = cudaMemcpyToSymbolAsync (conv_params, (void *) h_ptr,
+                                                      h_size, 0,
+                                                      cudaMemcpyHostToDevice, stream);
+
+  if (status == cudaSuccess)
+    return;
+  throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel",
+               "could not initialize convolution params in device memory");
+}
+
+// 
+//
+//
+
+// [0] first_ipt ( nfilt_pos )
+// [1] last_ipt ( npt - nfilt_neg )
+__device__ __constant__ unsigned conv_params_spectral[2];
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with multiplication by dedispersion kernel
+//
+__device__ void CB_convolve_and_store_spectral (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // callerInfo is the kernel array, indexed directly by the FFT output offset
+  const cufftComplex * kernels = (const cufftComplex *) callerInfo;
+  ((cufftComplex *) dataOut)[offset] = cuCmulf (d, kernels[offset]);
+}
+__device__ cufftCallbackStoreC d_store_fwd_spectral = CB_convolve_and_store_spectral;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with output filtering on
+//
+__device__ void CB_filtered_store_spectral (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // only points in the half-open range [first_ipt, last_ipt) are stored,
+  // i.e. from nfilt_pos up to, but not including, npt - nfilt_neg;
+  // every other point is discarded
+  const unsigned first_ipt = conv_params_spectral[0];
+  const unsigned last_ipt = conv_params_spectral[1];
+
+  // the stored points are shifted down so that first_ipt lands at index 0
+  if (offset >= first_ipt && offset < last_ipt)
+    ((cufftComplex *) dataOut)[offset - first_ipt] = d;
+}
+__device__ cufftCallbackStoreC d_store_bwd_spectral = CB_filtered_store_spectral;
+
+
+void setup_callbacks_ConvolutionCUDASpectral (cufftHandle plan_fwd, cufftHandle plan_bwd, cufftComplex * d_kernels, cudaStream_t stream)
+{
+  cudaError_t error;
+  cufftResult_t result;
+  // host-side copies of the device callback pointers d_store_*_spectral
+  cufftCallbackStoreC h_store_fwd;
+  cufftCallbackStoreC h_store_bwd;
+  // read each device callback pointer into its host-side copy; a failure is thrown as FailedCall
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd, d_store_fwd_spectral,
+                                    sizeof(h_store_fwd), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd, d_store_bwd_spectral,
+                                    sizeof(h_store_bwd), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd");
+  // attach the store callbacks: the forward plan gets d_kernels as callerInfo, the backward plan none
+  result = cufftXtSetCallback (plan_fwd, (void **)&h_store_fwd,
+                               CUFFT_CB_ST_COMPLEX, (void **)&d_kernels);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_callbacks",
+      "cufftXtSetCallback (plan_fwd, h_store_fwd)");
+
+  result = cufftXtSetCallback (plan_bwd, (void **)&h_store_bwd,
+                               CUFFT_CB_ST_COMPLEX, 0);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_callbacks",
+      "cufftXtSetCallback (plan_bwd, h_store_bwd)");
+}
+
+void setup_callbacks_conv_params_spectral (unsigned * h_ptr, unsigned h_size, cudaStream_t stream)
+{
+  // copy h_size bytes from h_ptr to the start of conv_params_spectral, on the given stream
+  const cudaError_t status = cudaMemcpyToSymbolAsync (conv_params_spectral, (void *) h_ptr,
+                                                      h_size, 0, cudaMemcpyHostToDevice, stream);
+  if (status == cudaSuccess)
+    return;
+  throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel",
+               "could not initialize convolution params in device memory");
+}
+
+
+
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,797 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/ConvolutionCUDA.h"
+#include "CUFFTError.h"
+#include "debug.h"
+
+#if HAVE_CUFFT_CALLBACKS
+#include "dsp/ConvolutionCUDACallbacks.h"
+#include <cufftXt.h>
+#endif
+
+#include <iostream>
+#include <cassert>
+
+using namespace std;
+
+void check_error_stream (const char*, cudaStream_t);
+
+__global__ void k_multiply_conv (float2* d_fft, const __restrict__ float2 * kernel, unsigned npart)
+{
+  // the total number of threads in the launch is used as the length of one part
+  const unsigned npt = blockDim.x * gridDim.x;
+  const unsigned ipt = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // every part is multiplied by the same kernel element, so it is read once
+  const float2 k = kernel[ipt];
+
+  // visit this thread's point in each of the npart consecutive parts
+  const unsigned nval = npt * npart;
+  for (unsigned idx = ipt; idx < nval; idx += npt)
+    d_fft[idx] = cuCmulf(d_fft[idx], k);
+}
+
+__global__ void k_ncopy_conv (float2* output_data, unsigned output_stride,
+           const float2* input_data, unsigned input_stride,
+           unsigned to_copy)
+{
+  // blockIdx.y selects the batch; input and output advance by their own stride
+  const float2* src = input_data + blockIdx.y * input_stride;
+  float2* dst = output_data + blockIdx.y * output_stride;
+
+  // blockIdx.x and threadIdx.x select the sample within the batch
+  const unsigned isamp = blockIdx.x * blockDim.x + threadIdx.x;
+  // threads at or beyond to_copy have nothing to copy
+  if (isamp >= to_copy)
+    return;
+  dst[isamp] = src[isamp];
+}
+
+
+#if HAVE_CUFFT_CALLBACKS
+/*
+// [0] channel offset ( ichan * npt)
+// [1] npt
+// [2] first_ipt ( nfilt_pos )
+// [3] last_ipt ( npt - nfilt_neg )
+// [4] nfilt_tot 
+__device__ __constant__ unsigned conv_params[5];
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with multiplication by dedispersion kernel [no FFT batching]
+//
+__device__ void CB_convolve_and_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // the dedispersion kernel complex float for this element of the FFT
+  const cufftComplex k = ((cufftComplex *) callerInfo)[conv_params[0] + offset];
+  ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k);
+}
+
+__device__ void CB_convolve_and_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // the dedispersion kernel value for this element of the FFT
+  const unsigned kernel_offset = conv_params[0] + (offset % conv_params[1]);
+  const cufftComplex k = ((cufftComplex *) callerInfo)[kernel_offset];
+
+  ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k);
+}
+__device__ cufftCallbackStoreC d_store_fwd 			 = CB_convolve_and_store;
+__device__ cufftCallbackStoreC d_store_fwd_batch = CB_convolve_and_store_batch;
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with output filtering on
+//
+__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+	// if offset < nfilt_pos, discard
+  if (offset < conv_params[2])
+    return;
+
+	// if offset > (npt - nfilt_neg), discard
+  if (offset >= conv_params[3])
+    return;
+
+  ((cufftComplex*)dataOut)[offset - conv_params[2]] = d;
+}
+
+__device__ void CB_filtered_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+	const unsigned ibatch = offset / conv_params[1];
+	const unsigned ipt = offset - (ibatch * conv_params[1]);
+
+	// if ipt < nfilt_pos, discard
+	if (ipt < conv_params[2])
+		return;
+	
+	// if ipt > (npt - nfilt_neg), discard
+	if (ipt >= conv_params[3])
+		return;
+
+	// substract the required offsets
+	offset -= ((ibatch * conv_params[4]) + conv_params[2]);
+
+  ((cufftComplex*)dataOut)[offset] = d;
+}
+
+__device__ cufftCallbackStoreC d_store_bwd       = CB_filtered_store;
+__device__ cufftCallbackStoreC d_store_bwd_batch = CB_filtered_store_batch;
+*/
+#endif
+
+CUDA::ConvolutionEngine::ConvolutionEngine (cudaStream_t _stream)
+{
+  stream = _stream;
+
+  // FFT lengths and batching are unknown until prepare is called
+  nbatch = 0;
+  npt_fwd = 0;
+  npt_bwd = 0;
+
+  // no device memory has been allocated yet
+  work_area = 0;
+  work_area_size = 0;
+  buf = 0;
+  d_kernels = 0;
+
+  // each plan handle, with the message reported if it cannot be created
+  struct
+  {
+    cufftHandle * handle;
+    const char * message;
+  }
+  plans[] =
+  {
+    { &plan_fwd,         "cufftCreate(plan_fwd)" },
+    { &plan_fwd_batched, "cufftCreate(plan_fwd_batched)" },
+    { &plan_bwd,         "cufftCreate(plan_bwd)" },
+    { &plan_bwd_batched, "cufftCreate(plan_bwd_batched)" }
+  };
+  // create the handles in the order listed, stopping at the first failure
+  for (unsigned iplan = 0; iplan < 4; iplan++)
+  {
+    const cufftResult result = cufftCreate (plans[iplan].handle);
+    if (result != CUFFT_SUCCESS)
+      throw CUFFTError (result, "CUDA::ConvolutionEngine::ConvolutionEngine", 
+                        plans[iplan].message);
+  }
+}
+
+CUDA::ConvolutionEngine::~ConvolutionEngine()
+{
+  cufftResult result;
+  // NOTE(review): these throws escape a destructor (std::terminate if it is implicitly noexcept) - confirm
+  result = cufftDestroy (plan_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine",
+                      "cufftDestroy(plan_fwd)");
+
+  result = cufftDestroy (plan_fwd_batched);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine", 
+                      "cufftDestroy(plan_fwd_batched)");
+
+  result = cufftDestroy (plan_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine",
+                      "cufftDestroy(plan_bwd)");
+
+  result = cufftDestroy (plan_bwd_batched);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine",
+                      "cufftDestroy(plan_bwd_batched)");
+  // release the work area, if one was allocated; buf and d_kernels are not freed here
+  if (work_area)
+  {
+    cudaError_t error = cudaFree (work_area);
+    if (error != cudaSuccess)
+       throw Error (FailedCall, "CUDA::ConvolutionEngine::~ConvolutionEngine",
+                    "cudaFree(%p): %s", work_area,
+                     cudaGetErrorString (error));
+  }
+}
+
+void CUDA::ConvolutionEngine::set_scratch (void * scratch)
+{
+  d_scratch = static_cast<cufftComplex *> (scratch);  // remember the caller's scratch pointer
+}
+
+// Configure the engine for a convolution: FFT lengths, kernel and FFT plans
+void CUDA::ConvolutionEngine::prepare (dsp::Convolution * convolution)
+{
+  const dsp::Response* response = convolution->get_response();
+
+  // FFT lengths, the samples lost per forward FFT and the step between FFTs
+  npt_bwd = response->get_ndat();
+  npt_fwd = convolution->get_minimum_samples();
+  nsamp_overlap = convolution->get_minimum_samples_lost();
+  nsamp_step = npt_fwd - nsamp_overlap;
+  nfilt_pos = response->get_impulse_pos ();
+  nfilt_neg = response->get_impulse_neg ();
+
+  // Nyquist input gets a real-to-complex forward FFT, anything else complex-to-complex
+  const bool real_input = convolution->get_input()->get_state() == Signal::Nyquist;
+  type_fwd = real_input ? CUFFT_R2C : CUFFT_C2C;
+
+  // configure the dedispersion kernel
+  setup_kernel (convolution->get_response());
+
+  // configure the singular FFT
+  setup_singular ();
+
+  // batching is only more efficient up to about 1M points, at least on
+  // the TitanX, so choose the number of FFTs per batch based on that
+  const int max_batch_points = 1048576;
+  const unsigned npart = max_batch_points / npt_fwd;
+
+  if (npart <= 1)
+    nbatch = 0;
+  else
+    setup_batched (npart);
+
+#if HAVE_CUFFT_CALLBACKS
+  setup_callbacks_ConvolutionCUDA (plan_fwd, plan_bwd, plan_fwd_batched, plan_bwd_batched, d_kernels, nbatch, stream);
+#endif
+
+  // initialize the kernel size configuration
+  mp.init();
+  mp.set_nelement (npt_bwd);
+}
+
+// copy the convolution kernels of the response to the device (and set conv_params)
+void CUDA::ConvolutionEngine::setup_kernel (const dsp::Response * response)
+{
+  unsigned nchan = response->get_nchan();
+  unsigned ndat = response->get_ndat();
+  unsigned ndim = response->get_ndim();
+
+  assert (ndim == 2);
+  assert (d_kernels == 0);
+  // allocate device memory for the dedispersion kernels of all channels; the
+  // byte count is kept in size_t so that it is not truncated to 32 bits
+  const size_t kernels_size = ndat * sizeof(cufftComplex) * nchan;
+  cudaError_t error = cudaMalloc ((void**)&d_kernels, kernels_size);
+  if (error != cudaSuccess)
+  {
+    throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel",
+     "could not allocate device memory for dedispersion kernel");
+  }
+
+  // copy all kernels from host to device
+  const float* kernel = response->get_datptr (0,0);
+
+  cerr << "CUDA::ConvolutionEngine::setup_kernel cudaMemcpy stream=" << stream 
+       << " size=" << kernels_size << endl;
+  if (stream)
+    error = cudaMemcpyAsync (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice, stream);
+  else
+    error = cudaMemcpy (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice);
+  if (error != cudaSuccess)
+  {
+    throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel",
+     "could not copy dedispersion kernel to device");
+  }
+
+#if HAVE_CUFFT_CALLBACKS
+  error = cudaMallocHost ((void **) &h_conv_params, sizeof(unsigned) * 5);
+  if (error != cudaSuccess)
+    throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel",
+                 "could not allocate memory for h_conv_params");
+  // NOTE(review): assumes h_conv_params is an unsigned* member - confirm in the header
+  h_conv_params[0] = 0;
+  h_conv_params[1] = npt_bwd;
+  h_conv_params[2] = nfilt_pos;
+  h_conv_params[3] = npt_bwd - nfilt_neg;
+  h_conv_params[4] = nfilt_pos + nfilt_neg;
+  // copy all 5 values; sizeof(h_conv_params) is only the pointer size if it is a pointer
+  setup_callbacks_conv_params (h_conv_params, sizeof(unsigned) * 5, stream);
+
+#endif
+}
+
+void CUDA::ConvolutionEngine::setup_singular ()
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngine::setup_singular fwd=" << npt_fwd 
+         << " bwd=" << npt_bwd << endl;
+  // create the single-FFT plans on the engine's stream, and a buffer of npt_bwd points
+  // forward plan; NOTE(review): cufftPlan1d replaces the handle made by cufftCreate - leak?
+  cufftResult result = cufftPlan1d (&plan_fwd, npt_fwd, type_fwd, 1);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular",
+                      "cufftPlan1d(plan_fwd)");
+
+  result = cufftSetStream (plan_fwd, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular",
+          "cufftSetStream(plan_fwd)");
+
+  // setup backward plan
+  result = cufftPlan1d (&plan_bwd, npt_bwd, CUFFT_C2C, 1);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular",
+                      "cufftPlan1d(plan_bwd)");
+
+  result = cufftSetStream (plan_bwd, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular",
+                      "cufftSetStream(plan_bwd)");
+
+  size_t buffer_size = npt_bwd * sizeof (cufftComplex);
+  cudaError_t error = cudaMalloc ((void **) &buf, buffer_size);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_singular",
+                 "cudaMalloc(%p, %lu): %s", (void *) &buf, (unsigned long) buffer_size,
+                 cudaGetErrorString (error));
+}
+
+
+// configure the batched forward and backward FFT plans, and resize buf and the work area
+void CUDA::ConvolutionEngine::setup_batched (unsigned _nbatch)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngine::setup_batched npt_fwd=" << npt_fwd 
+         << " npt_bwd=" << npt_bwd << " nbatch=" << _nbatch << endl;
+
+  nbatch = _nbatch;
+
+  int rank = 1; 
+  int inembed[1];
+  int onembed[1];
+  int istride, ostride, idist, odist;
+  cufftResult result;
+
+  // work area sizes for the forward and backward batched plans
+  size_t work_size_fwd, work_size_bwd;
+
+  // layout of the forward batched plan
+  inembed[0] = npt_fwd;
+  onembed[0] = npt_bwd;
+
+  istride = 1;
+  ostride = 1;
+
+  // consecutive forward FFTs start only nsamp_step input samples apart
+  idist = nsamp_step;
+  odist = npt_bwd;
+
+  // setup forward fft
+  result = cufftMakePlanMany (plan_fwd_batched, rank, &npt_fwd, 
+                              inembed, istride, idist,
+                              onembed, ostride, odist,
+                              type_fwd, nbatch, &work_size_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", 
+                      "cufftMakePlanMany (plan_fwd_batched)");
+
+  result = cufftSetStream (plan_fwd_batched, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+          "cufftSetStream(plan_fwd_batched)");
+
+  // get a rough estimate on work buffer size (replaces the size reported above)
+  work_size_fwd = 0;
+  result = cufftEstimateMany(rank, &npt_fwd, 
+                             inembed, istride, idist, 
+                             onembed, ostride, odist, 
+                             type_fwd, nbatch, &work_size_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+                      "cufftEstimateMany(plan_fwd)");
+
+  // layout of the backward batched plan
+  inembed[0] = npt_bwd;
+  onembed[0] = nsamp_step;
+
+  istride = 1;
+  ostride = 1;
+
+  // NOTE(review): outputs are nsamp_step apart although each FFT has npt_bwd points - confirm
+  idist = npt_bwd;
+  odist = nsamp_step;
+
+  // the backward FFT reads contiguous blocks of npt_bwd points
+  DEBUG("CUDA::ConvolutionEngine::setup_batched cufftMakePlanMany (plan_bwd_batched)");
+  result = cufftMakePlanMany (plan_bwd_batched, rank, &npt_bwd, 
+                              inembed, istride, idist,
+                              onembed, ostride, odist,
+                              CUFFT_C2C, nbatch, &work_size_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", 
+                      "cufftMakePlanMany (plan_bwd_batched)");
+
+  result = cufftSetStream (plan_bwd_batched, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+                      "cufftSetStream(plan_bwd_batched)");
+
+  DEBUG("CUDA::ConvolutionEngine::setup_batched bwd FFT plan set");
+
+  work_size_bwd = 0;
+  result = cufftEstimateMany(rank, &npt_bwd, 
+                             inembed, istride, idist, 
+                             onembed, ostride, odist, 
+                             CUFFT_C2C, nbatch, &work_size_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+                      "cufftEstimateMany(plan_bwd)");
+
+  work_area_size = (work_size_fwd > work_size_bwd) ? work_size_fwd : work_size_bwd;
+  auto_allocate = work_area_size > 0;
+
+  DEBUG("CUDA::ConvolutionEngine::setup_batched cufftSetAutoAllocation(plan_fwd)");
+  result = cufftSetAutoAllocation(plan_fwd_batched, auto_allocate);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+                      "cufftSetAutoAllocation(plan_fwd_batched, %d)", 
+                      auto_allocate);
+
+  DEBUG("CUDA::ConvolutionEngine::setup_batched cufftSetAutoAllocation(plan_bwd_batched)");
+  result = cufftSetAutoAllocation(plan_bwd_batched, auto_allocate);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched",
+                      "cufftSetAutoAllocation(plan_bwd_batched, %d)", auto_allocate);
+
+  // free the space allocated for buf in setup_singular
+  cudaError_t error = cudaFree (buf);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched",
+                 "cudaFree(%p): %s", (void *) buf, cudaGetErrorString (error));
+
+  size_t batched_buffer_size = sizeof (cufftComplex) * npt_bwd * nbatch;
+  error = cudaMalloc ((void **) &buf, batched_buffer_size);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched",
+                 "cudaMalloc(%p, %lu): %s", (void *) &buf, (unsigned long) batched_buffer_size,
+                 cudaGetErrorString (error));
+
+  // (re)allocate the work area; NOTE(review): no cufftSetWorkArea call is visible in
+  // this function, and auto allocation is switched on whenever the area is non-empty
+  if (work_area_size > 0)
+  {
+    if (work_area)
+    {
+      error = cudaFree (work_area);
+      if (error != cudaSuccess)
+         throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched",
+                     "cudaFree(%p): %s", work_area,
+                     cudaGetErrorString (error));
+    }
+    DEBUG("CUDA::ConvolutionEngine::setup cudaMalloc("<<work_area<<", "<<work_area_size<<")");
+    error = cudaMalloc (&work_area, work_area_size);  
+    if (error != cudaSuccess)
+      throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched", 
+                   "cudaMalloc(%p, %lu): %s", (void *) &work_area, (unsigned long) work_area_size,
+                   cudaGetErrorString (error));
+  }
+  else
+    work_area = 0;
+}
+
+#if HAVE_CUFFT_CALLBACKS
+/*
+void CUDA::ConvolutionEngine::setup_callbacks ()
+{
+  cudaError_t error;
+  cufftResult_t result;
+
+  cufftCallbackStoreC h_store_fwd;
+  cufftCallbackStoreC h_store_bwd;
+  cufftCallbackStoreC h_store_fwd_batch;
+  cufftCallbackStoreC h_store_bwd_batch;
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd, d_store_fwd, 
+																		sizeof(h_store_fwd), 0, 
+																		cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd, d_store_bwd,
+                                    sizeof(h_store_bwd), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd_batch, d_store_fwd_batch,
+                                    sizeof(h_store_fwd_batch), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd_batch");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd_batch, d_store_bwd_batch,
+                                    sizeof(h_store_bwd_batch), 0,
+                                    cudaMemcpyDeviceToHost, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd_batch");
+
+  result = cufftXtSetCallback (plan_fwd, (void **)&h_store_fwd,
+                               CUFFT_CB_ST_COMPLEX, (void **)&d_kernels);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+      "cufftXtSetCallback (plan_fwd, h_store_fwd)");
+
+  result = cufftXtSetCallback (plan_bwd, (void **)&h_store_bwd,
+                               CUFFT_CB_ST_COMPLEX, 0);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+      "cufftXtSetCallback (plan_bwd, h_store_bwd)");
+
+	if (nbatch > 0)
+	{
+		result = cufftXtSetCallback (plan_fwd_batched, (void **)&h_store_fwd_batch,
+																 CUFFT_CB_ST_COMPLEX, (void **)&d_kernels);
+		if (result != CUFFT_SUCCESS)
+			throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+				"cufftXtSetCallback (plan_fwd_batched, h_store_fwd_batch)");
+
+    result = cufftXtSetCallback (plan_bwd_batched, (void **)&h_store_bwd_batch,
+                                 CUFFT_CB_ST_COMPLEX, 0);
+    if (result != CUFFT_SUCCESS)
+      throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+        "cufftXtSetCallback (plan_bwd_batched, h_store_bwd_batch)");
+  }
+}
+*/
+#endif
+
+
+// Convolve npart parts of the input, dispatching to the complex or the real
+// implementation according to the type of the forward FFT
+void CUDA::ConvolutionEngine::perform (const dsp::TimeSeries* input, dsp::TimeSeries * output, unsigned npart)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngine::perform (" << npart << ")" << endl;
+
+  // with no parts there is nothing to do
+  if (npart > 0)
+  {
+    if (type_fwd != CUFFT_C2C)
+      perform_real (input, output, npart);
+    else
+      perform_complex (input, output, npart);
+  }
+}
+
+// Convolve complex-sampled input, one channel and polarisation at a time.
+//
+//   input   time series read through get_datptr (ichan, ipol)
+//   output  time series written through get_datptr (ichan, ipol)
+//   npart   number of forward transforms to process per channel/polarisation
+//
+// Parts are processed nbatch at a time with the batched plans; the remainder
+// (or every part, when nbatch is 0) goes through the single-transform plans.
+// Without cuFFT callbacks each step is: forward FFT into buf, multiply by the
+// channel's kernel, in-place inverse FFT, copy to output.  With
+// HAVE_CUFFT_CALLBACKS only the two FFTs are issued here and the inverse FFT
+// writes directly to output (presumably the callbacks do the rest).
+// Throws CUFFTError if any cuFFT execution fails.
+void CUDA::ConvolutionEngine::perform_complex (const dsp::TimeSeries* input, 
+                                               dsp::TimeSeries * output,
+                                               unsigned npart)
+{
+  const unsigned npol = input->get_npol();
+  const unsigned nchan = input->get_nchan();
+
+  cufftComplex * in;
+  cufftComplex * out;
+  cufftResult result;
+
+  // pointer advance, in complex samples, after one batched transform
+  const unsigned in_step_batch  = nsamp_step * nbatch;
+  const unsigned out_step_batch = nsamp_step * nbatch;
+
+  // number of full batches; zero disables the batched loop entirely
+  unsigned nbp = 0;
+  if (nbatch > 0)
+    nbp = npart / nbatch;
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngine::perform_complex npart=" << npart 
+         << " nbatch=" << nbatch 
+         << " npb=" << nbp << " nsamp_step=" << nsamp_step << endl;
+
+#if !HAVE_CUFFT_CALLBACKS
+  // NOTE(review): CUDA rejects a launch whose grid has a zero dimension, so
+  // the batched k_ncopy_conv launch below cannot run with z == 0 (nor with
+  // y == nbatch == 0).  Only blocks.x is used by the single-part launch.
+  // Confirm the intended grid before changing it.
+  dim3 blocks = dim3 (nsamp_step, nbatch, 0);
+  if (nsamp_step % mp.get_nthread())
+    blocks.x++;
+#endif
+
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+
+#if HAVE_CUFFT_CALLBACKS
+    // determine convolution kernel offset
+    h_conv_params[0] = ichan * npt_bwd;
+
+    setup_callbacks_conv_params (h_conv_params, sizeof(unsigned), stream);
+
+/*
+    // update the channel offset in constant memory
+    cudaError_t error = cudaMemcpyToSymbolAsync (conv_params, (void *) &h_conv_params, 
+                          sizeof(unsigned), 0, cudaMemcpyHostToDevice, stream);
+    if (error != cudaSuccess)
+      throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel",
+                   "could not update conv_params in device memory");
+*/
+#else
+    // offset of this channel's kernel within d_kernels
+    const unsigned k_offset = ichan * npt_bwd;
+#endif
+
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      in  = (cufftComplex *) input->get_datptr (ichan, ipol);
+      out = (cufftComplex *) output->get_datptr (ichan, ipol);
+
+      // for each batched FFT
+      for (unsigned i=0; i<nbp; i++)
+      {
+        // perform forward batched FFT
+        result = cufftExecC2C (plan_fwd_batched, in, buf, CUFFT_FORWARD);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_fwd_batched)");
+
+#if HAVE_CUFFT_CALLBACKS
+        // perform the inverse batched FFT (out-of-place)
+        result = cufftExecC2C (plan_bwd_batched, buf, out, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_bwd_batched)");
+
+#else
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                        d_kernels + k_offset,
+                                                                        nbatch);
+
+        // perform the inverse batched FFT (in-place)
+        result = cufftExecC2C (plan_bwd_batched, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_bwd_batched)");
+
+        // copy batches of output from input
+        k_ncopy_conv<<<blocks,mp.get_nthread(),0,stream>>> (out, nsamp_step,
+                                                       buf + nfilt_pos, npt_bwd,
+                                                       out_step_batch);
+#endif
+
+        out += out_step_batch;
+        in  += in_step_batch;
+      }
+
+      // remaining parts that did not fill a batch, one transform at a time
+      for (unsigned ipart=nbp*nbatch; ipart<npart; ipart++)
+      {
+        result = cufftExecC2C (plan_fwd, in, buf, CUFFT_FORWARD);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_fwd)");
+
+#if HAVE_CUFFT_CALLBACKS
+        result = cufftExecC2C (plan_bwd, buf, out, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_bwd)");
+#else
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                   d_kernels + k_offset,
+                                                                   1);
+
+        // perform the inverse FFT (in-place); the error names this function
+        // and the single-transform plan that was actually executed
+        result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_complex",
+                            "cufftExecC2C(plan_bwd)");
+
+        // copy one part of output from the FFT buffer
+        k_ncopy_conv<<<blocks.x,mp.get_nthread(),0,stream>>> (out, nsamp_step,
+                                                         buf + nfilt_pos, npt_bwd,
+                                                         nsamp_step);
+#endif
+
+        in  += nsamp_step;
+        out += nsamp_step;
+      }
+    }
+  }
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream( "CUDA::ConvolutionEngine::perform_complex", stream );
+}
+
+// Convolve real-sampled input, one channel and polarisation at a time.
+//
+//   input   time series read through get_datptr (ichan, ipol) as cufftReal
+//   output  time series written through get_datptr (ichan, ipol) as cufftComplex
+//   npart   number of forward transforms to process per channel/polarisation
+//
+// Each step is: real-to-complex forward FFT into buf, multiply by the
+// channel's kernel, in-place complex inverse FFT, copy to output.  The input
+// advances by nsamp_step real samples per part and the output by
+// nsamp_step / 2 complex samples.  Parts are processed nbatch at a time with
+// the batched plans; the remainder (or every part, when nbatch is 0) goes
+// through the single-transform plans.
+// Throws CUFFTError if any cuFFT execution fails.
+void CUDA::ConvolutionEngine::perform_real(const dsp::TimeSeries* input,
+                                           dsp::TimeSeries * output,
+                                           unsigned npart)
+{
+  const unsigned npol = input->get_npol();
+  const unsigned nchan = input->get_nchan();
+
+  cufftReal * in;
+  cufftComplex * out;
+  cufftResult result;
+
+  const unsigned out_nsamp_step = nsamp_step / 2;
+
+  // pointer advance after one batched transform (real in, complex out)
+  const unsigned in_step_batch  = nsamp_step * nbatch;
+  const unsigned out_step_batch = out_nsamp_step * nbatch;
+
+  // number of full batches; zero disables the batched loop entirely
+  unsigned nbp = 0;
+  if (nbatch > 0)
+    nbp = npart / nbatch;
+
+  // NOTE(review): CUDA rejects a launch whose grid has a zero dimension, so
+  // the batched k_ncopy_conv launch below cannot run with z == 0 (nor with
+  // y == nbatch == 0).  Only blocks.x is used by the single-part launch.
+  // Confirm the intended grid before changing it.
+  dim3 blocks = dim3 (out_nsamp_step, nbatch, 0);
+  if (out_nsamp_step % mp.get_nthread())
+    blocks.x++;
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngine::perform_real nsamp_step=" << nsamp_step
+         << " npt_bwd=" << npt_bwd << endl;
+
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+    // offset of this channel's kernel within d_kernels
+    const unsigned k_offset = ichan * npt_bwd;
+
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      in  = (cufftReal *) input->get_datptr (ichan, ipol);
+      out = (cufftComplex *) output->get_datptr (ichan, ipol);
+
+      // for each batched FFT
+      for (unsigned i=0; i<nbp; i++)
+      {
+        // perform forward batched FFT (real-to-complex)
+        result = cufftExecR2C (plan_fwd_batched, in, buf);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_real",
+                            "cufftExecR2C(plan_fwd_batched)");
+
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                   d_kernels + k_offset,
+                                                                   nbatch);
+
+        // perform the inverse batched FFT (in-place)
+        result = cufftExecC2C (plan_bwd_batched, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_real",
+                            "cufftExecC2C(plan_bwd_batched)");
+
+        // copy batches of output from input
+        k_ncopy_conv<<<blocks,mp.get_nthread(),0,stream>>> (out, out_nsamp_step,
+                                                       buf + nfilt_pos, npt_bwd,
+                                                       out_step_batch);
+
+        in  += in_step_batch;
+        out += out_step_batch;
+      }
+
+      // remaining parts that did not fill a batch, one transform at a time
+      for (unsigned ipart=nbp*nbatch; ipart<npart; ipart++)
+      {
+        result = cufftExecR2C (plan_fwd, in, buf);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_real",
+                            "cufftExecR2C(plan_fwd)");
+
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                   d_kernels + k_offset,
+                                                                   1);
+
+        // perform the inverse FFT (in-place); the error names this function
+        // and the single-transform plan that was actually executed
+        result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_real",
+                            "cufftExecC2C(plan_bwd)");
+
+        // copy one part of output from the FFT buffer
+        // NOTE(review): the last argument is the batch step even though a
+        // single part is processed; perform_complex passes the single-part
+        // step in the same position -- confirm which is intended.
+        k_ncopy_conv<<<blocks.x,mp.get_nthread(),0,stream>>> (out, out_nsamp_step,
+                                                         buf + nfilt_pos, npt_bwd,
+                                                         out_step_batch);
+        in  += nsamp_step;
+        out += out_nsamp_step;
+      }
+    }
+  }
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream( "CUDA::ConvolutionEngine::perform_real", stream );
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDASpectral.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDASpectral.cu
--- bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDASpectral.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDASpectral.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,618 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/ConvolutionCUDASpectral.h"
+#include "CUFFTError.h"
+#include "debug.h"
+
+#if HAVE_CUFFT_CALLBACKS
+#include "dsp/ConvolutionCUDACallbacks.h"
+#include <cufftXt.h>
+#endif
+
+#include <iostream>
+#include <cassert>
+
+using namespace std;
+
+void check_error_stream (const char*, cudaStream_t);
+
+// Multiply every channel's spectrum in d_fft by the matching channel of the
+// kernel; both arrays hold npt_bwd complex points per channel.
+// ichan   == blockIdx.y
+// ipt_bwd == blockIdx.x * blockDim.x + threadIdx.x
+// The grid is rounded up in x when npt_bwd is not a multiple of blockDim.x,
+// so threads with ipt_bwd >= npt_bwd must do nothing: otherwise they would
+// multiply points of the next channel a second time and, for the last
+// channel, access memory past the end of both arrays.
+__global__ void k_multiply_conv_spectral (float2* d_fft, const __restrict__ float2 * kernel, unsigned npt_bwd)
+{
+  const unsigned ipt_bwd = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (ipt_bwd >= npt_bwd)
+    return;
+
+  const unsigned idx = (blockIdx.y * npt_bwd) + ipt_bwd;
+  d_fft[idx] = cuCmulf(d_fft[idx], kernel[idx]);
+}
+
+// Copy the retained points of every channel from the FFT buffer to the output.
+// ichan == blockIdx.y
+// ipt_bwd == blockIdx.x * blockDim.x + threadIdx.x
+// Input point ipt_bwd of a channel lands on output sample ipt_bwd - nfilt_pos;
+// the first nfilt_pos points, and any point that would land at or beyond
+// output sample nsamp_step, are not copied.  Channels are istride (input)
+// and ostride (output) elements apart.
+__global__ void k_ncopy_conv_spectral (float2* output_data, uint64_t ostride,
+           const float2* input_data, uint64_t istride,
+           unsigned nfilt_pos, unsigned nsamp_step)
+{
+  const unsigned ipt = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (ipt >= nfilt_pos)
+  {
+    const unsigned opt = ipt - nfilt_pos;
+
+    if (opt < nsamp_step)
+    {
+      // start of this channel in each array
+      const uint64_t chan_in  = istride * blockIdx.y;
+      const uint64_t chan_out = ostride * blockIdx.y;
+
+      output_data[chan_out + opt] = input_data[chan_in + ipt];
+    }
+  }
+}
+
+// Construct an engine bound to the given CUDA stream.  The sizes and device
+// pointers set here start out as zero, the FFT is marked as not configured,
+// and two empty cuFFT plan handles are created.
+// Throws CUFFTError if either plan handle cannot be created.
+CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral (cudaStream_t _stream)
+{
+  stream = _stream;
+
+  // no transform geometry is known yet
+  fft_configured = false;
+  nchan = 0;
+  npt_fwd = 0;
+  npt_bwd = 0;
+
+  // no device memory is held yet
+  work_area = 0;
+  work_area_size = 0;
+  buf = 0;
+  d_kernels = 0;
+
+  // create plan handles
+  cufftResult status = cufftCreate (&plan_fwd);
+  if (status != CUFFT_SUCCESS)
+    throw CUFFTError (status, "CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral", 
+                      "cufftCreate(plan_fwd)");
+
+  status = cufftCreate (&plan_bwd);
+  if (status != CUFFT_SUCCESS)
+    throw CUFFTError (status, "CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral", 
+                      "cufftCreate(plan_bwd)");
+}
+
+// Release the two cuFFT plans and the device allocations held by the engine:
+// work_area, buf and d_kernels (the kernel copy allocated in setup_kernel,
+// which was previously never freed).
+// Failures are reported on stderr instead of being thrown: an exception
+// leaving a destructor terminates the program when it is raised during stack
+// unwinding, and always does so under the C++11 implicit noexcept rule.
+CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral()
+{
+  cufftResult result;
+
+  result = cufftDestroy (plan_fwd);
+  if (result != CUFFT_SUCCESS)
+    cerr << "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral"
+         << " cufftDestroy(plan_fwd) failed: code=" << (int) result << endl;
+
+  result = cufftDestroy (plan_bwd);
+  if (result != CUFFT_SUCCESS)
+    cerr << "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral"
+         << " cufftDestroy(plan_bwd) failed: code=" << (int) result << endl;
+
+  if (work_area)
+  {
+    cudaError_t error = cudaFree (work_area);
+    if (error != cudaSuccess)
+      cerr << "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral"
+           << " cudaFree(work_area) failed: " << cudaGetErrorString (error) << endl;
+    work_area = 0;
+  }
+
+  if (buf)
+  {
+    cudaError_t error = cudaFree (buf);
+    if (error != cudaSuccess)
+      cerr << "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral"
+           << " cudaFree(buf) failed: " << cudaGetErrorString (error) << endl;
+    buf = 0;
+  }
+
+  // the plans that may reference d_kernels have been destroyed above
+  if (d_kernels)
+  {
+    cudaError_t error = cudaFree (d_kernels);
+    if (error != cudaSuccess)
+      cerr << "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral"
+           << " cudaFree(d_kernels) failed: " << cudaGetErrorString (error) << endl;
+    d_kernels = 0;
+  }
+}
+
+// Discard both cuFFT plans and replace them with fresh, empty handles so that
+// setup_batched can configure them again.
+// Throws CUFFTError if a plan cannot be destroyed or re-created; these
+// results used to be ignored, which deferred the failure to a later cuFFT call.
+void CUDA::ConvolutionEngineSpectral::regenerate_plans()
+{
+  cufftResult result;
+
+  result = cufftDestroy (plan_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::regenerate_plans",
+                      "cufftDestroy(plan_fwd)");
+
+  result = cufftCreate (&plan_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::regenerate_plans",
+                      "cufftCreate(plan_fwd)");
+
+  result = cufftDestroy (plan_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::regenerate_plans",
+                      "cufftDestroy(plan_bwd)");
+
+  result = cufftCreate (&plan_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::regenerate_plans",
+                      "cufftCreate(plan_bwd)");
+}
+
+// Record the supplied scratch pointer, viewed as an array of cufftComplex.
+// The pointer is only stored here; nothing is allocated or copied.
+void CUDA::ConvolutionEngineSpectral::set_scratch (void * scratch)
+{
+  d_scratch = static_cast<cufftComplex *> (scratch);
+}
+
+// Cache the transform geometry from the convolution and its response, choose
+// the forward FFT type from the input state, upload the kernel to the device
+// and reset the launch configuration.  The FFT plans are marked as needing
+// configuration, which perform() carries out on its next call.
+void CUDA::ConvolutionEngineSpectral::prepare (dsp::Convolution * convolution)
+{
+  const dsp::Response* response = convolution->get_response();
+
+  nchan = response->get_nchan();
+  npt_bwd = response->get_ndat();
+  npt_fwd = convolution->get_minimum_samples();
+  nsamp_overlap = convolution->get_minimum_samples_lost();
+
+  // samples kept from each forward transform
+  nsamp_step = npt_fwd - nsamp_overlap;
+
+  nfilt_pos = response->get_impulse_pos ();
+  nfilt_neg = response->get_impulse_neg ();
+
+  // Nyquist-sampled (real) input needs a real-to-complex forward transform
+  const bool real_input = (convolution->get_input()->get_state() == Signal::Nyquist);
+  type_fwd = real_input ? CUFFT_R2C : CUFFT_C2C;
+
+  // configure the dedispersion kernel
+  setup_kernel (response);
+
+  fft_configured = false;
+
+  // initialize the kernel size configuration
+  mp.init();
+  mp.set_nelement (npt_bwd);
+}
+
+// setup the convolution kernel based on the response
+//
+// Allocates d_kernels on the device, sized for ndat complex points in each of
+// the response's nchan channels, and fills it with a single copy that starts
+// at response->get_datptr (0,0).  The copy is asynchronous on the engine's
+// stream when one is set and synchronous otherwise.
+// Must only be called while d_kernels is still unallocated (asserted).
+// Throws Error (InvalidState) if the allocation or the copy fails.
+// NOTE(review): the single copy presumes the response stores all channels
+// contiguously -- confirm against dsp::Response.
+void CUDA::ConvolutionEngineSpectral::setup_kernel (const dsp::Response * response)
+{
+  unsigned nchan = response->get_nchan();
+  unsigned ndat = response->get_ndat();
+  unsigned ndim = response->get_ndim();
+
+  assert (ndim == 2);
+  assert (d_kernels == 0);
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::setup_kernel response: "
+         << "nchan=" << nchan << " ndat=" << ndat << " ndim=" << ndim << endl;
+
+  // allocate memory for dedispersion kernel of all channels; the byte count
+  // is kept in size_t so that kernels of 4 GiB or more are not truncated
+  const size_t kernels_size = sizeof(cufftComplex) * ndat * nchan;
+  cudaError_t error = cudaMalloc ((void**)&d_kernels, kernels_size);
+  if (error != cudaSuccess)
+  {
+    throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel",
+     "could not allocate device memory for dedispersion kernel");
+  }
+
+  // copy all kernels from host to device
+  const float* kernel = response->get_datptr (0,0);
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::setup_kernel cudaMemcpy stream=" 
+         << stream << " size=" << kernels_size << endl;
+  if (stream)
+    error = cudaMemcpyAsync (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice, stream);
+  else
+    error = cudaMemcpy (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice);
+  if (error != cudaSuccess)
+  {
+    throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel",
+     "could not copy dedispersion kernel to device");
+  }
+
+#if HAVE_CUFFT_CALLBACKS
+  // NOTE(review): cudaMallocHost expects the address of the pointer to fill
+  // in; here the value of h_conv_params itself is cast to void**.  The
+  // declaration of h_conv_params is not visible from this file, so verify
+  // this call (and the sizeof below) against the header.
+  error = cudaMallocHost ((void **) h_conv_params, sizeof(unsigned) * 2);
+  if (error != cudaSuccess)
+    throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel",
+                 "could not allocate memory for h_conv_params");
+
+  // first and one-past-last retained point of each backward transform
+  h_conv_params[0] = nfilt_pos;
+  h_conv_params[1] = npt_bwd - nfilt_neg;
+  setup_callbacks_conv_params_spectral (h_conv_params, sizeof (h_conv_params), stream);
+#endif
+}
+
+// configure the batched FFT plans
+//
+// Builds plan_fwd and plan_bwd so that one cuFFT call transforms all nchan
+// channels of a polarisation, attaches both plans to the engine's stream and
+// (re)allocates buf to hold npt_bwd complex points per channel.
+//
+//   input   supplies nchan, npol, ndim and the distance between channels
+//   output  supplies the distance between output channels
+//
+// The forward plan reads channels input_stride samples apart and writes them
+// npt_bwd apart into buf.  With cuFFT callbacks the backward plan writes
+// channels output_stride apart; without them it is a packed in-place layout.
+// Throws CUFFTError on cuFFT failures and Error (FailedCall) on CUDA memory
+// failures.
+// NOTE(review): the strides are measured between channel 0 and channel 1, so
+// this presumably requires nchan >= 2 -- confirm against dsp::TimeSeries.
+void CUDA::ConvolutionEngineSpectral::setup_batched (const dsp::TimeSeries* input,
+                                                     dsp::TimeSeries * output)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::setup_batched npt_fwd=" << npt_fwd 
+         << " npt_bwd=" << npt_bwd << endl;
+
+  nchan = input->get_nchan();
+  npol  = input->get_npol();
+  unsigned ndim = input->get_ndim();
+
+#ifdef _DEBUG
+  cerr << "CUDA::ConvolutionEngineSpectral::setup_batched nchan=" << nchan 
+       << " npol=" << npol << " ndat=" << input->get_ndat() << endl;
+#endif
+
+  // distance between consecutive channels (pointer difference divided by ndim)
+  input_stride = (input->get_datptr (1, 0) - input->get_datptr (0, 0)) / ndim;
+  output_stride = (output->get_datptr (1, 0) - output->get_datptr (0, 0) ) / ndim;
+
+  int rank = 1; 
+  int inembed[1];
+  int onembed[1];
+  int istride, ostride, idist, odist;
+  cufftResult result;
+
+  // now setup the forward batched plan
+  size_t work_size_fwd, work_size_bwd;
+
+  // complex layout plans for input
+  inembed[0] = npt_fwd;
+  onembed[0] = npt_bwd;
+
+  istride = 1;
+  ostride = 1;
+
+  idist = (int) input_stride;
+  odist = npt_bwd;
+
+#ifdef _DEBUG
+  cerr << "CUDA::ConvolutionEngineSpectral::setup_batched npt_fwd=" << npt_fwd 
+       << " nbatch=" << nchan << endl;
+  cerr << "CUDA::ConvolutionEngineSpectral::setup_batched input_stride=" 
+       << input_stride << " output_stride=" << output_stride << endl;
+#endif
+
+  // setup forward fft
+  result = cufftMakePlanMany (plan_fwd, rank, &npt_fwd, 
+                              inembed, istride, idist,
+                              onembed, ostride, odist,
+                              type_fwd, nchan, &work_size_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", 
+                      "cufftMakePlanMany (plan_fwd)");
+
+  result = cufftSetStream (plan_fwd, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+          "cufftSetStream(plan_fwd)");
+
+  // get a rough estimate on work buffer size
+  work_size_fwd = 0;
+  result = cufftEstimateMany(rank, &npt_fwd, 
+                             inembed, istride, idist, 
+                             onembed, ostride, odist, 
+                             type_fwd, nchan, &work_size_fwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                      "cufftEstimateMany(plan_fwd)");
+
+  istride = 1;
+  ostride = 1;
+
+  // tested with #if, as everywhere else in this file, so that a build that
+  // defines HAVE_CUFFT_CALLBACKS as 0 gets the layout matching the in-place
+  // inverse transform used by perform_complex
+#if HAVE_CUFFT_CALLBACKS
+  inembed[0] = npt_bwd;
+  onembed[0] = nsamp_step;
+
+  idist = npt_bwd;
+  odist = (int) output_stride;
+#else
+  inembed[0] = npt_bwd;
+  onembed[0] = npt_bwd;
+
+  idist = npt_bwd;
+  odist = npt_bwd;
+#endif
+
+  // the backward FFT has a simple layout (npt_bwd)
+  DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftMakePlanMany (plan_bwd)");
+  result = cufftMakePlanMany (plan_bwd, rank, &npt_bwd, 
+                              inembed, istride, idist,
+                              onembed, ostride, odist,
+                              CUFFT_C2C, nchan, &work_size_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", 
+                      "cufftMakePlanMany (plan_bwd)");
+
+  result = cufftSetStream (plan_bwd, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                      "cufftSetStream(plan_bwd)");
+
+  DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched bwd FFT plan set");
+
+  work_size_bwd = 0;
+  result = cufftEstimateMany(rank, &npt_bwd, 
+                             inembed, istride, idist, 
+                             onembed, ostride, odist, 
+                             CUFFT_C2C, nchan, &work_size_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                      "cufftEstimateMany(plan_bwd)");
+  
+/*
+  work_area_size = (work_size_fwd > work_size_bwd) ? work_size_fwd : work_size_bwd;
+  auto_allocate = work_area_size > 0;
+
+  DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftSetAutoAllocation(plan_fwd)");
+  result = cufftSetAutoAllocation(plan_fwd, auto_allocate);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                      "cufftSetAutoAllocation(plan_bwd, %d)", 
+                      auto_allocate);
+
+  DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftSetAutoAllocation(plan_bwd)");
+  result = cufftSetAutoAllocation(plan_bwd, auto_allocate);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                      "cufftSetAutoAllocation(plan_bwd, %d)", auto_allocate);
+
+*/
+  // free any buffer left over from a previous configuration; buf is cleared
+  // before the call so that it is never left dangling (and later freed a
+  // second time by the destructor) if this function throws
+  cudaError_t error;
+  if (buf)
+  {
+    void * stale = (void *) buf;
+    buf = 0;
+    error = cudaFree (stale);
+    if (error != cudaSuccess)
+      throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                   "cudaFree(%p): %s", stale, cudaGetErrorString (error));
+  }
+
+  // sizeof comes first so that the whole product is evaluated in size_t
+  size_t batched_buffer_size = sizeof (cufftComplex) * npt_bwd * nchan;
+  error = cudaMalloc ((void **) &buf, batched_buffer_size);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_batched",
+                 "cudaMalloc(%p, %lu): %s", (void *) &buf,
+                 (unsigned long) batched_buffer_size,
+                 cudaGetErrorString (error));
+
+  // explicit work-area allocation is currently disabled
+/*
+  if (work_area_size > 0)
+  {
+    if (work_area)
+    {
+      error = cudaFree (work_area);
+      if (error != cudaSuccess)
+         throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup",
+                     "cudaFree(%xu): %s", &work_area,
+                     cudaGetErrorString (error));
+    }
+    DEBUG("CUDA::ConvolutionEngineSpectral::setup cudaMalloc("<<work_area<<", "<<work_area_size<<")");
+    error = cudaMalloc (&work_area, work_area_size);  
+    if (error != cudaSuccess)
+      throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup", 
+                   "cudaMalloc(%x, %u): %s", &work_area, work_area_size,
+                   cudaGetErrorString (error));
+  }
+  else
+    work_area = 0;
+*/
+}
+
+// Convolve npart parts of the input into the output.
+// The cuFFT plans are rebuilt whenever they are not configured yet or the
+// channel stride of either time series differs from the stride recorded by
+// the last setup_batched call.  Only complex (C2C) input is processed; for
+// any other forward type a "not implemented" message is printed and nothing
+// else is done.
+void CUDA::ConvolutionEngineSpectral::perform (const dsp::TimeSeries* input, dsp::TimeSeries * output, unsigned npart)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::perform (" << npart << ")" << endl;
+
+  if (npart == 0)
+    return;
+
+  // current distance between channel 0 and channel 1 of each time series
+  const uint64_t curr_istride = (input->get_datptr (1, 0) - input->get_datptr (0, 0)) / input->get_ndim();
+  const uint64_t curr_ostride = (output->get_datptr (1, 0) - output->get_datptr (0, 0)) / output->get_ndim();
+
+  if (dsp::Operation::verbose)
+  {
+    cerr << "CUDA::ConvolutionEngineSpectral::perform istride prev=" << input_stride
+         << " curr=" << curr_istride << " ndim=" << input->get_ndim() << endl;
+    cerr << "CUDA::ConvolutionEngineSpectral::perform ostride prev=" << output_stride
+         << " curr=" << curr_ostride << " ndim=" << output->get_ndim() << endl;
+  }
+
+  // a change in either stride invalidates the layout baked into the plans
+  const bool layout_changed = (curr_istride != input_stride)
+                           || (curr_ostride != output_stride);
+  if (layout_changed)
+  {
+    if (dsp::Operation::verbose)
+      cerr << "CUDA::ConvolutionEngineSpectral::perform reconfiguring FFT batch sizes" << endl;
+    fft_configured = false;
+  }
+
+  if (!fft_configured)
+  {
+    regenerate_plans ();
+    setup_batched (input, output);
+#if HAVE_CUFFT_CALLBACKS
+    cerr << "CUDA::ConvolutionEngineSpectral::perform setup_callbacks_ConvolutionCUDASpectral()" << endl;
+    setup_callbacks_ConvolutionCUDASpectral (plan_fwd, plan_bwd, d_kernels, stream);
+#endif
+    fft_configured = true;
+  }
+
+  if (type_fwd != CUFFT_C2C)
+  {
+    // real-valued input has no implementation in this engine
+    cerr << "CUDA::ConvolutionEngineSpectral::perform_real not implemented" << endl;
+    //perform_real (input, output, npart);
+    return;
+  }
+
+  perform_complex (input, output, npart);
+}
+
+// Convolve complex-sampled input with all channels of a polarisation handled
+// by one batched forward FFT and one batched inverse FFT per part.
+//
+//   input   time series read from get_datptr (0, 0) onwards
+//   output  time series written from get_datptr (0, 0) onwards
+//   npart   number of parts; both pointers advance nsamp_step samples per part
+//
+// Without cuFFT callbacks each step is: forward FFT into buf, multiply by the
+// kernels, in-place inverse FFT, copy the retained points to the output.
+// With HAVE_CUFFT_CALLBACKS only the two FFTs are issued here and the inverse
+// FFT writes directly to the output.
+// Throws CUFFTError if a cuFFT execution fails.
+void CUDA::ConvolutionEngineSpectral::perform_complex (const dsp::TimeSeries* input, 
+                                                       dsp::TimeSeries * output,
+                                                       unsigned npart)
+{
+  const unsigned npol = input->get_npol();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+
+  // distance between polarisations, taken as the channel stride over npol
+  const uint64_t in_pol_step  = input_stride / npol;
+  const uint64_t out_pol_step = output_stride / npol;
+
+  cufftResult status;
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::perform_complex npart=" << npart 
+         << " nsamp_step=" << nsamp_step << endl;
+
+#if !HAVE_CUFFT_CALLBACKS
+  // one thread per point of the backward transform, one grid row per channel
+  unsigned nthreads = mp.get_nthread();
+  dim3 blocks = dim3 (1, nchan);
+
+  if (npt_bwd <= nthreads) 
+    nthreads = npt_bwd;
+  else
+    blocks.x = (npt_bwd / nthreads) + ((npt_bwd % nthreads) ? 1 : 0);
+#endif
+
+  cufftComplex * part_in  = (cufftComplex *) input->get_datptr (0, 0);
+  cufftComplex * part_out = (cufftComplex *) output->get_datptr (0, 0);
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::perform_complex in=" << part_in << " out=" << part_out << endl;
+
+  for (unsigned ipart=0; ipart<npart; ipart++)
+  {
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      cufftComplex * in  = part_in  + (ipol * in_pol_step);
+      cufftComplex * out = part_out + (ipol * out_pol_step);
+
+      // perform nchan batched forward FFTs for the current ipol and ipart
+      status = cufftExecC2C (plan_fwd, in, buf, CUFFT_FORWARD);
+      if (status != CUFFT_SUCCESS)
+        throw CUFFTError (status, "CUDA::ConvolutionEngineSpectral::perform_complex",
+                          "cufftExecC2C(plan_fwd)");
+
+#if HAVE_CUFFT_CALLBACKS
+      // perform the inverse batched FFT (out-of-place)
+      status = cufftExecC2C (plan_bwd, buf, out, CUFFT_INVERSE);
+      if (status != CUFFT_SUCCESS)
+        throw CUFFTError (status, "CUDA::ConvolutionEngineSpectral::perform_complex",
+                            "cufftExecC2C(plan_bwd)");
+
+#else
+      // multiply by the dedispersion kernel
+      k_multiply_conv_spectral<<<blocks, nthreads, 0, stream>>> (buf, d_kernels, npt_bwd);
+
+      // perform the inverse batched FFT (in-place)
+      status = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE);
+      if (status != CUFFT_SUCCESS)
+        throw CUFFTError (status, "CUDA::ConvolutionEngineSpectral::perform_complex",
+                          "cufftExecC2C(plan_bwd)");
+
+      // copy the retained points of every channel from buf to the output
+      k_ncopy_conv_spectral<<<blocks, nthreads, 0, stream>>> (out, output_stride,
+                                                              buf, npt_bwd,
+                                                              nfilt_pos, nsamp_step);
+#endif
+    }
+
+    // move on to the next part
+    part_in  += nsamp_step;
+    part_out += nsamp_step;
+  }
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream( "CUDA::ConvolutionEngineSpectral::perform_complex", stream );
+}
+
+#if 0
+// DISABLED: real-input convolution for the spectral engine.  This block is
+// excluded from compilation by the enclosing "#if 0"; perform() reports real
+// input as "not implemented" instead of calling it.
+// NOTE(review): the body uses nbatch, k_multiply_conv and k_ncopy_conv, which
+// are not defined in the visible part of this file (only the *_spectral
+// kernels are), and it loops per channel with plan_fwd / plan_bwd although
+// setup_batched configures those plans to batch over all channels.  It
+// presumably needs rework before it can be enabled.
+void CUDA::ConvolutionEngineSpectral::perform_real(const dsp::TimeSeries* input,
+                                           dsp::TimeSeries * output,
+                                           unsigned npart)
+{
+  const unsigned npol = input->get_npol();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim = input->get_ndim();
+
+  cufftReal * in;
+  cufftComplex * out;
+  cufftResult result;
+
+  // output advances by half as many (complex) samples as the real input
+  const unsigned out_nsamp_step = nsamp_step / 2;
+
+  const unsigned in_step_batch  = nsamp_step * nbatch;
+  const unsigned out_step_batch = out_nsamp_step * nbatch;
+
+  // number of full batches; zero disables the batched loop
+  unsigned nbp = 0;
+  if (nbatch > 0)
+    nbp = npart / nbatch;
+
+  dim3 blocks = dim3 (out_nsamp_step, nbatch, 0);
+  if (out_nsamp_step % mp.get_nthread())
+    blocks.x++;
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::ConvolutionEngineSpectral::perform_real nsamp_step=" << nsamp_step
+         << " npt_bwd=" << npt_bwd << endl;
+
+  for (unsigned ichan=0; ichan<nchan; ichan++)
+  {
+    // offset of this channel's kernel within d_kernels
+    const unsigned k_offset = ichan * npt_bwd;
+
+    for (unsigned ipol=0; ipol<npol; ipol++)
+    {
+      in  = (cufftReal *) input->get_datptr (ichan, ipol);
+      out = (cufftComplex *) output->get_datptr (ichan, ipol);
+
+      // for each batched FFT
+      for (unsigned i=0; i<nbp; i++)
+      {
+        // perform forward batched FFT
+        result = cufftExecR2C (plan_fwd, in, buf);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform_real",
+                            "cufftExecC2C(plan_fwd)");
+
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                   d_kernels + k_offset,
+                                                                   nbatch);
+
+        // perform the inverse batched FFT (in-place)
+        result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform_real",
+                            "cufftExecC2C(plan_bwd)");
+
+        // copy batches of output from input
+        k_ncopy_conv<<<blocks,mp.get_nthread(),0,stream>>> (out, out_nsamp_step,
+                                                       buf + nfilt_pos, npt_bwd,
+                                                       out_step_batch);
+
+        in  += in_step_batch;
+        out += out_step_batch;
+      }
+
+      // remaining parts that did not fill a batch
+      for (unsigned ipart=nbp*nbatch; ipart<npart; ipart++)
+      {
+        result = cufftExecR2C (plan_fwd, in, buf);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform_real",
+                            "cufftExecC2C(plan_fwd)");
+
+        // multiply by the dedispersion kernel
+        k_multiply_conv<<<mp.get_nblock(),mp.get_nthread(),0,stream>>> (buf,
+                                                                   d_kernels + k_offset,
+                                                                   1);
+
+        // perform the inverse batched FFT (in-place)
+        result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE);
+        if (result != CUFFT_SUCCESS)
+          throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform",
+                            "cufftExecC2C(plan_bwd)");
+
+        // copy batches of output from input
+        k_ncopy_conv<<<blocks.x,mp.get_nthread(),0,stream>>> (out, out_nsamp_step,
+                                                         buf + nfilt_pos, npt_bwd,
+                                                         out_step_batch);
+        in  += nsamp_step;
+        out += out_nsamp_step;
+      }
+    }
+  }
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error_stream( "CUDA::ConvolutionEngineSpectral::perform_real", stream );
+}
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/cross_detect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/cross_detect.h
--- bl-dspsr-0+git20160405/Signal/General/cross_detect.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/cross_detect.h	2018-03-12 23:02:35.000000000 +0000
@@ -4,10 +4,7 @@
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/cross_detect.h,v $
-   $Revision: 1.1 $
-   $Date: 2006/10/15 18:56:39 $
-   $Author: straten $ */
+// dspsr/Signal/General/cross_detect.h
 
 #ifndef __cross_detect_h
 #define __cross_detect_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/cufft_callback_bench.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/cufft_callback_bench.cu
--- bl-dspsr-0+git20160405/Signal/General/cufft_callback_bench.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/cufft_callback_bench.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,523 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include <cuda_fp16.h>
+#include <cufftXt.h>
+
+#include "CUFFTError.h"
+#include "CommandLine.h"
+#include "RealTimer.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+
+__global__ void k_unpack (cuFloatComplex * output, const __restrict__ char2 * input, const float scale)
+{
+  // one thread per complex sample; no bounds check is made on the index
+  const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
+  const char2 packed = input[idx];
+  output[idx] = make_cuComplex (float(packed.x) / scale, float(packed.y) / scale);
+}
+
+__global__ void k_multiply (float2* d_fft, const __restrict__ float2 * kernel, unsigned npart)
+{
+  // the launch geometry (threads per block * blocks) sets the part stride
+  const unsigned stride = blockDim.x * gridDim.x;
+  const unsigned first = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // each thread re-uses a single kernel coefficient for every part
+  const float2 coeff = kernel[first];
+
+  // complex multiply in place: d_fft[pos] *= coeff
+  for (unsigned pos = first; pos < stride * npart; pos += stride)
+  {
+    const float2 value = d_fft[pos];
+    d_fft[pos] = make_float2 (value.x * coeff.x - value.y * coeff.y,
+                              value.x * coeff.y + value.y * coeff.x);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with multiplication by dedispersion kernel
+//
+__device__ void CB_convolve_and_storeC (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  // scale the FFT output element by the matching entry of the callerInfo array
+  const cufftComplex * coefficients = static_cast<const cufftComplex *> (callerInfo);
+  static_cast<cufftComplex *> (dataOut)[offset] = cuCmulf (d, coefficients[offset]);
+}
+
+__device__ cufftCallbackStoreC d_store_fwd_C = CB_convolve_and_storeC;
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// convert an 8bit number to 32 bit
+//
+__device__ cufftComplex cufft_callback_load_8bit(
+    void *dataIn, 
+    size_t offset, 
+    void *callerInfo, 
+    void *sharedPtr) 
+{
+  // cuFFT load callback: dataIn is read as an array of char2 (8-bit real and
+  // imaginary parts) and element `offset` is returned as a single-precision
+  // complex value.  callerInfo and sharedPtr are not used.
+  // NOTE(review): __restrict__ on a by-value local looks like a leftover
+  // from a pointer declaration -- confirm it can be dropped.
+  const __restrict__ char2 in = ((char2 *)dataIn)[offset];
+
+  // fixed divisor applied to both components; Speed::runTest passes the same
+  // value (127.0f) as the scale argument of k_unpack.
+  const float scale = 127.0f;
+  return make_cuComplex ((float)in.x/scale, (float) in.y/scale);
+}
+__device__ cufftCallbackLoadC d_load_8bit_fwd_C = cufft_callback_load_8bit;
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// convert a 16-bit number to 32-bit
+//
+__device__ cufftComplex cufft_callback_load_half2(
+    void *dataIn,
+    size_t offset,
+    void *callerInfo,
+    void *sharedPtr)
+{
+  const half * pair = static_cast<const half *> (dataIn) + 2*offset;  // (re, im) halves of this sample
+  return make_cuComplex (__half2float (pair[0]), __half2float (pair[1]));
+}
+
+__device__ cufftCallbackLoadC d_load_half2_fwd_C = cufft_callback_load_half2;
+
+
+/////////////////////////////////////////////////////////////////////////
+//
+// store with output filtering on
+//
+__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr)
+{
+  unsigned nfilt_pos = ((unsigned *) callerInfo)[0];   // first word of the callerInfo array
+  unsigned nsamp_filt = ((unsigned *) callerInfo)[1];  // second word of the callerInfo array
+  // store d at index (offset - nfilt_pos), but only when that index lies strictly between 0 and nsamp_filt
+  offset -= nfilt_pos;  // size_t is unsigned: this wraps to a large value when offset < nfilt_pos
+  if ((offset > 0) && (offset < nsamp_filt))  // NOTE(review): shifted index 0 is never stored -- confirm intended
+    ((cufftComplex*)dataOut)[offset] = d;
+}
+
+__device__ cufftCallbackStoreC d_store_bwd_C = CB_filtered_store;
+
+class Speed : public Reference::Able
+{
+public:
+
+  Speed ();  // sets the defaults: gpu_id=0, niter=16, npt=1024, cuda=false
+
+  // register the command line options and parse argv into the members below
+  void parseOptions (int argc, char** argv);
+
+  // time the callback and explicit-kernel FFT pipelines; results go to stderr
+  void runTest ();
+
+protected:
+
+  int npt;          // number of points in each FFT (-n)
+  int niter;        // number of iterations; used as the FFT batch count (-t)
+  unsigned gpu_id;  // GPU device ID (-d)
+  bool cuda;        // -cuda flag: when set, runTest selects gpu_id and creates a stream
+};
+
+
+Speed::Speed ()
+  : npt (1024),     // points per FFT
+    niter (16),     // iterations / batch count
+    gpu_id (0),     // first CUDA device
+    cuda (false)    // CUDA benchmark not requested
+{
+}
+
+int main(int argc, char** argv) try
+{
+  Speed speed;
+  speed.parseOptions (argc, argv);
+  speed.runTest ();
+  return 0;
+}
+catch (Error& error)
+{
+  cerr << error << endl;
+  return -1;
+}
+
+void Speed::parseOptions (int argc, char** argv)
+{
+  // bind each command line option to its member, then parse argv
+  CommandLine::Menu menu;
+
+  menu.set_help_header ("cufft_callback_bench - measure cuFFT callback speed");
+  menu.set_version ("cufft_callback_bench version 1.0");
+
+  CommandLine::Argument* arg = menu.add (npt, 'n', "npt");
+  arg->set_help ("number of points in each FFT");
+
+#if HAVE_CUFFT
+  arg = menu.add (gpu_id, 'd');
+  arg->set_help ("GPU device ID");
+#endif
+
+  arg = menu.add (niter, 't', "niter");
+  arg->set_help ("number of iterations (batch/loops)");
+
+#if HAVE_CUFFT
+  arg = menu.add (cuda, "cuda");
+  arg->set_help ("benchmark CUDA");
+#endif
+
+  menu.parse (argc, argv);
+}
+
+#if HAVE_CUFFT
+void check_error_stream (const char*, cudaStream_t);
+#endif
+
+// Time one forward + inverse batched FFT pass two ways and report both on
+// stderr: "CALLBACK" uses cuFFT load/store callbacks (8-bit load, kernel
+// multiply on store, filtered store on the inverse), while "BATCH" runs
+// the explicit k_unpack -> FFT -> k_multiply -> FFT -> copy pipeline.
+// Throws Error / CUFFTError when a checked CUDA or cuFFT call fails.
+void Speed::runTest ()
+{
+#ifdef _DEBUG
+  dsp::Operation::verbose = true;
+  dsp::Observation::verbose = true;
+#endif
+
+  // assume complex FFTs
+  const unsigned ndim = 2;
+
+  cudaStream_t stream = 0;
+  if (cuda)
+  {
+    cerr << "using GPU " << gpu_id << endl;
+    cudaError_t err = cudaSetDevice(gpu_id);
+    if (err != cudaSuccess)
+      throw Error (InvalidState, "Speed::runTest",
+                   "cudaSetDevice failed: %s", cudaGetErrorString(err));
+
+    err = cudaStreamCreate( &stream );
+    if (err != cudaSuccess)
+      throw Error (InvalidState, "Speed::runTest",
+                   "cudaStreamCreate failed: %s", cudaGetErrorString(err));
+  }
+
+  const unsigned ndat = npt * niter;
+  const size_t raw_size = ndat * ndim * sizeof(int8_t);
+  const size_t half2_size = ndat * ndim * sizeof(half);
+  const size_t unpacked_size = ndat * ndim * sizeof(float);
+  const size_t kernel_size = ndat * sizeof (cuFloatComplex); // the store callback indexes it by batch-wide offset
+
+  char2 * raw;
+  half2 * input_h2;
+  cufftComplex * input;
+  cufftComplex * buffer;
+  cufftComplex * output;
+  cufftComplex * d_kernel;
+  unsigned * d_offsets;
+  cufftResult result;
+  size_t work_size;
+
+  cudaMalloc ((void **) &raw, raw_size);
+  cudaMalloc ((void **) &input_h2, half2_size);
+  cudaMalloc ((void **) &input, unpacked_size);
+  cudaMalloc ((void **) &buffer, unpacked_size);
+  cudaMalloc ((void **) &output, unpacked_size);
+  cudaMalloc ((void **) &d_kernel, kernel_size);
+  cudaMalloc ((void **) &d_offsets, 2 * sizeof(unsigned));
+
+  cudaMemsetAsync ((void *) raw, 0, raw_size, stream);
+  cudaMemsetAsync ((void *) input, 0, unpacked_size, stream);
+  cudaMemsetAsync ((void *) input_h2, 0, half2_size, stream);
+  cudaMemsetAsync ((void *) d_kernel, 0, kernel_size, stream);
+
+  unsigned * h_offsets;
+  cudaMallocHost((void **) &h_offsets, 2 * sizeof(unsigned));
+  h_offsets[0] = (unsigned) (npt / 15);
+  h_offsets[1] = (unsigned) (npt / 15);
+
+  cudaMemcpyAsync ((void *) d_offsets, (void *) h_offsets, 2 * sizeof(unsigned), cudaMemcpyHostToDevice, stream);
+
+  // all plans are using batched FFTs to ensure at least 1M points
+
+  cufftHandle plan_batch;
+  result = cufftCreate (&plan_batch);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_batch)");
+
+  int rank = 1;
+  result = cufftMakePlanMany (plan_batch, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_batch)");
+
+  result = cufftSetStream (plan_batch, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_batch)");
+
+  cufftHandle plan_callback;
+  result = cufftCreate (&plan_callback);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_callback)");
+
+  result = cufftMakePlanMany (plan_callback, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_callback)");
+
+  result = cufftSetStream (plan_callback, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_callback)");
+
+  cufftHandle plan_half;
+  result = cufftCreate (&plan_half);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_half)");
+
+  result = cufftMakePlanMany (plan_half, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_half)");
+
+  result = cufftSetStream (plan_half, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_half)");
+
+  cufftHandle plan_bwd;
+  result = cufftCreate (&plan_bwd);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_bwd)");
+
+  result = cufftMakePlanMany (plan_bwd, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_bwd)");
+
+  result = cufftSetStream (plan_bwd, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_bwd)");
+
+  cufftHandle plan_bwd_cb;
+  result = cufftCreate (&plan_bwd_cb);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_bwd_cb)");
+
+  result = cufftMakePlanMany (plan_bwd_cb, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_bwd_cb)");
+
+  result = cufftSetStream (plan_bwd_cb, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_bwd_cb)");
+
+  RealTimer timer_batch;
+  RealTimer timer_callback;
+  RealTimer timer_half;
+
+  cufftCallbackLoadC  h_load_8bit_fwd_C;
+  cufftCallbackLoadC  h_load_half2_fwd_C;
+  cufftCallbackStoreC h_store_fwd_C;
+  cufftCallbackStoreC h_store_bwd_C;
+  cudaError_t error;
+
+  error = cudaMemcpyFromSymbolAsync(&h_load_8bit_fwd_C,
+                                    d_load_8bit_fwd_C,
+                                    sizeof(h_load_8bit_fwd_C),
+                                    0,
+                                    cudaMemcpyDeviceToHost,
+                                    stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "Speed::runTest",
+                 "cudaMemcpyFromSymbolAsync failed for h_load_8bit_fwd_C");
+
+  error = cudaMemcpyFromSymbolAsync(&h_load_half2_fwd_C,
+                                    d_load_half2_fwd_C,
+                                    sizeof(h_load_half2_fwd_C),
+                                    0,
+                                    cudaMemcpyDeviceToHost,
+                                    stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "Speed::runTest",
+                 "cudaMemcpyFromSymbolAsync failed for h_load_half2_fwd_C");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_fwd_C,
+                                    d_store_fwd_C,
+                                    sizeof(h_store_fwd_C),
+                                    0,
+                                    cudaMemcpyDeviceToHost,
+                                    stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "Speed::runTest",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_fwd_C");
+
+  error = cudaMemcpyFromSymbolAsync(&h_store_bwd_C,
+                                    d_store_bwd_C,
+                                    sizeof(h_store_bwd_C),
+                                    0,
+                                    cudaMemcpyDeviceToHost,
+                                    stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "Speed::runTest",
+                 "cudaMemcpyFromSymbolAsync failed for h_store_bwd_C");
+
+  result = cufftXtSetCallback (plan_callback,
+                               (void **)&h_load_8bit_fwd_C,
+                               CUFFT_CB_LD_COMPLEX,
+                               0);
+  if (result == CUFFT_LICENSE_ERROR)
+    throw CUFFTError (result, "Speed::runTest",
+                      "CUFFT Callback invalid license");
+  cerr << "result=" << result << endl;
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest",
+      "cufftXtSetCallback (plan_callback, h_load_8bit_fwd_C)");
+
+/*
+  result = cufftXtSetCallback (plan_half,
+                               (void **)&h_load_half2_fwd_C,
+                               CUFFT_CB_LD_COMPLEX,
+                               0);
+  if (result == CUFFT_LICENSE_ERROR)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+                      "CUFFT Callback invalid license");
+  cerr << "result=" << result << endl;
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks",
+      "cufftXtSetCallback (plan_fwd, h_load_half2_fwd_C)");
+*/
+
+  result = cufftXtSetCallback (plan_callback,
+                               (void **)&h_store_fwd_C,
+                               CUFFT_CB_ST_COMPLEX,
+                               (void **)&d_kernel);
+  if (result == CUFFT_LICENSE_ERROR)
+    throw CUFFTError (result, "Speed::runTest",
+                      "CUFFT Callback invalid license");
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest",
+      "cufftXtSetCallback (plan_callback, h_store_fwd_C)");
+
+  result = cufftXtSetCallback (plan_bwd_cb,
+                               (void **)&h_store_bwd_C,
+                               CUFFT_CB_ST_COMPLEX,
+                               (void **)&d_offsets);
+  if (result == CUFFT_LICENSE_ERROR)
+    throw CUFFTError (result, "Speed::runTest",
+                      "CUFFT Callback invalid license");
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest",
+      "cufftXtSetCallback (plan_bwd_cb, h_store_bwd_C)");
+
+  cudaStreamSynchronize (stream);
+/*
+  timer_half.start();
+
+  result = cufftExecC2C (plan_half, (cufftComplex *) input_h2, output, CUFFT_FORWARD);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_half)");
+  cudaStreamSynchronize(stream);
+
+  timer_half.stop();
+*/
+
+  timer_callback.start ();
+
+  result = cufftExecC2C (plan_callback, (cuFloatComplex *) raw, buffer, CUFFT_FORWARD);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_callback)");
+
+  result = cufftExecC2C (plan_bwd_cb, output, buffer, CUFFT_INVERSE);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_bwd_cb)");
+
+  cudaStreamSynchronize(stream);
+
+  timer_callback.stop ();
+
+  double total_time = timer_callback.get_elapsed();
+  double time_per_fft = total_time / niter;
+  double time_us = time_per_fft * 1e6;
+  cerr << "CALLBACK: total_time=" << total_time << " time_per_fft=" << time_per_fft 
+       << " time_us=" << time_us << endl;
+
+  timer_batch.start ();
+
+  unsigned nthreads = 1024;
+  unsigned nblocks = ndat / nthreads;
+  if (ndat % nthreads != 0)
+    nblocks++;
+
+  k_unpack<<<nblocks,nthreads,0,stream>>> (input, raw, 127.0f);
+
+  result = cufftExecC2C (plan_batch, input, buffer, CUFFT_FORWARD);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_batch)");
+
+  nblocks = npt / nthreads;
+  if (npt % nthreads)
+    nblocks++;
+
+  k_multiply<<<nblocks,nthreads,0,stream>>> (buffer, d_kernel, niter);
+
+  result = cufftExecC2C (plan_bwd, buffer, buffer, CUFFT_INVERSE);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_bwd)");
+
+  for (int ipart=0; ipart<niter; ipart++)
+  {
+    cudaMemcpyAsync ((void *) (output + ipart * npt), (void *) (buffer + ipart * npt), npt * sizeof(cufftComplex), cudaMemcpyDeviceToDevice, stream);
+  }
+
+  cudaStreamSynchronize(stream);
+
+  timer_batch.stop ();
+
+  total_time = timer_batch.get_elapsed();
+  time_per_fft = total_time / niter;
+  time_us = time_per_fft * 1e6;
+  cerr << "BATCH: total_time=" << total_time << " time_per_fft=" << time_per_fft 
+       << " time_us=" << time_us << endl;
+
+  cufftDestroy(plan_callback);
+  cufftDestroy(plan_batch);
+  cufftDestroy(plan_half);
+  cufftDestroy(plan_bwd);
+  cufftDestroy(plan_bwd_cb);
+
+  cudaFree(raw);
+  cudaFree(input_h2);
+  cudaFree(input);
+  cudaFree(buffer);
+  cudaFree(output);
+  cudaFree(d_kernel);
+  cudaFree(d_offsets);
+  cudaFreeHost(h_offsets);
+
+  if (cuda)
+    cudaStreamDestroy(stream);
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/CUFFTError.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/CUFFTError.C
--- bl-dspsr-0+git20160405/Signal/General/CUFFTError.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/CUFFTError.C	2018-03-12 23:02:35.000000000 +0000
@@ -50,6 +50,12 @@
     case CUFFT_NO_WORKSPACE :
       return "No workspace has been provided prior to plan execution";
 #endif
+#if CUDA_VERSION >= 6050
+    case CUFFT_NOT_IMPLEMENTED:
+      return "Not Implemented";
+    case CUFFT_LICENSE_ERROR:
+      return "License error";
+#endif
     }
   return "unrecognized cufftResult";
 }
diff -Nru bl-dspsr-0+git20160405/Signal/General/Dedispersion.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Dedispersion.C
--- bl-dspsr-0+git20160405/Signal/General/Dedispersion.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Dedispersion.C	2018-03-12 23:02:35.000000000 +0000
@@ -11,6 +11,7 @@
 
 #include "ThreadContext.h"
 #include "Error.h"
+
 #include <complex>
 
 using namespace std;
@@ -209,12 +210,33 @@
   note that each time sample depends upon the preceding impulse_pos
   points.
 */
+
+unsigned smearing_samples_threshold = 16 * 1024 * 1024;
+
 void dsp::Dedispersion::prepare ()
 {
   if (!smearing_samples_set)
   {
+    unsigned threshold = smearing_samples_threshold / nchan;
+    supported_channels = vector<bool> (nchan, true);
+    unsigned ichan = 0;
+  
+    while( (impulse_neg = smearing_samples (-1)) > threshold )
+    {
+      supported_channels[ichan] = false;
+      ichan ++;
+      if (ichan == nchan)
+	throw Error (InvalidState,
+		     "dsp::Dedispersion::prepare",
+		     "smearing samples=%u exceeds threshold=%u",
+		     impulse_neg, threshold);
+    }
+
+    if (verbose)
+      cerr << "dsp::Dedispersion::prepare "
+	   << ichan << " unsupported channels" << endl;
+    
     impulse_pos = smearing_samples (1);
-    impulse_neg = smearing_samples (-1);
   }
 
   if (psrdisp_compatible)
@@ -223,13 +245,6 @@
       "   using symmetric impulse response function" << endl;
     impulse_pos = impulse_neg;
   }
-
-#if 0
-  // test the effect of a possibly common error in the interpretation of HR75
-  impulse_pos += impulse_neg;
-  impulse_neg = 0;
-#endif
-
 }
 
 
@@ -354,6 +369,9 @@
 //! Return the effective number of smearing samples
 unsigned dsp::Dedispersion::get_effective_smearing_samples () const
 {
+  if (verbose)
+    cerr << "dsp::Dedispersion::get_effective_smearing_samples" << endl;
+
   return smearing_samples (0);
 }
 
@@ -372,6 +390,13 @@
   double ch_abs_bw = abs_bw / double(nchan);
   double lower_ch_cfreq = centre_frequency - (abs_bw - ch_abs_bw) / 2.0;
 
+  unsigned ichan=0;
+  while (ichan < supported_channels.size() && !supported_channels[ichan])
+  {
+    lower_ch_cfreq += ch_abs_bw;
+    ichan++;
+  }
+      
   // calculate the smearing (in the specified half of the band)
   if (half)
   {
@@ -414,7 +439,7 @@
 
   if (verbose)
     cerr << "dsp::Dedispersion::smearing_samples = "
-	 << int(tsmear * sampling_rate) << endl;
+	 << int64_t(tsmear * sampling_rate) << endl;
 
   // add another ten percent, just to be sure that the pollution due
   // to the cyclical convolution effect is minimized
diff -Nru bl-dspsr-0+git20160405/Signal/General/Detection.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Detection.C
--- bl-dspsr-0+git20160405/Signal/General/Detection.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Detection.C	2018-03-12 23:02:35.000000000 +0000
@@ -106,9 +106,6 @@
     return;
   }
 
-  if (input->get_ndat() == 0)
-    return;
-
   if (!inplace)
     resize_output ();    
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/DetectionCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/DetectionCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/DetectionCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/DetectionCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -156,6 +156,9 @@
          << " input.span=" << input_span 
          << " output.span=" << output_span << endl;
 
+  if (ndat == 0)
+    return;
+
   dim3 threads (128);
   dim3 blocks (ndat/threads.x, nchan);
 
@@ -170,7 +173,7 @@
                                            ndat); 
 
   if (dsp::Operation::record_time || dsp::Operation::verbose)
-    check_error ("CUDA::DetectionEngine::polarimetry");
+    check_error_stream ("CUDA::DetectionEngine::polarimetry", stream);
 }
 
 // dubiuous about the correctness here... TODO AJ
diff -Nru bl-dspsr-0+git20160405/Signal/General/digifits.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/digifits.C
--- bl-dspsr-0+git20160405/Signal/General/digifits.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/digifits.C	2018-03-12 23:02:35.000000000 +0000
@@ -87,6 +87,11 @@
   arg = menu.add (config->block_size, 'B', "MB");
   arg->set_help ("block size in megabytes");
 
+  string ram_limit;
+  arg = menu.add (ram_limit, 'U', "MB");
+  arg->set_help ("upper limit on RAM usage");
+  arg->set_long_help ("specify the floating point number of megabytes; e.g. -U 256 \n");
+
   //arg = menu.add (&config->filterbank, 
   //    &dsp::Filterbank::Config::set_freq_res, 
   //    'x', "nfft");
@@ -123,6 +128,9 @@
   arg = menu.add (config->nsblk, "nsblk", "N");
   arg->set_help ("output block size in samples (default=2048)");
 
+  arg = menu.add (config->integration_length, 'L', "seconds");
+  arg->set_help ("set maximum file length");
+
   arg = menu.add (config->dedisperse, 'K');
   arg->set_help ("remove inter-channel dispersion delays");
 
@@ -138,6 +146,12 @@
 
   menu.parse (argc, argv);
 
+  if (!ram_limit.empty())
+  {
+    double MB = fromstring<double> (ram_limit);
+    config->set_maximum_RAM (uint64_t( MB * 1024.0 * 1024.0 ));
+  }
+
   //if (revert)
   //  config->order = dsp::TimeSeries::OrderFPT;
 }
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ACFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ACFilterbank.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ACFilterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ACFilterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ACFilterbank.h,v $
-   $Revision: 1.5 $
-   $Date: 2006/07/09 13:27:11 $
-   $Author: wvanstra $ */
+// dspsr/Signal/General/dsp/ACFilterbank.h
 
 #ifndef __ACFilterbank_h
 #define __ACFilterbank_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/AutoCorrelation.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/AutoCorrelation.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/AutoCorrelation.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/AutoCorrelation.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/AutoCorrelation.h,v $
-   $Revision: 1.2 $
-   $Date: 2006/07/09 13:27:11 $
-   $Author: wvanstra $ */
+// dspsr/Signal/General/dsp/AutoCorrelation.h
 
 #ifndef __AutoCorrelation_h
 #define __AutoCorrelation_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/BandpassMonitor.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BandpassMonitor.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/BandpassMonitor.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BandpassMonitor.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/BandpassMonitor.h,v $
-   $Revision: 1.6 $
-   $Date: 2008/10/03 05:42:40 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/BandpassMonitor.h
 
 #ifndef __baseband_dsp_BandpassMonitor_h
 #define __baseband_dsp_BandpassMonitor_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/BitStatsPlotter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BitStatsPlotter.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/BitStatsPlotter.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BitStatsPlotter.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/BitStatsPlotter.h,v $
-   $Revision: 1.5 $
-   $Date: 2008/07/13 00:38:54 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/BitStatsPlotter.h
 
 #ifndef __BitStatsPlotter_h
 #define __BitStatsPlotter_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDACallbacks.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDACallbacks.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDACallbacks.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDACallbacks.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,30 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_Convolution_Callbacks_h
+#define __baseband_cuda_Convolution_Callbacks_h
+
+#include <cufft.h>
+#include <config.h>
+
+#if HAVE_CUFFT_CALLBACKS
+
+  void setup_callbacks_ConvolutionCUDA (cufftHandle plan_fwd, cufftHandle plan_bwd,
+																			  cufftHandle plan_fwd_batch, cufftHandle plan_bwd_batch,
+																			  cufftComplex * d_kernels, int nbatch, cudaStream_t stream);
+
+  void setup_callbacks_conv_params (unsigned * h_ptr, unsigned h_size, cudaStream_t stream);
+
+  void setup_callbacks_ConvolutionCUDASpectral (cufftHandle plan_fwd, cufftHandle plan_bwd,
+                                     					  cufftComplex * d_kernels, cudaStream_t stream);
+
+  void setup_callbacks_conv_params_spectral (unsigned * h_ptr, unsigned h_size, cudaStream_t stream);
+
+#endif
+
+#endif //__baseband_cuda_Convolution_Callbacks_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,110 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_Convolution_h
+#define __baseband_cuda_Convolution_h
+
+#include <cufft.h>
+#include <config.h>
+
+#include "dsp/Convolution.h"
+#include "dsp/LaunchConfig.h"
+
+namespace CUDA
+{
+  class ConvolutionEngine : public dsp::Convolution::Engine
+  {
+  public:
+
+    //! Constructor; requires a CUDA stream argument (so not a default constructor)
+    ConvolutionEngine (cudaStream_t stream);
+    ~ConvolutionEngine();
+
+    void set_scratch (void * scratch);  // presumably supplies the device scratch memory (d_scratch) -- TODO confirm
+
+    //! prepare the required attributes for the engine
+    void prepare (dsp::Convolution * convolution);
+
+    //! setup the dedispersion kernel from the response
+    void setup_kernel (const dsp::Response * response);
+
+    //! configure the singular FFTs
+    void setup_singular ();
+
+    //! configure the batched FFTs
+    void setup_batched (unsigned nbatch);
+
+#if HAVE_CUFFT_CALLBACKS
+    //! setup FFT callbacks (the declaration below is commented out, so nothing is declared here)
+    //void setup_callbacks ();
+#endif
+
+    void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                  unsigned npart);  // presumably dispatches to perform_complex / perform_real -- TODO confirm
+
+  protected:
+
+    void perform_complex (const dsp::TimeSeries* input, dsp::TimeSeries * output,
+                         unsigned npart);
+
+    void perform_real (const dsp::TimeSeries* input, dsp::TimeSeries * output,
+                       unsigned npart);
+
+    cudaStream_t stream;  // CUDA stream; presumably the one passed to the constructor -- TODO confirm
+
+    LaunchConfig1D mp;
+
+    cufftType type_fwd;
+
+    cufftHandle plan_fwd;
+
+    cufftHandle plan_bwd;
+
+    cufftHandle plan_fwd_batched;
+
+    cufftHandle plan_bwd_batched;
+
+		size_t kernel_size;
+
+    // dedispersion kernel for all input channels in device memory
+    cufftComplex * d_kernels;
+
+    // device scratch memory
+    cufftComplex * d_scratch;
+
+    cufftComplex * buf;
+
+    void * work_area;
+
+    size_t work_area_size;
+
+    int auto_allocate;
+
+    int npt_fwd;
+
+    int npt_bwd;
+
+    int nbatch;
+
+    unsigned nsamp_overlap;
+
+    unsigned nsamp_step;
+    
+    unsigned nfilt_pos;
+
+    unsigned nfilt_neg;
+
+#if HAVE_CUFFT_CALLBACKS
+		unsigned h_conv_params[5];  // member exists only when built with HAVE_CUFFT_CALLBACKS
+#endif
+
+  };
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDASpectral.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDASpectral.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDASpectral.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDASpectral.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,115 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_ConvolutionSpectral_h
+#define __baseband_cuda_ConvolutionSpectral_h
+
+#include <cufft.h>
+#include <config.h>
+
+#include "dsp/Convolution.h"
+#include "dsp/LaunchConfig.h"
+
+namespace CUDA
+{
+  class ConvolutionEngineSpectral : public dsp::Convolution::Engine
+  {
+  public:
+
+    //! Constructor; takes the CUDA stream for this engine
+    ConvolutionEngineSpectral (cudaStream_t stream);
+    ~ConvolutionEngineSpectral();
+
+    void regenerate_plans();
+
+    void set_scratch (void * scratch);
+
+    //! prepare the required attributes for the engine
+    void prepare (dsp::Convolution * convolution);
+
+    //! setup the dedispersion kernel from the response
+    void setup_kernel (const dsp::Response * response);
+
+    //! configure batched FFT
+    void setup_batched (const dsp::TimeSeries* input, dsp::TimeSeries * output);
+
+#if HAVE_CUFFT_CALLBACKS
+    //! setup FFT callbacks
+    void setup_callbacks ();
+#endif
+
+    void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                  unsigned npart);
+
+  protected:
+
+    void perform_complex (const dsp::TimeSeries* input, dsp::TimeSeries * output,
+                         unsigned npart);
+
+    void perform_real (const dsp::TimeSeries* input, dsp::TimeSeries * output,
+                       unsigned npart);
+
+    cudaStream_t stream;
+
+    LaunchConfig1D mp;
+
+    cufftType type_fwd;
+
+    cufftHandle plan_fwd;
+
+    cufftHandle plan_bwd;
+
+		size_t kernel_size;
+
+		// dedispersion kernel for all input channels in device memory
+    cufftComplex * d_kernels;
+
+    // device scratch memory
+    cufftComplex * d_scratch;
+
+    cufftComplex * buf;
+
+    void * work_area;
+
+    size_t work_area_size;
+
+    int auto_allocate;
+
+    int nchan;
+
+    int npol;
+
+    bool fft_configured;
+
+    uint64_t input_stride;
+
+    uint64_t output_stride;
+
+    int npt_fwd;
+
+    int npt_bwd;
+
+    int nbatch;
+
+    unsigned nsamp_overlap;
+
+    unsigned nsamp_step;
+    
+    unsigned nfilt_pos;
+
+    unsigned nfilt_neg;
+
+#if HAVE_CUFFT_CALLBACKS
+		unsigned h_conv_params[2];
+#endif
+
+  };
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Convolution.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Convolution.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Convolution.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Convolution.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Convolution.h,v $
-   $Revision: 1.26 $
-   $Date: 2011/08/04 21:06:30 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Convolution.h
 
 #ifndef __Convolution_h
 #define __Convolution_h
@@ -58,8 +55,7 @@
   public:
 
     //! Null constructor
-    Convolution (const char* name = "Convolution", 
-		 Behaviour type = anyplace);
+    Convolution (const char* name = "Convolution", Behaviour type = outofplace);
 
     //! Destructor
     virtual ~Convolution ();
@@ -103,6 +99,14 @@
     //! Return a pointer to the integrated passband
     virtual const Response* get_passband() const;
 
+    //! Set the memory allocator to be used
+    void set_device (Memory *);
+
+    //! Engine used to perform discrete convolution step
+    class Engine;
+
+    void set_engine (Engine*);
+
   protected:
 
     //! Perform the convolution transformation on the input TimeSeries
@@ -126,6 +130,8 @@
     friend class TFPFilterbank;
     friend class SKFilterbank;
 
+    Reference::To<Memory> memory;
+
     unsigned nfilt_tot;
     unsigned nfilt_pos;
     unsigned nfilt_neg;
@@ -144,7 +150,22 @@
     unsigned scratch_needed;
     uint64_t npart;
     unsigned n_fft;
+
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
   };
+
+  class Convolution::Engine : public Reference::Able
+  {
+    public:
+
+      virtual void set_scratch (void *) = 0;
+
+      virtual void prepare (dsp::Convolution * convolution) = 0;
+
+      virtual void perform (const TimeSeries* in, TimeSeries* out, unsigned npart) = 0;
+  };
+
   
 }
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Dedispersion.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Dedispersion.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Dedispersion.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Dedispersion.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Dedispersion.h,v $
-   $Revision: 1.29 $
-   $Date: 2010/04/11 05:21:43 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Dedispersion.h
 
 #ifndef __Dedispersion_h
 #define __Dedispersion_h
@@ -183,6 +180,10 @@
     //! Flag that the response and bandpass attributes reflect the state
     bool built;
 
+    //! Supported frequency channels
+    /*! Set to false when the dispersive smearing is too large */
+    std::vector<bool> supported_channels;
+    
     //! Return the effective smearing time in seconds (worker function)
     double smearing_time (int half) const;
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/DedispersionSampleDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DedispersionSampleDelay.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/DedispersionSampleDelay.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DedispersionSampleDelay.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/DedispersionSampleDelay.h,v $
-   $Revision: 1.2 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/DedispersionSampleDelay.h
 
 #ifndef __Dedispersion_SampleDelay_h
 #define __Dedispersion_SampleDelay_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/DetectionCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DetectionCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/DetectionCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DetectionCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/DetectionCUDA.h,v $
-   $Revision: 1.3 $
-   $Date: 2010/06/01 10:46:29 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/DetectionCUDA.h
 
 #ifndef __baseband_cuda_Detection_h
 #define __baseband_cuda_Detection_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Detection.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Detection.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Detection.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Detection.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Detection.h,v $
-   $Revision: 1.20 $
-   $Date: 2010/06/01 09:12:18 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Detection.h
 
 
 #ifndef __Detection_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Example.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Example.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Example.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Example.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Example.h,v $
-   $Revision: 1.3 $
-   $Date: 2006/07/09 13:27:12 $
-   $Author: wvanstra $ */
+// dspsr/Signal/General/dsp/Example.h
 
 #ifndef __Example_h
 #define __Example_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ExcisionStatsPlotter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ExcisionStatsPlotter.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ExcisionStatsPlotter.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ExcisionStatsPlotter.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ExcisionStatsPlotter.h,v $
-   $Revision: 1.1 $
-   $Date: 2008/07/16 07:00:16 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/ExcisionStatsPlotter.h
 
 #ifndef __ExcisionStatsPlotter_h
 #define __ExcisionStatsPlotter_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankConfig.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankConfig.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankConfig.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FilterbankConfig.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/07/15 04:18:11 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/FilterbankConfig.h
 
 #ifndef __FilterbankConfig_h
 #define __FilterbankConfig_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_cuda.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_cuda.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_cuda.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_cuda.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,11 +6,8 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/filterbank_cuda.h,v $
-   $Revision: 1.1 $
+// dspsr/Signal/General/dsp/filterbank_cuda.h
 
-   $Date: 2011/10/07 11:10:14 $
-   $Author: straten $ */
 
 #ifndef __filterbank_cuda_h
 #define __filterbank_cuda_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -7,10 +7,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FilterbankCUDA.h,v $
-   $Revision: 1.17 $
-   $Date: 2011/10/07 11:10:14 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/FilterbankCUDA.h
 
 #ifndef __FilterbankCUDA_h
 #define __FilterbankCUDA_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_engine.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_engine.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_engine.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_engine.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/filterbank_engine.h,v $
-   $Revision: 1.1 $
-   $Date: 2011/10/07 11:01:50 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/filterbank_engine.h
 
 #ifndef __filterbank_engine_h
 #define __filterbank_engine_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Filterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Filterbank.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Filterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Filterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Filterbank.h,v $
-   $Revision: 1.26 $
-   $Date: 2011/10/07 11:01:50 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Filterbank.h
 
 #ifndef __Filterbank_h
 #define __Filterbank_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FourthMoment.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FourthMoment.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/FourthMoment.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FourthMoment.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FourthMoment.h,v $
-   $Revision: 1.2 $
-   $Date: 2009/06/08 19:45:01 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/FourthMoment.h
 
 
 #ifndef __FourthMoment_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/GeometricDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/GeometricDelay.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/GeometricDelay.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/GeometricDelay.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/GeometricDelay.h,v $
-   $Revision: 1.2 $
-   $Date: 2010/06/27 10:56:21 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/GeometricDelay.h
 
 #ifndef __Geometric_SampleDelay_h
 #define __Geometric_SampleDelay_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LaunchConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LaunchConfig.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/LaunchConfig.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LaunchConfig.h	2018-03-12 23:02:35.000000000 +0000
@@ -7,6 +7,9 @@
  *
  ***************************************************************************/
 
+#ifndef __LaunchConfig_h
+#define __LaunchConfig_h
+
 #include <cuda_runtime.h>
 
 namespace CUDA
@@ -24,6 +27,10 @@
 
     //! gets the current device ID and calls cudaGetDeviceProperties
     void init ();
+
+    size_t get_max_threads_per_block ();
+
+    size_t get_max_shm ();
   };
 
 
@@ -54,3 +61,5 @@
     unsigned get_nthread() { return nthread; }
   };
 }
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LevelMonitor.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LevelMonitor.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/LevelMonitor.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LevelMonitor.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/LevelMonitor.h,v $
-   $Revision: 1.9 $
-   $Date: 2010/03/22 06:06:58 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/LevelMonitor.h
 
 #ifndef __LevelMonitor_h
 #define __LevelMonitor_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFil.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFil.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFil.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFil.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/LoadToFil.h,v $
-   $Revision: 1.4 $
-   $Date: 2011/12/21 06:02:20 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/LoadToFil.h
 
 #ifndef __dspsr_LoadToFil_h
 #define __dspsr_LoadToFil_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFITS.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFITS.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFITS.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFITS.h	2018-03-12 23:02:35.000000000 +0000
@@ -70,6 +70,10 @@
     // Sets default values
     Config ();
 
+    // set block_size to result in at most this much RAM usage
+    void set_maximum_RAM (uint64_t);
+    uint64_t get_maximum_RAM () const { return maximum_RAM; }
+
     // input data block size in MB
     double block_size;
 
@@ -105,6 +109,9 @@
 
     //! hold offset and scale constant after first update
     bool rescale_constant;
+
+    //! set maximum length for a file
+    double integration_length;
     
     //! number of bits used to re-digitize the floating point time series
     int nbits;
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/MultiThread.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/MultiThread.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/MultiThread.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/MultiThread.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/MultiThread.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/24 22:16:04 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/MultiThread.h
 
 #ifndef __dspsr_MultiThread_h
 #define __dspsr_MultiThread_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/on_host.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/on_host.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/on_host.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/on_host.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/on_host.h,v $
-   $Revision: 1.4 $
-   $Date: 2010/04/24 14:13:38 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/on_host.h
 
 #ifndef __on_host_h
 #define __on_host_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFFT.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFFT.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFFT.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFFT.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/OptimalFFT.h,v $
-   $Revision: 1.3 $
-   $Date: 2010/05/18 15:39:58 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/OptimalFFT.h
 
 #ifndef __OptimalFFT_h
 #define __OptimalFFT_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFilterbank.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFilterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFilterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/OptimalFilterbank.h,v $
-   $Revision: 1.1 $
-   $Date: 2010/05/18 15:23:17 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/OptimalFilterbank.h
 
 #ifndef __OptimalFilterbank_h
 #define __OptimalFilterbank_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Pipeline.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Pipeline.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Pipeline.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Pipeline.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Pipeline.h,v $
-   $Revision: 1.1 $
-   $Date: 2011/08/23 20:55:19 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Pipeline.h
 
 #ifndef __dspsr_Pipeline_h
 #define __dspsr_Pipeline_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelectCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelectCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelectCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelectCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,47 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_PolnSelect_h
+#define __baseband_cuda_PolnSelect_h
+
+#include "dsp/PolnSelect.h"
+#include "dsp/LaunchConfig.h"
+
+namespace CUDA
+{
+  class PolnSelectEngine : public dsp::PolnSelect::Engine
+  {
+  public:
+
+    //! Constructor; takes the CUDA stream for this engine
+    PolnSelectEngine (cudaStream_t stream);
+
+    ~PolnSelectEngine ();
+
+    void setup ();
+
+    void fpt_polnselect (int ipol, 
+                         const dsp::TimeSeries* in,
+                         dsp::TimeSeries* out);
+
+    void tfp_polnselect (int ipol,
+                         const dsp::TimeSeries* in,
+                         dsp::TimeSeries* out);
+
+  protected:
+
+    cudaStream_t stream;
+
+    //! gpu configuration
+    LaunchConfig gpu_config; 
+
+  };
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelect.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelect.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelect.h	2018-03-12 23:02:35.000000000 +0000
@@ -34,11 +34,32 @@
     // Get the currently selected poln index
     int get_ipol () const { return ipol_keep; }
 
+    class Engine;
+
+    void set_engine (Engine*);
+
   protected:
 
     //! The polarization to keep
     int ipol_keep;
   };
+
+  class PolnSelect::Engine : public OwnStream
+  {
+  public:
+
+    virtual void setup () = 0;
+
+    virtual void fpt_polnselect (int ipol,
+                                 const dsp::TimeSeries * in,
+                                 dsp::TimeSeries * out) = 0;
+
+    virtual void tfp_polnselect (int ipol,
+                                 const dsp::TimeSeries* in,
+                                 dsp::TimeSeries* out) = 0;
+
+   };
+
 }
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PScrunchCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunchCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/PScrunchCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunchCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,45 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_PScrunch_h
+#define __baseband_cuda_PScrunch_h
+
+#include "dsp/PScrunch.h"
+#include "dsp/LaunchConfig.h"
+
+namespace CUDA
+{
+  class PScrunchEngine : public dsp::PScrunch::Engine
+  {
+  public:
+
+    //! Constructor; takes the CUDA stream for this engine
+    PScrunchEngine (cudaStream_t stream);
+
+    ~PScrunchEngine ();
+
+    void setup ();
+
+    void fpt_pscrunch (const dsp::TimeSeries* in,
+                       dsp::TimeSeries* out);
+
+    void tfp_pscrunch (const dsp::TimeSeries* in,
+                       dsp::TimeSeries* out);
+
+  protected:
+
+    cudaStream_t stream;
+
+    //! gpu configuration
+    LaunchConfig gpu_config; 
+
+  };
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PScrunch.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunch.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/PScrunch.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunch.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/PScrunch.h,v $
-   $Revision: 1.1 $
-   $Date: 2008/07/01 12:23:21 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/PScrunch.h
 
 #ifndef __baseband_dsp_PScrunch_h
 #define __baseband_dsp_PScrunch_h
@@ -32,7 +29,31 @@
 
     //! PScrunch to zero mean and unit variance
     void transformation ();
+
+   class Engine;
+
+   void set_engine (Engine*);
+
+  protected:
+
+    Reference::To<Engine> engine;
+
   };
+
+  class PScrunch::Engine : public OwnStream
+  {
+  public:
+
+    virtual void setup () = 0;
+
+    virtual void fpt_pscrunch (const dsp::TimeSeries * in,
+                               dsp::TimeSeries * out) = 0;
+
+    virtual void tfp_pscrunch (const dsp::TimeSeries* in,
+                               dsp::TimeSeries* out) = 0;
+
+   };
+
 }
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Rescale.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Rescale.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Rescale.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Rescale.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Rescale.h,v $
-   $Revision: 1.9 $
-   $Date: 2010/02/02 11:18:41 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Rescale.h
 
 #ifndef __baseband_dsp_Rescale_h
 #define __baseband_dsp_Rescale_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Response.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Response.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Response.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Response.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Response.h,v $
-   $Revision: 1.33 $
-   $Date: 2011/01/06 05:16:55 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Response.h
 
 #ifndef __Response_h
 #define __Response_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ResponseProduct.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ResponseProduct.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/ResponseProduct.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ResponseProduct.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ResponseProduct.h,v $
-   $Revision: 1.6 $
-   $Date: 2009/06/12 06:18:56 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/ResponseProduct.h
 
 #ifndef __ResponseProduct_h
 #define __ResponseProduct_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/RFIFilter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/RFIFilter.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/RFIFilter.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/RFIFilter.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/RFIFilter.h,v $
-   $Revision: 1.4 $
-   $Date: 2009/06/17 10:16:54 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/RFIFilter.h
 
 #ifndef __RFIFilter_h
 #define __RFIFilter_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelayFunction.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelayFunction.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelayFunction.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelayFunction.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SampleDelayFunction.h,v $
-   $Revision: 1.6 $
-   $Date: 2010/06/27 10:56:25 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/SampleDelayFunction.h
 
 #ifndef __baseband_dsp_SampleDelayFunction_h
 #define __baseband_dsp_SampleDelayFunction_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelay.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelay.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelay.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SampleDelay.h,v $
-   $Revision: 1.5 $
-   $Date: 2010/05/21 07:29:37 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/SampleDelay.h
 
 #ifndef __baseband_dsp_SampleDelay_h
 #define __baseband_dsp_SampleDelay_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Shape.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Shape.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/Shape.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Shape.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Shape.h,v $
-   $Revision: 1.12 $
-   $Date: 2009/06/07 01:22:34 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/Shape.h
 
 #ifndef __Shape_h
 #define __Shape_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SingleThread.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SingleThread.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SingleThread.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SingleThread.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SingleThread.h,v $
-   $Revision: 1.7 $
-   $Date: 2012/01/19 21:46:17 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/SingleThread.h
 
 #ifndef __dspsr_SingleThread_h
 #define __dspsr_SingleThread_h
@@ -151,6 +148,7 @@
 
     Reference::To<Memory> device_memory;
     void* gpu_stream;
+    int gpu_device;
 
   };
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKComputerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputerCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKComputerCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,50 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_SKComputer_h
+#define __baseband_cuda_SKComputer_h
+
+#include "dsp/SKComputer.h"
+#include "dsp/MemoryCUDA.h"
+
+namespace CUDA
+{
+  class Memory;
+
+  //! Engine in the CUDA namespace implementing the dsp::SKComputer::Engine interface
+  class SKComputerEngine : public dsp::SKComputer::Engine
+  {
+  public:
+    //! Construct from a memory manager (takes an argument, so not a default constructor)
+    SKComputerEngine (dsp::Memory * memory);
+
+    // The three methods below match the pure virtuals of dsp::SKComputer::Engine
+    void setup ();
+
+    void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                  dsp::TimeSeries *output_tscr, unsigned tscrunch);
+
+    void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                   unsigned tscrunch);
+
+  protected:
+    DeviceMemory * device_memory;
+
+    cudaStream_t stream;
+
+    // device work buffer
+    float * work_buffer;
+
+    // NOTE(review): presumably the allocated size of work_buffer -- confirm units in the implementation
+    size_t work_buffer_size;
+
+    int max_threads_per_block;
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKComputer.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputer.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKComputer.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputer.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,56 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/TimeSeries.h"
+#include "dsp/Transformation.h"
+
+#ifndef __SKComputer_h
+#define __SKComputer_h
+
+namespace dsp {
+
+  //! Transformation from TimeSeries to TimeSeries with an optional processing Engine
+  class SKComputer: public Transformation<TimeSeries,TimeSeries> {
+  public:
+    //! Null constructor
+    SKComputer ();
+
+    //! Destructor
+    ~SKComputer();
+
+    //! Abstract engine interface (setup, compute, insertsk)
+    class Engine;
+
+    //! Set the alternate processing engine
+    void set_engine (Engine*);
+
+  protected:
+    //! Perform the transformation on the input time series
+    void transformation ();
+
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
+
+  };
+
+  //! Abstract interface implemented by alternate SKComputer engines (e.g. CUDA::SKComputerEngine)
+  class SKComputer::Engine : public Reference::Able
+  {
+  public:
+      Engine () {}
+      virtual void setup () = 0;
+
+      virtual void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                            dsp::TimeSeries *output_tscr, unsigned tscrunch) = 0;
+
+      virtual void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                             unsigned tscrunch) = 0;
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKDetectorCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetectorCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKDetectorCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetectorCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,76 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_SKDetector_h
+#define __baseband_cuda_SKDetector_h
+
+#include "dsp/SKDetector.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "dsp/TransferCUDA.h"
+#include "dsp/TransferBitSeriesCUDA.h"
+
+namespace CUDA
+{
+  //! Engine in the CUDA namespace implementing the dsp::SKDetector::Engine interface
+  class SKDetectorEngine : public dsp::SKDetector::Engine
+  {
+  public:
+    //! Construct from a memory manager (takes an argument, so not a default constructor)
+    SKDetectorEngine (dsp::Memory * memory);
+
+    void setup ();
+
+    void reset_mask (dsp::BitSeries* output);
+
+    void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output,
+                    float upper_thresh, float lower_thresh);
+
+    // NOTE: thresholds are ordered (lower, upper) here but (upper, lower) in detect_ft/detect_tscr
+    void detect_fscr(const dsp::TimeSeries* input, dsp::BitSeries* output, 
+                     const float lower, const float upper,
+                     unsigned schan, unsigned echan);
+
+    void detect_tscr (const dsp::TimeSeries* input, const dsp::TimeSeries* input_tscr, dsp::BitSeries* output, 
+                      float upper_thresh, float lower_thresh);
+
+    int count_mask (const dsp::BitSeries* output);
+
+    float * get_estimates (const dsp::TimeSeries* input);
+
+    unsigned char * get_zapmask (const dsp::BitSeries* input);
+
+  protected:
+
+    DeviceMemory * device_memory;
+
+    cudaStream_t stream;
+
+    unsigned nchan;
+
+    unsigned npol;
+
+    //! DDFB span, i.e. n floats between channels from raw base ptr
+    unsigned span;
+
+    int max_threads_per_block;
+
+    PinnedMemory * pinned_memory;
+
+    dsp::TimeSeries * estimates_host;
+
+    dsp::BitSeries * zapmask_host;
+
+    dsp::TransferCUDA* transfer_estimates;
+
+    dsp::TransferBitSeriesCUDA* transfer_zapmask;
+
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKDetector.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetector.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKDetector.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetector.h	2018-03-12 23:02:35.000000000 +0000
@@ -67,6 +67,11 @@
     //! The arrays will be reset when count_zapped is next called
     void reset_count () { unfiltered_hits = 0; }
 
+    //! Engine used to perform detection
+    class Engine;
+
+    void set_engine (Engine*);
+
   protected:
 
     //! Reserve the required amount of output space required
@@ -75,6 +80,9 @@
     //! Perform the transformation on the input time series
     void transformation ();
 
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
+
     void reset_mask ();
 
     void detect_tscr ();
@@ -99,7 +107,7 @@
 
     //! Tsrunched SK statistic timeseries for the current block
     Reference::To<TimeSeries> input_tscr;
-  
+
     //! Number of time samples integrated into tscr SK estimates
     unsigned tscr_M;
 
@@ -149,6 +157,35 @@
 
   };
   
+  //! Abstract interface implemented by alternate SKDetector engines (e.g. CUDA::SKDetectorEngine)
+  class SKDetector::Engine : public Reference::Able
+  {
+  public:
+    virtual void setup () = 0;
+
+    virtual void reset_mask (dsp::BitSeries* output) = 0;
+
+    virtual void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output,
+                            float upper_thresh, float lower_thresh) = 0;
+
+    // NOTE: thresholds are ordered (lower, upper) here but (upper, lower) in detect_ft/detect_tscr
+    virtual void detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output,
+                              const float lower, const float upper,
+                              unsigned s_chan, unsigned e_chan) = 0;
+
+    virtual void detect_tscr (const dsp::TimeSeries* input,
+                              const dsp::TimeSeries * input_tscr,
+                              dsp::BitSeries* output,
+                              float upper_thresh, float lower_thresh) = 0;
+
+    virtual int count_mask (const dsp::BitSeries* output) = 0;
+
+    virtual float * get_estimates (const TimeSeries* input) = 0;
+
+    virtual unsigned char * get_zapmask (const BitSeries* input) = 0;
+  };
+
+
 }
 
 #endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbankCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbankCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbankCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbankCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,75 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_SKFilterbank_h
+#define __baseband_cuda_SKFilterbank_h
+
+#include <cufft.h>
+#include <config.h>
+
+#include "dsp/SKFilterbank.h"
+#include "dsp/MemoryCUDA.h"
+#include "dsp/LaunchConfig.h"
+
+namespace CUDA
+{
+  //! Engine in the CUDA namespace implementing the dsp::SKFilterbank::Engine interface
+  class SKFilterbankEngine : public dsp::SKFilterbank::Engine
+  {
+  public:
+    //! Construct from a memory manager and a tscrunch value (not a default constructor)
+    SKFilterbankEngine (dsp::Memory * _memory, unsigned _tscrunch);
+
+    ~SKFilterbankEngine();
+
+    void setup ();
+
+    void prepare (const dsp::TimeSeries* input, unsigned _nfft);
+
+    void perform (const dsp::TimeSeries* input,
+                  dsp::TimeSeries* output,
+                  dsp::TimeSeries* output_tscr);
+
+  protected:
+
+    DeviceMemory * memory;
+
+    void fft_real (cufftReal *in, cufftComplex * out);
+
+    void fft_complex (cufftComplex *in, cufftComplex * out);
+
+    cudaStream_t stream;
+
+    cufftType type;
+
+    cufftHandle plan;
+
+    void * buffer;
+
+    size_t buffer_size;
+
+    void * sums;
+
+    size_t sums_size;
+
+    int nchan;
+
+    int npol;
+
+    int npt;
+
+    int nbatch;
+
+    int tscrunch;
+ 
+    int max_threads_per_block;
+  };
+}
+
+#endif
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbank.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SKFilterbank.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:06:12 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/SKFilterbank.h
 
 #ifndef __SKFilterbank_h
 #define __SKFilterbank_h
@@ -27,7 +24,7 @@
   public:
 
     //! Null constructor
-    SKFilterbank ( unsigned _n_threads );
+    SKFilterbank ( unsigned _n_threads=1 );
     ~SKFilterbank ();
 
     //! Engine used to perform discrete convolution step
@@ -107,7 +104,14 @@
   class SKFilterbank::Engine : public Reference::Able
   {
   public:
-      Engine () {}
+
+      virtual void setup () = 0;
+
+      virtual void prepare (const dsp::TimeSeries* input, unsigned _nfft) = 0;
+
+      virtual void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                            dsp::TimeSeries *output_tscr) = 0;
+
   }; 
 }
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKMaskerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMaskerCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKMaskerCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMaskerCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -10,6 +10,7 @@
 #define __baseband_cuda_SKMasker_h
 
 #include "dsp/SKMasker.h"
+#include "dsp/MemoryCUDA.h"
 
 namespace CUDA
 {
@@ -18,22 +19,18 @@
   public:
 
     //! Default Constructor
-    SKMaskerEngine (cudaStream_t stream);
+    SKMaskerEngine (dsp::Memory * memory);
 
-    void setup (unsigned nchan, unsigned npol, unsigned span);
+    void setup ();
 
-    void perform (dsp::BitSeries* mask, unsigned mask_offset, dsp::TimeSeries* out, 
-                  unsigned offset, unsigned end);
+    void perform (dsp::BitSeries* mask, const dsp::TimeSeries* input,
+                  dsp::TimeSeries* out, unsigned M);
 
   protected:
-    cudaStream_t stream;
-
-    unsigned nchan;
 
-    unsigned npol;
+    DeviceMemory * device_memory;
 
-    //! DDFB span, i.e. n floats between channels from raw base ptr
-    unsigned span;
+    cudaStream_t stream;
 
     int max_threads_per_block;
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKMasker.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMasker.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SKMasker.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMasker.h	2018-03-12 23:02:35.000000000 +0000
@@ -74,9 +74,9 @@
   class SKMasker::Engine : public Reference::Able
   { 
   public:
-    virtual void setup (unsigned nchan, unsigned npol, unsigned span) = 0;
+    virtual void setup () = 0;
 
-    virtual void perform (BitSeries* mask, unsigned mask_offset, TimeSeries* out, unsigned offset, unsigned end) = 0; 
+    virtual void perform (BitSeries* mask, const TimeSeries* in, TimeSeries* out, unsigned M) = 0; 
 
   }; 
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosisCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosisCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosisCUDA.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosisCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,82 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __baseband_cuda_SpectralKurtosis_h
+#define __baseband_cuda_SpectralKurtosis_h
+
+#include "dsp/SpectralKurtosis.h"
+
+#include "dsp/MemoryCUDA.h"
+#include "dsp/SKComputerCUDA.h"
+#include "dsp/SKDetectorCUDA.h"
+#include "dsp/SKMaskerCUDA.h"
+
+#include "dsp/TransferCUDA.h"
+#include "dsp/TransferBitSeriesCUDA.h"
+
+namespace CUDA
+{
+
+  //! Engine in the CUDA namespace implementing the dsp::SpectralKurtosis::Engine interface
+  class SpectralKurtosisEngine : public dsp::SpectralKurtosis::Engine
+  {
+  public:
+    //! Construct from a memory manager (takes an argument, so not a default constructor)
+    SpectralKurtosisEngine (dsp::Memory * memory);
+
+    void setup ();
+
+    void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output,
+                  dsp::TimeSeries *output_tscr, unsigned tscrunch);
+
+    void reset_mask (dsp::BitSeries* output);
+
+    void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output,
+                    float upper_thresh, float lower_thresh);
+
+    // NOTE: thresholds are ordered (lower, upper) here but (upper, lower) in detect_ft/detect_tscr
+    void detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output,
+                      const float lower, const float upper,
+                      unsigned schan, unsigned echan);
+
+    void detect_tscr (const dsp::TimeSeries* input,
+                      const dsp::TimeSeries * input_tscr,
+                      dsp::BitSeries* output,
+                      float upper, float lower);
+
+    int count_mask (const dsp::BitSeries* output);
+
+    float * get_estimates (const dsp::TimeSeries * estimates);
+
+    unsigned char * get_zapmask (const dsp::BitSeries * zapmask);
+
+    void mask (dsp::BitSeries* mask, const dsp::TimeSeries *in, dsp::TimeSeries* out, unsigned M);
+
+    void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* out, unsigned M);
+
+  protected:
+
+    DeviceMemory * device_memory;
+
+    cudaStream_t stream;
+
+    SKComputerEngine * computer;
+
+    SKDetectorEngine * detector;
+
+    SKMaskerEngine * masker;
+
+    float * work_buffer;
+
+    size_t work_buffer_size;
+
+    int max_threads_per_block;
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosis.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosis.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosis.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosis.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,215 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/Transformation.h"
+#include "dsp/TimeSeries.h"
+#include "dsp/BitSeries.h"
+#include "dsp/Memory.h"
+
+#ifndef __SpectralKurtosis_h
+#define __SpectralKurtosis_h
+
+#define ZAP_ALL  0
+#define ZAP_SKFB 1
+#define ZAP_FSCR 2
+#define ZAP_TSCR 3
+
+namespace dsp {
+
+  //! Perform Spectral Kurtosis on Input Timeseries, creating output Time Series
+  /*! Output will be in time, frequency, polarization order */
+
+  class SpectralKurtosis: public Transformation<TimeSeries,TimeSeries> {
+
+  public:
+
+    //! Default constructor
+    SpectralKurtosis ();
+
+    //! Destructor
+    ~SpectralKurtosis ();
+
+    bool get_order_supported (TimeSeries::Order order) const;
+
+    //! Set the number of time samples used to calculate the SK statistic
+    void set_M (unsigned _M) { M = _M; }
+
+    //! Set the RFI thresholds with the specified factor
+    void set_thresholds (unsigned _M, unsigned _std_devs);
+
+    //! Set the channel range to conduct detection
+    void set_channel_range (unsigned start, unsigned end);
+
+    //! Set various options for Spectral Kurtosis
+    void set_options (bool _disable_fscr, bool _disable_tscr, bool _disable_ft);
+
+    void reserve ();
+
+    void prepare ();
+
+    void prepare_output ();
+
+    //! The number of time samples used to calculate the SK statistic
+    unsigned get_M () const { return M; }
+
+    //! The excision threshold in number of standard deviations
+    unsigned get_excision_threshold () const { return std_devs; }
+
+    //! Total SK statistic for each poln/channel, post filtering
+    void get_filtered_sum (std::vector<float>& sum) const
+    {  sum = filtered_sum; }
+
+    //! Hits on filtered average for each channel
+    void get_filtered_hits (std::vector<uint64_t>& hits) const
+    { hits = filtered_hits; }
+
+    //! Total SK statistic for each poln/channel, before filtering
+    void get_unfiltered_sum (std::vector<float>& sum) const
+    { sum = unfiltered_sum; }
+
+    //! Hits on unfiltered SK statistic, same for each channel
+    uint64_t get_unfiltered_hits () const { return unfiltered_hits; }
+
+    //! The arrays will be reset when count_zapped is next called
+    void reset_count () { unfiltered_hits = 0; }
+
+    //! Abstract engine interface for alternate implementations of the SK steps
+    class Engine;
+
+    void set_engine (Engine*);
+
+  protected:
+
+    //! Perform the transformation on the input time series
+    void transformation ();
+
+    //! Interface to alternate processing engine (e.g. GPU)
+    Reference::To<Engine> engine;
+
+  private:
+
+    void compute ();
+
+    void detect ();
+    void detect_tscr ();
+    void detect_skfb ();
+    void detect_fscr ();
+    void count_zapped ();
+
+    void mask ();
+    void reset_mask ();
+
+    void insertsk ();
+
+    unsigned debugd;
+
+    //! number of samples used in each SK estimate
+    unsigned M;
+
+    unsigned nchan;
+
+    unsigned npol;
+
+    unsigned ndim;
+
+    uint64_t npart;
+
+    uint64_t output_ndat;
+
+    //! SK Estimates 
+    Reference::To<TimeSeries> estimates;
+
+    //! Tscrunched SK Estimate for block
+    Reference::To<TimeSeries> estimates_tscr;
+
+    //! Zap mask
+    Reference::To<BitSeries> zapmask;
+
+    //! accumulation arrays for S1 and S2 in t scrunch
+    std::vector <float> S1_tscr;
+    std::vector <float> S2_tscr;
+
+    //! Total SK statistic for each poln/channel, post filtering
+    std::vector<float> filtered_sum;
+
+    //! Hits on filtered average for each channel
+    std::vector<uint64_t> filtered_hits;
+
+    //! Total SK statistic for each poln/channel, before filtering
+    std::vector<float> unfiltered_sum;
+
+    //! Hits on unfiltered SK statistic, same for each channel
+    uint64_t unfiltered_hits;
+
+    //! number of std devs used to calculate excision limits
+    unsigned std_devs;
+
+    //! lower and upper thresholds of excision limits
+    std::vector<float> thresholds;
+
+    float one_sigma;
+
+    //! Number of samples integrated into tscr
+    unsigned M_tscr;
+
+    //! excision thresholds for tscr
+    std::vector<float> thresholds_tscr;
+
+    //! channel range to compute and apply SK excisions
+    std::vector<unsigned> channels;
+
+    //! samples zapped by type [0:all, 1:sk, 2:fscr, 3:tscr] (cf. the ZAP_* macros)
+    std::vector<uint64_t> zap_counts;
+
+    //! total number of samples processed
+    uint64_t npart_total;
+
+    //! flags for detection types [0:fscr, 1:tscr, 2:ft] -- NOTE(review): was "2:tscr"; order inferred from set_options, confirm
+    std::vector<bool> detection_flags;
+
+    bool prepared;
+
+  };
+
+  //! Abstract interface implemented by alternate SpectralKurtosis engines (e.g. CUDA::SpectralKurtosisEngine)
+  class SpectralKurtosis::Engine : public Reference::Able
+  {
+  public:
+      virtual void setup () = 0;
+
+      virtual void compute (const TimeSeries* input, TimeSeries* output,
+                            TimeSeries *output_tscr, unsigned tscrunch) = 0;
+
+      virtual void reset_mask (BitSeries* output) = 0;
+
+      virtual void detect_ft (const TimeSeries* input, BitSeries* output,
+                              float upper_thresh, float lower_thresh) = 0;
+
+      // NOTE: thresholds are ordered (lower, upper) here but (upper, lower) in detect_ft/detect_tscr
+      virtual void detect_fscr (const TimeSeries* input, BitSeries* output,
+                                const float lower, const float upper,
+                                unsigned schan, unsigned echan) = 0;
+
+      virtual void detect_tscr (const TimeSeries* input,
+                                const TimeSeries * input_tscr,
+                                BitSeries* output,
+                                float upper, float lower) = 0;
+ 
+      virtual int count_mask (const BitSeries* output) = 0;
+
+      virtual float * get_estimates (const TimeSeries* input) = 0;
+
+      virtual unsigned char * get_zapmask (const BitSeries* input) = 0;
+
+      virtual void mask (BitSeries* mask, const TimeSeries * in, TimeSeries* out, unsigned M) = 0;
+
+      virtual void insertsk (const TimeSeries* input, TimeSeries* out, unsigned M) = 0;
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/TFPFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TFPFilterbank.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/TFPFilterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TFPFilterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/TFPFilterbank.h,v $
-   $Revision: 1.3 $
-   $Date: 2011/08/23 21:00:38 $
-   $Author: straten $ */
+// dspsr/Signal/General/dsp/TFPFilterbank.h
 
 #ifndef __TFPFilterbank_h
 #define __TFPFilterbank_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/TScrunchCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TScrunchCUDA.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/TScrunchCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TScrunchCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -23,8 +23,8 @@
     TScrunchEngine (cudaStream_t stream);
 
     void fpt_tscrunch (const dsp::TimeSeries * input, 
-                         dsp::TimeSeries * output,
-                         unsigned sfactor);
+                       dsp::TimeSeries * output,
+                       unsigned sfactor);
 
   protected:
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/UnderSamplingBench.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/UnderSamplingBench.h
--- bl-dspsr-0+git20160405/Signal/General/dsp/UnderSamplingBench.h	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/UnderSamplingBench.h	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,39 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifndef __UnderSamplingBench_h_
+#define __UnderSamplingBench_h_
+
+#include "FTransformBench.h"
+
+namespace dsp {
+
+  //! Stores UnderSampling benchmark data
+  class UnderSamplingBench : public FTransform::Bench
+  {
+  public:
+    //! Verbosity flag shared by all instances (static)
+    static bool verbose;
+
+    //! Construct from installed benchmarks
+    UnderSamplingBench (const std::string& library);
+
+    //! Set the number of channels
+    void set_nchan (unsigned);
+
+  protected:
+    // NOTE(review): presumably the values given to set_nchan and the constructor -- confirm
+    unsigned nchan;
+    std::string library;
+
+    void load () const;
+    void load (const std::string& library, const std::string& filename) const;
+  };
+}
+
+#endif
diff -Nru bl-dspsr-0+git20160405/Signal/General/fftbatch_speed.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/fftbatch_speed.C
--- bl-dspsr-0+git20160405/Signal/General/fftbatch_speed.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/fftbatch_speed.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,210 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include "CUFFTError.h"
+
+#include "CommandLine.h"
+#include "RealTimer.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+
+// Benchmark state: options filled in by parseOptions and read by runTest
+class Speed : public Reference::Able
+{
+public:
+  Speed ();
+
+  // parse command line options
+  void parseOptions (int argc, char** argv);
+
+  // run the test
+  void runTest ();
+
+protected:
+
+  int npt;            // number of points in each FFT (-n)
+  int niter;          // number of iterations, batch/loops (-t)
+  unsigned gpu_id;    // GPU device ID (-d)
+  bool cuda;          // set by the "cuda" option
+};
+
+
+// Defaults: 1024-point FFTs, 16 iterations, GPU device 0, CUDA flag off
+Speed::Speed ()
+  : npt (1024),
+    niter (16),
+    gpu_id (0),
+    cuda (false)
+{ }
+
+int main(int argc, char** argv) try  // function-try-block: the handler below covers the whole body
+{
+  Speed speed;
+  speed.parseOptions (argc, argv);
+  speed.runTest ();
+  return 0;
+}
+catch (Error& error)
+{
+  cerr << error << endl;  // report the Error on stderr
+  return -1;              // non-zero exit status when an Error was thrown
+}
+
+// Register the command line options and parse argv into the members
+void Speed::parseOptions (int argc, char** argv)
+{
+  CommandLine::Menu menu;
+  CommandLine::Argument* arg;
+  menu.set_help_header ("fftbatch_speed - compare looped and batched FFT speed");
+  menu.set_version ("fftbatch_speed version 1.0");
+
+  arg = menu.add (npt, 'n', "npt");
+  arg->set_help ("number of points in each FFT");
+
+  arg = menu.add (gpu_id, 'd');
+  arg->set_help ("GPU device ID");
+
+  // NOTE(review): "ninter" looks like a typo of "niter"; kept so existing usage still works
+  arg = menu.add (niter, 't', "ninter");
+  arg->set_help ("number of iterations (batch/loops)");
+
+  arg = menu.add (cuda, "cuda");
+  arg->set_help ("benchmark CUDA");
+  menu.parse (argc, argv);
+}
+
+void check_error_stream (const char*, cudaStream_t);
+
+// Time niter separate FFT executions against one batched execution of niter FFTs
+void Speed::runTest ()
+{
+#ifdef _DEBUG
+  dsp::Operation::verbose = true;
+  dsp::Observation::verbose = true;
+#endif
+
+  cudaStream_t stream = 0;
+  if (cuda)
+  {
+    cerr << "using GPU " << gpu_id << endl;
+    cudaError_t err = cudaSetDevice(gpu_id);
+    if (err != cudaSuccess)
+      throw Error (InvalidState, "Speed::runTest",
+                   "cudaSetDevice failed: %s", cudaGetErrorString(err));
+
+    err = cudaStreamCreate( &stream );
+    if (err != cudaSuccess)
+      throw Error (InvalidState, "Speed::runTest",
+                   "cudaStreamCreate failed: %s", cudaGetErrorString(err));
+  }
+
+  // widen before multiplying so a large npt*niter cannot wrap a 32-bit unsigned
+  const size_t ndat = size_t(npt) * size_t(niter);
+  const size_t nbytes = ndat * sizeof (cufftComplex);
+
+  cufftComplex * input = 0;
+  cufftComplex * output = 0;
+  cufftResult result;
+  size_t work_size;
+
+  // stop here if either device buffer cannot be allocated
+  cudaError_t status = cudaMalloc ((void **) &input, nbytes);
+  if (status == cudaSuccess)
+    status = cudaMalloc ((void **) &output, nbytes);
+  if (status != cudaSuccess)
+    throw Error (InvalidState, "Speed::runTest",
+                 "cudaMalloc failed: %s", cudaGetErrorString(status));
+
+  cudaMemsetAsync ((void *) input, 0, nbytes, stream);
+  cudaMemsetAsync ((void *) output, 0, nbytes, stream);
+
+  // setup loop based FFT plan
+  cufftHandle plan_loop;
+
+  result = cufftCreate (&plan_loop);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_loop)");
+
+  result = cufftMakePlan1d (plan_loop, npt, CUFFT_C2C, 1, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlan1D (plan_loop)");
+
+  result = cufftSetStream (plan_loop, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_loop)");
+
+  // setup batch based FFT plan
+  cufftHandle plan_batch;
+
+  result = cufftCreate (&plan_batch);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_batch)");
+
+  int rank = 1;
+  result = cufftMakePlanMany (plan_batch, rank, &npt, NULL, 0, 0, NULL, 0, 0,
+                              CUFFT_C2C, niter, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_batch)");
+
+  result = cufftSetStream (plan_batch, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_batch)");
+
+  RealTimer timer_loop;
+  RealTimer timer_batch;
+
+  cudaStreamSynchronize (stream);
+  timer_loop.start ();
+
+  for (int i=0; i<niter; i++)
+  {
+    result = cufftExecC2C (plan_loop, input, output, CUFFT_FORWARD);
+    if (result != CUFFT_SUCCESS)
+      throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_loop)");
+    cudaStreamSynchronize(stream);
+  }
+
+  timer_loop.stop ();
+
+  double total_time = timer_loop.get_elapsed();
+  double time_per_fft = total_time / niter;
+  double time_us = time_per_fft * 1e6;
+  cerr << "LOOP: total_time=" << total_time << " time_per_fft=" << time_per_fft 
+       << " time_us=" << time_us << endl;
+
+  timer_batch.start ();
+
+  result = cufftExecC2C (plan_batch, input, output, CUFFT_FORWARD);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_batch)");
+  cudaStreamSynchronize(stream);
+
+  timer_batch.stop ();
+
+  total_time = timer_batch.get_elapsed();
+  time_per_fft = total_time / niter;
+  time_us = time_per_fft * 1e6;
+  cerr << "BATCH: total_time=" << total_time << " time_per_fft=" << time_per_fft 
+       << " time_us=" << time_us << endl;
+
+  cufftDestroy(plan_loop);
+  cufftDestroy(plan_batch);
+  cudaFree(input);
+  cudaFree(output);
+  if (cuda) cudaStreamDestroy (stream);  // only a stream created above is destroyed
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/FilterbankCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/FilterbankCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/FilterbankCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/FilterbankCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -103,11 +103,6 @@
 			"cufftPlan1d(plan_fwd, CUFFT_C2C)");
   }
 
-  result = cufftSetCompatibilityMode(plan_fwd, CUFFT_COMPATIBILITY_NATIVE);
-  if (result != CUFFT_SUCCESS)
-    throw CUFFTError (result, "CUDA::FilterbankEngine::setup",
-		      "cufftSetCompatibilityMode(plan_fwd)");
-
   DEBUG("CUDA::FilterbankEngine::setup setting stream=" << stream);
   result = cufftSetStream (plan_fwd, stream);
   if (result != CUFFT_SUCCESS)
@@ -123,12 +118,6 @@
       throw CUFFTError (result, "CUDA::FilterbankEngine::setup", 
 			"cufftPlan1d(plan_bwd)");
 
-    // optimal performance for CUFFT regarding data layout
-    result = cufftSetCompatibilityMode(plan_bwd, CUFFT_COMPATIBILITY_NATIVE);
-    if (result != CUFFT_SUCCESS)
-      throw CUFFTError (result, "CUDA::FilterbankEngine::setup",
-			"cufftSetCompatibilityMode(plan_bwd)");
-
     result = cufftSetStream (plan_bwd, stream);
     if (result != CUFFT_SUCCESS)
       throw CUFFTError (result, "CUDA::FilterbankEngine::setup",
diff -Nru bl-dspsr-0+git20160405/Signal/General/FScrunch.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/FScrunch.C
--- bl-dspsr-0+git20160405/Signal/General/FScrunch.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/FScrunch.C	2018-03-12 23:02:35.000000000 +0000
@@ -80,9 +80,6 @@
     return;
   }
 
-  if (input->get_ndat() == 0)
-    return;
-
   if( !input->get_detected() )
     throw Error(InvalidState,"dsp::FScrunch::transformation()",
 		"invalid input state: " + tostring(input->get_state()));
diff -Nru bl-dspsr-0+git20160405/Signal/General/LaunchConfig.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LaunchConfig.C
--- bl-dspsr-0+git20160405/Signal/General/LaunchConfig.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LaunchConfig.C	2018-03-12 23:02:35.000000000 +0000
@@ -17,6 +17,31 @@
   cudaGetDeviceProperties (&device_properties, device);
 }
 
+//! Maximum number of threads per block, as reported in device_properties
+size_t CUDA::LaunchConfig::get_max_threads_per_block ()
+{
+  // a negative device index is reported as "not initialized"
+  if (device < 0)
+  {
+    throw Error (InvalidState, "CUDA::LaunchConfig::get_max_threads_per_block",
+                 "not initialized");
+  }
+  return device_properties.maxThreadsPerBlock;
+}
+
+//! Shared memory available per block, as reported in device_properties
+size_t CUDA::LaunchConfig::get_max_shm ()
+{
+  // a negative device index is reported as "not initialized"
+  if (device < 0)
+  {
+    throw Error (InvalidState, "CUDA::LaunchConfig::get_max_shm",
+                 "not initialized");
+  }
+  return device_properties.sharedMemPerBlock;
+}
+
+
 void CUDA::LaunchConfig1D::set_nelement (unsigned N)
 {
   unsigned max_nthread = device_properties.maxThreadsPerBlock;
diff -Nru bl-dspsr-0+git20160405/Signal/General/LoadToFil.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFil.C
--- bl-dspsr-0+git20160405/Signal/General/LoadToFil.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFil.C	2018-03-12 23:02:35.000000000 +0000
@@ -176,12 +176,12 @@
     if ( config->filterbank.get_nchan() )
     {
       if (verbose)
-	cerr << "digifil: creating " << config->filterbank.get_nchan()
-	     << " channel filterbank" << endl;
+        cerr << "digifil: creating " << config->filterbank.get_nchan()
+             << " channel filterbank" << endl;
 
       if ( config->coherent_dedisp )
       {
-	cerr << "digifil: using coherent dedispersion" << endl;
+        cerr << "digifil: using coherent dedispersion" << endl;
 
         kernel = new Dedispersion;
 
@@ -198,35 +198,37 @@
           || config->coherent_dedisp 
           || (config->npol>2) )
       {
-	cerr << "digifil: using convolving filterbank" << endl;
+        cerr << "digifil: using convolving filterbank" << endl;
 
-	filterbank = new Filterbank;
+        filterbank = new Filterbank;
 
-	filterbank->set_nchan( config->filterbank.get_nchan() );
-	filterbank->set_input( timeseries );
+        filterbank->set_nchan( config->filterbank.get_nchan() );
+        filterbank->set_input( timeseries );
         filterbank->set_output( timeseries = new_TimeSeries() );
 
         if (kernel)
           filterbank->set_response( kernel );
 
-	if ( config->filterbank.get_freq_res() ) 
+        if ( config->filterbank.get_freq_res() ) 
           filterbank->set_frequency_resolution ( 
               config->filterbank.get_freq_res() );
 
-	operations.push_back( filterbank.get() );
-	do_detection = true;
+        operations.push_back( filterbank.get() );
+        do_detection = true;
       }
       else
       {
-	filterbank = new TFPFilterbank;
+        filterbank = new TFPFilterbank;
 
-	filterbank->set_nchan( config->filterbank.get_nchan() );
-	filterbank->set_input( timeseries );
-	filterbank->set_output( timeseries = new_TimeSeries() );
+        filterbank->set_nchan( config->filterbank.get_nchan() );
+        filterbank->set_input( timeseries );
+        filterbank->set_output( timeseries = new_TimeSeries() );
 
-	operations.push_back( filterbank.get() );
+        operations.push_back( filterbank.get() );
       }
     }
+    else
+      do_detection = true;
 
     if ( config->dedisperse )
     {
@@ -245,9 +247,9 @@
     if (do_detection)
     {
       if (verbose)
-	cerr << "digifil: creating detection operation (npol=" <<
+        cerr << "digifil: creating detection operation (npol=" <<
           config->npol << ")" << endl;
-      
+  
       Detection* detection = new Detection;
 
       detection->set_input( timeseries );
diff -Nru bl-dspsr-0+git20160405/Signal/General/LoadToFITS.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFITS.C
--- bl-dspsr-0+git20160405/Signal/General/LoadToFITS.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFITS.C	2018-03-12 23:02:35.000000000 +0000
@@ -82,6 +82,8 @@
   rescale_seconds = -1;
   rescale_constant = false;
 
+  integration_length = 0;
+
   nbits = 2;
 
   npol = 4;
@@ -93,6 +95,12 @@
   weighted_time_series = false;
 }
 
+// set block_size to result in approximately this much RAM usage
+void dsp::LoadToFITS::Config::set_maximum_RAM (uint64_t ram)
+{
+  maximum_RAM = ram;
+}
+
 void dsp::LoadToFITS::Config::set_quiet ()
 {
   SingleThread::Config::set_quiet();
@@ -135,7 +143,6 @@
   if (!config->dedisperse && unpacker->get_order_supported (config->order))
     unpacker->set_output_order (config->order);
 
-
   // get basic information about the observation
 
   Observation* obs = manager->get_info();
@@ -149,31 +156,46 @@
     cerr << "Source = " << obs->get_source() << endl;
     cerr << "Frequency = " << obs->get_centre_frequency() << endl;
     cerr << "Bandwidth = " << obs->get_bandwidth() << endl;
+    cerr << "Channels = " << nchan << endl;
     cerr << "Sampling rate = " << rate << endl;
     cerr << "State = " << tostring(obs->get_state()) <<endl;
   }
 
   obs->set_dispersion_measure( config->dispersion_measure );
 
-  // Strategy will be to tscrunch from Nyquist resolution to desired reso.
-
-  // voltage samples per filterbank sample
-  double samp_per_fb = config->tsamp * rate;
-  if (verbose)
-    cerr << "voltage samples per filterbank sample="<<samp_per_fb << endl;
-  // correction for number of samples per filterbank channel
-  double factor = obs->get_state() == Signal::Nyquist? 0.5 : 1.0;
   unsigned fb_nchan = config->filterbank.get_nchan();
-  unsigned tres_factor = round(factor*samp_per_fb/fb_nchan);
-  double tsamp = tres_factor/factor*fb_nchan/rate;
+  uint64_t nsample;   // voltage samples per output block (64-bit range)
+  double tsamp, samp_per_fb;
+  unsigned tres_factor;
+  double factor = obs->get_state() == Signal::Nyquist? 0.5 : 1.0;
 
-  cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl << "             actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl;
+  if (fb_nchan > 0)
+  {
+    // Strategy will be to tscrunch from Nyquist resolution to desired reso.
+    // voltage samples per filterbank sample
+    samp_per_fb = config->tsamp * rate;
+    if (verbose)
+      cerr << "voltage samples per filterbank sample="<<samp_per_fb << endl;
+    // correction for number of samples per filterbank channel
+    tres_factor = round(factor*samp_per_fb/fb_nchan);
+    tsamp = tres_factor/factor*fb_nchan/rate;
+
+    // voltage samples per output block
+    nsample = round(samp_per_fb * config->nsblk);
+  }
+  else
+  {
+    samp_per_fb = 1.0;
+    tres_factor = round(rate * config->tsamp);
+    tsamp = tres_factor/factor * 1/rate;
+    nsample = config->nsblk * tres_factor;
+  }
+
+  cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl 
+       << "             actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl;
   if (verbose)
     cerr << "digifits: nsblk=" << config->nsblk << endl;
 
-  // voltage samples per output block
-  uint64_t nsample = round(samp_per_fb * config->nsblk);
-
   // the unpacked input will occupy nbytes_per_sample
   double nbytes_per_sample = sizeof(float) * nchan * npol * ndim;
   double MB = 1024.0 * 1024.0;
@@ -181,6 +203,10 @@
   // ideally, block size would be a full output block, but this is too large
   // pick a nice fraction that will divide evently into maximum RAM
   // NB this doesn't account for copies (yet)
+
+  if (verbose)
+    cerr << "digifits: nsample * nbytes_per_sample=" << nsample * nbytes_per_sample 
+         << " config->maximum_RAM=" << config->maximum_RAM << endl;
   while (nsample * nbytes_per_sample > config->maximum_RAM) nsample /= 2;
 
   if (verbose)
@@ -202,65 +228,74 @@
 
   if (!obs->get_detected())
   {
-
-    if ( !config->filterbank.get_nchan() )
-      throw Error(InvalidParam,"dsp::LoadToFITS::construct",
-          "must specify filterbank scheme if data are not detected");
-
-    // If user specifies -FN:D, enable coherent dedispersion
-    if ( config->filterbank.get_convolve_when() == 
-        Filterbank::Config::During )
-      config->coherent_dedisp = true;
-
-    if ( (config->coherent_dedisp) && (config->dispersion_measure != 0.0) )
+    // if no filterbank specified
+    if (fb_nchan == 0)
     {
-      cerr << "digifits: using coherent dedispersion" << endl;
-
-      // "During" is the only option, my friends
-      config->filterbank.set_convolve_when( Filterbank::Config::During );
-
-      kernel = new Dedispersion;
-      kernel->set_dispersion_measure( config->dispersion_measure );
-
-      if (config->filterbank.get_freq_res())
-        kernel -> set_times_minimum_nfft (config->filterbank.get_freq_res () );
-        //kernel->set_frequency_resolution (
-        //    config->filterbank.get_freq_res());
-
+      if (nchan == 1)
+        throw Error(InvalidParam,"dsp::LoadToFITS::construct",
+            "must specify filterbank scheme if single channel data");
+      else
+        if (verbose)
+          cerr << "digifits: no filterbank specified" << endl;
     }
-    else config->coherent_dedisp = false;
+    else
+    {
+      // If user specifies -FN:D, enable coherent dedispersion
+      if ( config->filterbank.get_convolve_when() == 
+          Filterbank::Config::During )
+        config->coherent_dedisp = true;
+
+      if ( (config->coherent_dedisp) && (config->dispersion_measure != 0.0) )
+      {
+        cerr << "digifits: using coherent dedispersion" << endl;
+
+        // "During" is the only option, my friends
+        config->filterbank.set_convolve_when( Filterbank::Config::During );
+
+        kernel = new Dedispersion;
+        kernel->set_dispersion_measure( config->dispersion_measure );
+
+        if (config->filterbank.get_freq_res())
+          kernel -> set_times_minimum_nfft (config->filterbank.get_freq_res () );
+          //kernel->set_frequency_resolution (
+          //    config->filterbank.get_freq_res());
+
+      }
+      else 
+        config->coherent_dedisp = false;
 
 # if HAVE_CUDA
-    if (run_on_gpu)
-    {
-      timeseries->set_memory (device_memory);
-      config->filterbank.set_device ( device_memory.ptr() );
-      config->filterbank.set_stream ( gpu_stream );
-    }
+      if (run_on_gpu)
+      {
+        timeseries->set_memory (device_memory);
+        config->filterbank.set_device ( device_memory.ptr() );
+        config->filterbank.set_stream ( gpu_stream );
+      }
 #endif
 
-    filterbank = config->filterbank.create ();
+      filterbank = config->filterbank.create ();
+
+      filterbank->set_nchan( config->filterbank.get_nchan() );
+      filterbank->set_input( timeseries );
+      filterbank->set_output( timeseries = new_TimeSeries() );
 
-    filterbank->set_nchan( config->filterbank.get_nchan() );
-    filterbank->set_input( timeseries );
-    filterbank->set_output( timeseries = new_TimeSeries() );
 # if HAVE_CUDA
-    if (run_on_gpu)
-      timeseries->set_memory (device_memory);
+      if (run_on_gpu)
+        timeseries->set_memory (device_memory);
 #endif
 
-    if (kernel)
-      filterbank->set_response( kernel );
+      if (kernel)
+        filterbank->set_response( kernel );
 
-    if ( !config->coherent_dedisp )
-    {
-      unsigned freq_res = config->filterbank.get_freq_res();
-      if (freq_res > 1)
-        filterbank->set_frequency_resolution ( freq_res );
-    }
-
-    operations.push_back( filterbank.get() );
+      if ( !config->coherent_dedisp )
+      {
+        unsigned freq_res = config->filterbank.get_freq_res();
+        if (freq_res > 1)
+          filterbank->set_frequency_resolution ( freq_res );
+      }
 
+      operations.push_back( filterbank.get() );
+    }
       if (verbose)
 	      cerr << "digifits: creating detection operation" << endl;
       
@@ -441,6 +476,7 @@
   FITSOutputFile* outputfile = new FITSOutputFile (output_filename);
   outputfile->set_nsblk (config->nsblk);
   outputfile->set_nbit (config->nbits);
+  outputfile->set_max_length (config->integration_length);
   outputFile = outputfile;
   outputFile->set_input (bitseries);
 
@@ -462,8 +498,11 @@
 
   unsigned freq_res = config->coherent_dedisp? kernel->get_frequency_resolution() : config->filterbank.get_freq_res();
   if (freq_res == 0) freq_res = 1;
-  cerr << "digifits: creating " << config->filterbank.get_nchan()
-       << " by " << freq_res << " back channel filterbank" << endl;
+  if (config->filterbank.get_nchan())
+    cerr << "digifits: creating " << config->filterbank.get_nchan()
+         << " by " << freq_res << " back channel filterbank" << endl;
+  else
+    cerr << "digifits: processing " << manager->get_info()->get_nchan() << " channels" << endl;
   
   // TODO -- set an optimal block size for search mode
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Signal/General/Makefile.am
--- bl-dspsr-0+git20160405/Signal/General/Makefile.am	2018-03-12 08:32:59.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -21,7 +21,8 @@
 	dsp/TFPFilterbank.h dsp/RFIZapper.h dsp/SKFilterbank.h	       \
 	dsp/Resize.h dsp/SKDetector.h dsp/SKMasker.h		       \
 	dsp/Pipeline.h dsp/SingleThread.h dsp/MultiThread.h            \
-	dsp/PolnSelect.h dsp/PolnReshape.h
+	dsp/PolnSelect.h dsp/PolnReshape.h dsp/SpectralKurtosis.h \
+  dsp/SKComputer.h
 
 libdspdsp_la_SOURCES = optimize_fft.c cross_detect.c cross_detect.h  \
 	cross_detect.ic stokes_detect.c stokes_detect.h		     \
@@ -38,25 +39,54 @@
 	TFPFilterbank.C RFIZapper.C SKFilterbank.C \
 	Resize.C SKDetector.C SKMasker.C \
 	SingleThread.C MultiThread.C dsp_verbosity.C \
-	PolnSelect.C PolnReshape.C
+	PolnSelect.C PolnReshape.C SpectralKurtosis.C
+
+bin_PROGRAMS = dmsmear digitxt digimon digihist filterbank_speed
 
 libdspdsp_la_LIBADD =
 
 if HAVE_CUFFT
 
 nobase_include_HEADERS += CUFFTError.h dsp/LaunchConfig.h \
-	dsp/FilterbankCUDA.h dsp/filterbank_cuda.h \
-	dsp/TransferCUDA.h dsp/TransferBitSeriesCUDA.h \
-    dsp/SKMaskerCUDA.h dsp/DetectionCUDA.h dsp/FZoomCUDA.h \
-    dsp/FScrunchCUDA.h dsp/TScrunchCUDA.h
+  dsp/FilterbankCUDA.h dsp/filterbank_cuda.h \
+  dsp/TransferCUDA.h dsp/TransferBitSeriesCUDA.h \
+  dsp/SKMaskerCUDA.h dsp/DetectionCUDA.h dsp/FZoomCUDA.h \
+  dsp/FScrunchCUDA.h dsp/TScrunchCUDA.h \
+  dsp/PScrunchCUDA.h dsp/PolnSelectCUDA.h \
+  dsp/ConvolutionCUDA.h dsp/ConvolutionCUDASpectral.h \
+  dsp/ConvolutionCUDACallbacks.h dsp/SpectralKurtosisCUDA.h \
+  dsp/SKComputerCUDA.h dsp/SKDetectorCUDA.h dsp/SKFilterbankCUDA.h
 
 libdspdsp_la_SOURCES += CUFFTError.C LaunchConfig.C FilterbankCUDA.cu \
-    TransferCUDA.C TransferBitSeriesCUDA.C DetectionCUDA.cu           \
-    SKMaskerCUDA.cu FZoomCUDA.cu FScrunchCUDA.cu TScrunchCUDA.cu
+  TransferCUDA.C TransferBitSeriesCUDA.C DetectionCUDA.cu             \
+  SKMaskerCUDA.cu FZoomCUDA.cu FScrunchCUDA.cu TScrunchCUDA.cu        \
+  PScrunchCUDA.cu PolnSelectCUDA.cu SpectralKurtosisCUDA.cu \
+  SKComputerCUDA.cu SKDetectorCUDA.cu SKFilterbankCUDA.cu \
+  ConvolutionCUDA.cu ConvolutionCUDASpectral.cu ConvolutionCUDACallbacks.cu
+
+bin_PROGRAMS += fftbatch_speed
+fftbatch_speed_SOURCES = fftbatch_speed.C
+
+if HAVE_CUFFT_CALLBACKS
+bin_PROGRAMS += cufft_callback_bench
+
+
+cufft_callback_bench_DC.o: cufft_callback_bench.o
+	$(CUDA_NVCC) -o cufft_callback_bench_DC.o -dlink cufft_callback_bench.o -lcufft_static
+
+ConvolutionCUDACallbacks.lo: ConvolutionCUDACallbacks.cu
+	$(top_srcdir)/config/cudalt.py $(top_builddir)/libtool $@ $(CUDA_NVCC) -dc -c $<
+	$(CUDA_NVCC) -o ConvolutionCUDACallbacks_DC.o -dlink ConvolutionCUDACallbacks.o -lcufft_static
+
+cufft_callback_bench.lo: cufft_callback_bench.cu
+	$(top_srcdir)/config/cudalt.py $(top_builddir)/libtool $@ $(CUDA_NVCC) -dc -c $<
+	$(CUDA_NVCC) -o cufft_callback_bench_DC.o -dlink cufft_callback_bench.o -lcufft_static
+
+cufft_callback_bench_LDADD = $(LDADD) cufft_callback_bench_DC.o -lcudart -lcufft_static -lculibos
 
 endif
+endif
 
-bin_PROGRAMS = dmsmear digitxt digimon digihist filterbank_speed
 
 dmsmear_SOURCES = dmsmear.C 
 digitxt_SOURCES = digitxt.C
@@ -86,6 +116,10 @@
   passband_SOURCES = passband.C
   passband_LDADD = @PSRPLOT_LIBS@ $(LDADD)
 
+if HAVE_CUFFT_CALLBACKS
+  digistat_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart -lcufft_static -lculibos
+  passband_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart -lcufft_static -lculibos
+endif
   #
   # end PGPLOT-specific code
   #
@@ -128,11 +162,17 @@
   bin_PROGRAMS += digifil
   digifil_SOURCES = digifil.C
 
+
 if HAVE_dada
   bin_PROGRAMS += the_decimator
   the_decimator_SOURCES = the_decimator.C
   the_decimator_LDADD = $(LDADD) @OPENSSL_LIBS@ @PSRXML_LIBS@ @PSRDADA_LIBS@
   the_decimator_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS) @PSRXML_CFLAGS@ @PSRDADA_CFLAGS@
+
+if HAVE_CUFFT_CALLBACKS
+  the_decimator_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart -lcufft_static -lculibos
+endif
+
 endif
 
   #
@@ -151,9 +191,8 @@
 LDADD = libdspdsp.la \
 	$(top_builddir)/Kernel/libdspbase.la \
 	$(top_builddir)/Signal/Statistics/libdspstats.la \
-	@PGPLOT_LIBS@ @CUFFT_LIBS@ @CUDA_LIBS@
+	@PGPLOT_LIBS@ @CUDA_LIBS@ @CUFFT_LIBS@
 
 AM_CPPFLAGS += @PGPLOT_CFLAGS@ @CUFFT_CFLAGS@ @CFITSIO_CFLAGS@
 AM_CXXFLAGS = @OPENMP_CFLAGS@
 
-
diff -Nru bl-dspsr-0+git20160405/Signal/General/passband.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/passband.C
--- bl-dspsr-0+git20160405/Signal/General/passband.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/passband.C	2018-03-12 23:02:35.000000000 +0000
@@ -5,6 +5,10 @@
  *
  ***************************************************************************/
 
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
 #include "dsp/Bandpass.h"
 #include "dsp/RFIFilter.h"
 
@@ -23,6 +27,11 @@
 #include "pgutil.h"
 #include "Error.h"
 
+#if HAVE_fits
+#include "dsp/FITSFile.h"
+#include "dsp/FITSUnpacker.h"
+#endif
+
 #include <cpgplot.h>
 #include <iostream>
 #include <unistd.h>
@@ -36,11 +45,13 @@
     "Options:\n"
     " -b         plot frequency bins (histogram style) \n"
     " -c cmap    set the colour map (0 to 7) \n"
+    " -D         set the PGPLOT device\n"
     " -d         produce dynamic spectrum (greyscale) \n"
     " -F min,max set the min,max x-value (e.g. frequency zoom) \n" 
     " -r min,max set the min,max y-value (e.g. saturate birdies) \n"
     " -n nchan   number of frequency channels in each spectrum \n"
     " -t seconds integration interval for each spectrum \n"
+    " -s         quit after a single integration \n"
     " -p         detect the full-polarization bandpass \n"
     " -R         test RFIFilter class \n"
        << endl;
@@ -66,6 +77,9 @@
   // integration length
   double integrate = 1.0;
 
+  // quit after a single integration (for quicklook)
+  bool single_quit = false;
+
   // seek into file
   double seek_seconds = 0.0;
 
@@ -101,7 +115,7 @@
   int width_pixels  = 0;
   int height_pixels = 0;
 
-  static const char* args = "ibB:c:dD:f:F:G:g:lr:n:pRS:T:t:hvV";
+  static const char* args = "ibB:c:dD:f:F:G:g:lr:n:pRS:T:t:shvV";
 
   while ((c = getopt(argc, argv, args)) != -1)
     switch (c) {
@@ -181,6 +195,10 @@
       integrate = atof (optarg);
       break;
 
+    case 's':
+      single_quit = true;
+      break;
+
     case 'h':
       usage ();
       return 0;
@@ -304,6 +322,30 @@
       cerr << "opening data file " << filenames[ifile] << endl;
       
     manager->open (filenames[ifile]);
+
+#if HAVE_fits
+    // PSRFITS input: let the file notify the unpacker of scales/offsets
+    if (manager->get_info()->get_machine() == "FITS")
+    {
+      if (dsp::Operation::verbose)
+        cerr << "Using callback to read PSRFITS file." << endl;
+      // connect a callback (only possible if both casts succeed)
+      bool success = false;
+      dsp::FITSUnpacker* funp = dynamic_cast<dsp::FITSUnpacker*> (
+          manager->get_unpacker());
+      dsp::FITSFile* ffile = dynamic_cast<dsp::FITSFile*> (
+          manager->get_input());
+      if (dsp::Operation::verbose)
+        cerr << "passband: unpacker=" << funp << " input=" << ffile << endl;
+      if (funp && ffile)
+      {
+        ffile->update.connect ( funp, &dsp::FITSUnpacker::set_parameters );
+        success = true;
+      }
+      if (not success)
+        cerr << "dspsr: WARNING: FITS input but unable to apply scales and offsets." << endl;
+    }
+#endif
     
     if (verbose)
       cerr << "data file " << filenames[ifile] << " opened" << endl;
@@ -409,6 +451,8 @@
 
 	passband->reset_output();
       }
+      if (single_quit)
+        break;
     }
 
     if (dynamic)
diff -Nru bl-dspsr-0+git20160405/Signal/General/PolnSelectCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/PolnSelectCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/PolnSelectCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/PolnSelectCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,151 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/PolnSelectCUDA.h"
+
+#include "Error.h"
+#include "debug.h"
+
+#include <memory>
+
+#include <string.h>
+
+using namespace std;
+
+void check_error (const char*);
+
+CUDA::PolnSelectEngine::PolnSelectEngine (cudaStream_t _stream)
+{
+  stream = _stream;
+}
+
+CUDA::PolnSelectEngine::~PolnSelectEngine ()
+{
+}
+
+//! get cuda device properties
+void CUDA::PolnSelectEngine::setup()
+{
+  gpu_config.init();
+}
+
+
+//
+// each thread copies a single value from the input to the output;
+// blockIdx.y selects the channel via the in/out channel spans
+//
+__global__ void fpt_polnselect_kernel (float * in, float * out, 
+                                       uint64_t in_chan_span,
+                                       uint64_t out_chan_span, 
+                                       uint64_t in_ndat)
+{
+  // ichan: blockIdx.y, idx: sample index within the channel
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx >= in_ndat)
+    return;
+
+  out[blockIdx.y * out_chan_span + idx] = in[blockIdx.y * in_chan_span + idx];
+}
+
+void CUDA::PolnSelectEngine::fpt_polnselect (int ipol,
+                                             const dsp::TimeSeries* input, 
+                                             dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PolnSelectEngine::fpt_polnselect",
+     "cannot handle in-place data");
+
+  const uint64_t ndat  = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+
+  if (npol != 2)
+    throw Error (InvalidParam, "CUDA::PolnSelectEngine::fpt_polnselect",
+      "number of input polarisations must be two");
+
+  uint64_t in_chan_span = 0;
+  uint64_t out_chan_span = 0;
+  if (nchan > 1)
+  {
+    in_chan_span = input->get_datptr (1, 0) - input->get_datptr (0, 0);
+    out_chan_span = output->get_datptr (1, 0) - output->get_datptr (0, 0);
+  }
+
+  // TODO (idea) this could be changed to a bunch of memcpy's in low nchan case
+
+  float * in  = (float *) input->get_datptr (0, ipol);
+  float * out = output->get_datptr (0, 0);
+
+#ifdef _DEBUG
+  cerr << "CUDA::PolnSelectEngine::fpt_polnselect channel spans: input=" << in_chan_span << " output=" << out_chan_span << endl;
+#endif
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  dim3 blocks (ndat / threads.x, nchan);
+
+  if (ndat % threads.x)
+    blocks.x ++;
+
+  // channel spans are passed as a number of floats (float pointer differences)
+  fpt_polnselect_kernel<<<blocks,threads,0,stream>>> (in, out, in_chan_span, out_chan_span, ndat); 
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error ("CUDA::PolnSelectEngine::fpt_polnselect");
+}
+
+
+// each thread copies one channel of one time sample (in is indexed with npol=2)
+__global__ void tfp_polnselect_kernel (float * in, float * out, unsigned nchan)
+{
+  // isamp == blockIdx.y
+  // ichan == blockIdx.x * blockDim.x  + threadIdx.x
+  const unsigned isamp = blockIdx.y;
+  const unsigned ichan = (blockIdx.x * blockDim.x + threadIdx.x);
+  const unsigned npol  = 2;
+
+  if (ichan >= nchan)
+    return;
+
+  // 64-bit offsets: isamp * nchan * npol can exceed the range of unsigned
+  const uint64_t odx = ((uint64_t) isamp * nchan) + ichan;
+  const uint64_t idx = odx * npol;
+
+  out[odx] = in[idx];
+}
+
+void CUDA::PolnSelectEngine::tfp_polnselect (int ipol,
+                                             const dsp::TimeSeries* input,
+                                             dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PolnSelectEngine::tfp_polnselect",
+     "cannot handle in-place data");
+
+  const uint64_t ndat  = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+
+  if (npol != 2)
+    throw Error (InvalidParam, "CUDA::PolnSelectEngine::tfp_polnselect",
+      "number of input polarisations must be two");
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  if (nchan < gpu_config.get_max_threads_per_block())
+    threads.x = nchan;
+
+  dim3 blocks (nchan/threads.x, ndat);
+  if (nchan % threads.x)
+    blocks.x++;
+
+  // offset the TFP input by ipol so the kernel reads the selected polarisation
+  float * in_base = (float *) input->get_dattfp () + ipol;
+  float * out_base = (float *) output->get_dattfp ();
+
+  tfp_polnselect_kernel<<<blocks,threads,0,stream>>> (in_base, out_base, nchan);
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/PScrunchCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/PScrunchCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/PScrunchCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/PScrunchCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,159 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/PScrunchCUDA.h"
+
+#include "Error.h"
+#include "debug.h"
+
+#include <memory>
+
+#include <string.h>
+
+using namespace std;
+
+void check_error (const char*);
+
+CUDA::PScrunchEngine::PScrunchEngine (cudaStream_t _stream)
+{
+  stream = _stream;
+}
+
+CUDA::PScrunchEngine::~PScrunchEngine ()
+{
+}
+
+//! get cuda device properties
+void CUDA::PScrunchEngine::setup()
+{
+  gpu_config.init();
+}
+
+
+//
+// each thread reads a single value from each of the two polarisations,
+// adds them together and scales the sum by M_SQRT1_2
+//
+__global__ void fpt_pscrunch_kernel (float * in_p0,  float * in_p1,
+                                     float * out, uint64_t in_chan_span,
+                                     uint64_t out_chan_span, uint64_t in_ndat)
+{
+  // ichan: blockIdx.y, idx: sample index within the channel
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx >= in_ndat)
+    return;
+
+  // advance the input/output base pointers to this channel
+  in_p0 += (blockIdx.y * in_chan_span);
+  in_p1 += (blockIdx.y * in_chan_span);
+  out   += (blockIdx.y * out_chan_span);
+
+  out[idx] = (in_p0[idx] + in_p1[idx]) * M_SQRT1_2;
+}
+
+void CUDA::PScrunchEngine::fpt_pscrunch (const dsp::TimeSeries* input,
+                                         dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::fpt_pscrunch",
+     "cannot handle in-place data");
+
+  const uint64_t input_ndat  = input->get_ndat();
+  const unsigned input_nchan = input->get_nchan();
+  const unsigned input_npol  = input->get_npol();
+
+  if (input_npol != 2)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::fpt_pscrunch",
+      "number of input polarisations must be two");
+
+  uint64_t in_chan_span = 0;
+  uint64_t out_chan_span = 0;
+  if (input_nchan > 1)
+  {
+    in_chan_span = input->get_datptr (1, 0) - input->get_datptr (0, 0);
+    out_chan_span = output->get_datptr (1, 0) - output->get_datptr (0, 0);
+  }
+
+  float * in_pol0 = (float *) input->get_datptr (0, 0);
+  float * in_pol1 = (float *) input->get_datptr (0, 1);
+  float * out     = output->get_datptr (0, 0);
+
+#ifdef _DEBUG
+  cerr << "CUDA::PScrunchEngine::fpt_pscrunch channel spans: input=" << in_chan_span << " output=" << out_chan_span << endl;
+#endif
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  dim3 blocks (input_ndat / threads.x, input_nchan);
+
+  if (input_ndat % threads.x)
+    blocks.x ++;
+
+  // channel spans are passed as a number of floats (float pointer differences)
+  fpt_pscrunch_kernel<<<blocks,threads,0,stream>>> (in_pol0, in_pol1, out, in_chan_span, out_chan_span, input_ndat);
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error ("CUDA::PScrunchEngine::fpt_pscrunch");
+}
+
+
+// each block pscrunches 1 time sample for many channels
+__global__ void tfp_pscrunch_kernel (float * in, float * out, unsigned nchan)
+{
+  extern __shared__ float pscr_shm[];
+
+  // isamp    == blockIdx.y
+  // ichanpol == blockIdx.x * blockDim.x + threadIdx.x
+  // ichan    == ichanpol / 2, ipol == ichanpol % 2 (even/odd threads)
+  const unsigned isamp = blockIdx.y;
+  const unsigned ichanpol = (blockIdx.x * blockDim.x + threadIdx.x);
+  const unsigned ichan = ichanpol / 2;
+  const unsigned ipol  = ichanpol & 0x1;  // % 2
+  const unsigned npol  = 2;
+
+  // no early return: every thread of the block must reach __syncthreads()
+  const bool in_range = (ichanpol < nchan*npol);
+
+  const unsigned int idx = (isamp * nchan * npol) + ichan * npol + ipol;
+  const unsigned int odx = (isamp * nchan) + ichan;
+
+  if (in_range)
+    pscr_shm[threadIdx.x] = in[idx];
+  __syncthreads();
+
+  // even threads (ipol 0) combine their value with the next thread's (ipol 1)
+  if (in_range && ipol == 0)
+    out[odx] = (pscr_shm[threadIdx.x] + pscr_shm[threadIdx.x+1]) * M_SQRT1_2;
+}
+
+void CUDA::PScrunchEngine::tfp_pscrunch (const dsp::TimeSeries* input,
+                                         dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::tfp_pscrunch",
+     "cannot handle in-place data");
+
+  const uint64_t input_ndat  = input->get_ndat();
+  const unsigned input_nchan = input->get_nchan();
+  const unsigned input_npol  = input->get_npol();
+
+  if (input_npol != 2)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::tfp_pscrunch",
+      "number of input polarisations must be two");
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  dim3 blocks (input_nchan*input_npol/threads.x, input_ndat);
+  if (input_nchan*input_npol % threads.x)
+    blocks.x++;
+
+  float * in_base = (float *) input->get_dattfp ();
+  float * out_base = (float *) output->get_dattfp ();
+  size_t shm_bytes = threads.x * sizeof(float);  // kernel stages one float per thread
+
+  tfp_pscrunch_kernel<<<blocks,threads,shm_bytes,stream>>> (in_base, out_base, input_nchan);
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/SingleThread.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SingleThread.C
--- bl-dspsr-0+git20160405/Signal/General/SingleThread.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SingleThread.C	2018-03-12 23:02:35.000000000 +0000
@@ -24,6 +24,7 @@
 #if HAVE_CUDA
 #include "dsp/MemoryCUDA.h"
 #include "dsp/TransferCUDA.h"
+#include "dsp/TimeSeriesCUDA.h"
 #endif
 
 #include "dsp/ObservationChange.h"
@@ -58,6 +59,7 @@
 
   input_context = 0;
   gpu_stream = undefined_stream;
+  gpu_device = 0;
 }
 
 dsp::SingleThread::~SingleThread ()
@@ -216,35 +218,47 @@
 
   if (run_on_gpu)
   {
-    // disable input buffering when data must be copied between devices
-    if (config->get_total_nthread() > 1)
-      config->input_buffering = false;
-
-    int device = config->cuda_device[thread_id];
+    gpu_device = config->cuda_device[thread_id];
     cerr << "dspsr: thread " << thread_id 
-         << " using CUDA device " << device << endl;
+         << " using CUDA device " << gpu_device << endl;
 
     int ndevice = 0;
     cudaError err = cudaGetDeviceCount(&ndevice);
 
-    if (err != cudaSuccess || device >= ndevice)
+    if (err != cudaSuccess || gpu_device >= ndevice)
       throw Error (InvalidParam, "dsp::SingleThread::initialize",
-                   "device=%d >= ndevice=%d cudaError=%s", device, ndevice, cudaGetErrorString(err));
+                   "device=%d >= ndevice=%d cudaError=%s", gpu_device, ndevice, cudaGetErrorString(err));
 
-    err = cudaSetDevice (device);
+    err = cudaSetDevice (gpu_device);
     if (err != cudaSuccess)
       throw Error (InvalidState, "dsp::SingleThread::initialize",
                    "cudaMalloc failed: %s", cudaGetErrorString(err));
 
-    unsigned nstream = count (config->cuda_device, (unsigned)device);
+    unsigned nstream = count (config->cuda_device, (unsigned) gpu_device);
 
     // always create a stream, even for 1 thread
     cudaStreamCreate( &stream );
     cerr << "dspsr: thread " << thread_id << " on stream " << stream << endl;
 
     gpu_stream = stream;
+    device_memory = new CUDA::DeviceMemory (stream, gpu_device);
+    if (config->input_buffering)
+      cerr << "dspsr: input_buffering enabled" << endl;
+    else
+      cerr << "dspsr: input_buffering disabled" << endl;
+    if (unpacker->get_device_supported( device_memory ))
+      cerr << "dspsr: unpacker supports device memory" << endl;
 
-    device_memory = new CUDA::DeviceMemory (stream);
+    if ((thread_id == 0) && (!config->input_buffering) && unpacker->get_device_supported( device_memory ))
+    {
+      dsp::Seekable * seekable = dynamic_cast<dsp::Seekable*>( manager->get_input() );
+      if (seekable)
+      {
+        cerr << "dspsr: disabling input buffering, using overlap memory instead" << endl;
+        // overlap memory on stream/device of thread_id 0
+        seekable->set_overlap_buffer_memory (device_memory);
+      }
+    }
 
     if (unpacker->get_device_supported( device_memory ))
     {
@@ -253,9 +267,10 @@
 
       unpacker->set_device( device_memory );
       unpacked->set_memory( device_memory );
-        
+      unpacked->set_engine (new CUDA::TimeSeriesEngine (device_memory));
+
       BitSeries* bits = new BitSeries;
-      bits->set_memory (new CUDA::PinnedMemory);
+      bits->set_memory (device_memory);
       manager->set_output (bits);
     }
     else
@@ -339,10 +354,19 @@
 //! Run through the data
 void dsp::SingleThread::run () try
 {
-  if (Operation::verbose)
-    cerr << "dsp::SingleThread::run this=" << this 
+
+  if (Operation::verbose) {
+
+    cerr << "dsp::SingleThread::run this=" << this
          << " nops=" << operations.size() << endl;
 
+    for (unsigned iop=0; iop < operations.size(); iop++){
+      cerr << "dsp::SingleThread::run operation (" << iop << "): "
+           << operations[iop]->get_name() << endl;
+    }
+
+  }
+
   if (log)
     scratch->set_cerr (*log);
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKComputerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKComputerCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/SKComputerCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKComputerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,544 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andre Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/SKComputerCUDA.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "Error.h"
+#include "templates.h"
+#include "debug.h"
+
+#include <stdio.h>
+#include <memory>
+#include <string.h>
+
+#include <cuComplex.h>
+
+#ifdef __CUDA_ARCH__
+    #if (__CUDA_ARCH__ >= 300)
+        #define HAVE_SHFL
+    #else
+        #define NO_SHFL
+    #endif
+#endif
+
+using namespace std;
+
+void check_error (const char*);
+void check_error_stream (const char*, cudaStream_t);
+
+/*
+ *  Important note: this engine is only efficient for larger strides (256-512);
+ *  stride == nbeam for Molonglo
+ */
+
+CUDA::SKComputerEngine::SKComputerEngine (dsp::Memory * memory)
+{
+  device_memory = dynamic_cast<CUDA::DeviceMemory *>(memory);
+  if (!device_memory)  // kernels need the stream owned by CUDA device memory
+    throw Error (InvalidParam, "CUDA::SKComputerEngine", "memory is not CUDA::DeviceMemory");
+  stream = device_memory->get_stream();
+  work_buffer_size = 0; work_buffer = 0;  // scratch space is allocated by compute()
+}
+
+void CUDA::SKComputerEngine::setup ()
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKComputerEngine::setup ()" << endl;
+
+  // ask the currently selected device for its per-block thread limit
+  int current_device = 0;
+  cudaGetDevice (&current_device);
+  cudaDeviceProp props;
+  cudaGetDeviceProperties (&props, current_device);
+  max_threads_per_block = props.maxThreadsPerBlock;
+}
+
+// Integrate |z|^2 (S1) and |z|^4 (S2) over blocks of M complex samples.
+// blockIdx.x selects the block of M samples and blockIdx.y the chan/pol; each
+// thread block writes its (S1,S2) pair to sums and its SK estimate to skestimates.
+__global__ void reduce_sqld_new (float2 * in, float2 * sums, float * skestimates, uint64_t in_stride, unsigned M)
+{
+  extern __shared__ float s1s[];
+  float * s2s = s1s + 32;
+
+  // each block integrates M samples
+  const unsigned ichanpol = blockIdx.y;
+  const unsigned nchanpol = gridDim.y;
+
+  // offset to current channel, pol
+  in += (ichanpol * in_stride) + (blockIdx.x * M);
+
+  float power;
+  float2 val;
+  float s1 = 0;
+  float s2 = 0;
+
+  // in case M is > blockDim.x
+  for (unsigned i=threadIdx.x; i<M; i+=blockDim.x)
+  {
+    // load the complex value
+    val = in[i];
+
+    power = (val.x * val.x) + (val.y * val.y);
+    s1 += power;
+    s2 += (power * power);
+  }
+
+#ifdef HAVE_SHFL
+  s1 += __shfl_down (s1, 16);
+  s1 += __shfl_down (s1, 8);
+  s1 += __shfl_down (s1, 4);
+  s1 += __shfl_down (s1, 2);
+  s1 += __shfl_down (s1, 1);
+
+  s2 += __shfl_down (s2, 16);
+  s2 += __shfl_down (s2, 8);
+  s2 += __shfl_down (s2, 4);
+  s2 += __shfl_down (s2, 2);
+  s2 += __shfl_down (s2, 1);
+
+  unsigned warp_idx = threadIdx.x % 32;
+  unsigned warp_num = threadIdx.x / 32;
+  const unsigned nwarp = (blockDim.x + 31) / 32;  // warps that store a partial sum below
+
+  if (warp_idx == 0)
+  {
+    s1s[warp_num] = s1;
+    s2s[warp_num] = s2;
+  }
+  __syncthreads();
+
+  if (warp_num == 0)
+  {
+    s1 = (warp_idx < nwarp) ? s1s[warp_idx] : 0;
+    s2 = (warp_idx < nwarp) ? s2s[warp_idx] : 0;
+
+    s1 += __shfl_down (s1, 16);
+    s1 += __shfl_down (s1, 8);
+    s1 += __shfl_down (s1, 4);
+    s1 += __shfl_down (s1, 2);
+    s1 += __shfl_down (s1, 1);
+
+    s2 += __shfl_down (s2, 16);
+    s2 += __shfl_down (s2, 8);
+    s2 += __shfl_down (s2, 4);
+    s2 += __shfl_down (s2, 2);
+    s2 += __shfl_down (s2, 1);
+
+    // s1 and s2 sums across block are complete
+    if (warp_idx == 0)
+    {
+      val.x = s1;
+      val.y = s2;
+      unsigned odx = blockIdx.x*nchanpol + ichanpol;
+      sums [odx] = val;
+      // float arithmetic: (M+1)/(M-1) on the unsigned M would truncate
+      skestimates[odx] = ((M + 1.0f) / (M - 1.0f)) * (M * (s2 / (s1 * s1)) - 1);
+    }
+  }
+#endif
+#ifdef NO_SHFL
+  // NOTE(review): s1s/s2s are indexed by threadIdx.x here but s2s starts only 32 floats after s1s, so they overlap for blocks wider than 32 threads - confirm
+  s1s[threadIdx.x] = s1;
+  s2s[threadIdx.x] = s2;
+
+  __syncthreads();
+
+  int last_offset = blockDim.x/2;
+  for (int offset = last_offset; offset > 0;  offset >>= 1)
+  {
+    if (threadIdx.x < offset)
+    {
+      s1s[threadIdx.x] += s1s[threadIdx.x + offset];
+      s2s[threadIdx.x] += s2s[threadIdx.x + offset];
+    }
+    __syncthreads();
+  }
+
+  if (threadIdx.x == 0)
+  {
+    val.x = s1s[0];
+    val.y = s2s[0];
+    unsigned odx = blockIdx.x*nchanpol + ichanpol;
+    sums [odx] = val;
+    skestimates[odx] = ((M + 1.0f) / (M - 1.0f)) * (M * (val.y / (val.x * val.x)) - 1);
+  }
+#endif
+}
+
+
+/* Per-block reduction of interleaved re/im input to S1 = sum(|z|^2) and S2 = sum(|z|^4); out receives one (S1,S2) pair per block */
+__global__ void reduce_sqld (float * in, float * out, const uint64_t ndat)
+{
+  extern __shared__ float sdata[];
+  // sdata holds interleaved (S1,S2) partial sums, one pair per thread
+  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int s1 = (threadIdx.x*2);
+  unsigned int s2 = (threadIdx.x*2) + 1;
+  // samples at or beyond ndat contribute zero power
+  float re = 0;
+  float im = 0;
+  if (i < ndat)
+  {
+    re = in[(2*i)];
+    im = in[(2*i) + 1];
+  }
+  // seed the partial sums with this sample's power and power squared
+  sdata[s1] = (re * re) + (im * im);
+  sdata[s2] = sdata[s1] * sdata[s1];
+
+  __syncthreads();
+
+  int last_offset = blockDim.x/2 + blockDim.x % 2;
+
+  for (int offset = blockDim.x/2; offset > 0;  offset >>= 1)
+  {
+    // add a partial sum upstream to our own
+    if (threadIdx.x < offset)
+    {
+      sdata[s1] += sdata[s1 + (2*offset)];
+      sdata[s2] += sdata[s2 + (2*offset)];
+    }
+    __syncthreads();
+
+    // special case for non power of 2 reductions
+    if ((last_offset % 2) && (last_offset > 2) && (threadIdx.x == offset))
+    {
+      sdata[0] += sdata[s1 + (2*offset)];
+      sdata[1] += sdata[s2 + (2*offset)];
+    }
+
+    last_offset = offset;
+
+    // wait until all threads in the block have updated their partial sums
+    __syncthreads();
+  }
+
+  // thread 0 writes the block's (S1,S2) totals to out[2*blockIdx.x] and the following element
+  if (threadIdx.x == 0)
+  {
+    out[(2*blockIdx.x)]   = sdata[0];
+    out[(2*blockIdx.x)+1] = sdata[1];
+  }
+}
+
+// Sum the (S1,S2) pairs of every integration and form one SK estimate per chan/pol.
+// input holds ndat rows of nchanpol float2 values with S1 in .x and S2 in .y.
+__global__ void reduce_sk_estimate_new (float2* input, float * output, unsigned nchanpol, unsigned ndat, float M)
+{
+  const float scale = (M+1) / (M-1);
+  // each thread strides over the chan/pol values
+  for (unsigned icp = threadIdx.x; icp < nchanpol; icp += blockDim.x)
+  {
+    float total_s1 = 0;
+    float total_s2 = 0;
+    for (unsigned irow = 0; irow < ndat; irow++)
+    {
+      const float2 pair = input[(size_t) irow * nchanpol + icp];
+      total_s1 += pair.x;
+      total_s2 += pair.y;
+    }
+    output[icp] = scale * (M * (total_s2 / (total_s1 * total_s1)) - 1);
+  }
+}
+
+
+__global__ void reduce_sk_estimate (float * in, float * out, const uint64_t ndat, float M, unsigned ichan)
+{
+  extern __shared__ float sdata[];
+  // Sums interleaved pairs from in across the block; thread 0 turns the totals into one SK estimate
+  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int s1 = (threadIdx.x*2);
+  unsigned int s2 = (threadIdx.x*2) + 1;
+
+  // load input into shared memory
+  float re = 0;
+  float im = 0;
+  if (i < ndat)
+  {
+    re = in[(2*i)];
+    im = in[(2*i) + 1];
+  }
+  // the first value of each pair is accumulated into sdata[0], the second into sdata[1]
+  sdata[s1] = re;
+  sdata[s2] = im;
+
+  __syncthreads();
+
+  int last_offset = blockDim.x/2 + blockDim.x % 2;
+  for (int offset = blockDim.x/2; offset > 0;  offset >>= 1)
+  {
+    // add a partial sum upstream to our own
+    if (threadIdx.x < offset)
+    {
+      sdata[s1] += sdata[s1 + (2*offset)];
+      sdata[s2] += sdata[s2 + (2*offset)];
+    }
+
+    __syncthreads();
+
+    // special case for non power of 2 reductions
+    if ((last_offset % 2) && (last_offset > 2) && (threadIdx.x == offset))
+    {
+      sdata[0] += sdata[s1 + (2*offset)];
+      sdata[1] += sdata[s2 + (2*offset)];
+    }
+
+    last_offset = offset;
+
+    // wait until all threads in the block have updated their partial sums
+    __syncthreads();
+  }
+
+  // thread 0 writes the estimate to out[0] (zero when the first total is zero); ichan is not used
+  if (threadIdx.x == 0)
+  {
+    if (sdata[0] == 0)
+      out[0] = 0;
+    else
+    {
+      float M_fac = (M+1) / (M-1);
+      out[0] = M_fac * (M * (sdata[1] / (sdata[0]*sdata[0])) - 1);
+    }
+  }
+}
+
+// One thread per (S1,S2) pair of in: thread i writes its SK estimate to
+// out[out_span*i], or zero when S1 is zero.
+__global__ void calc_sk_estimate (float * in, float * out, float M_fac, unsigned int M, size_t out_span)
+{
+  const unsigned int ipair = threadIdx.x;
+  const float sum_power = in[2*ipair];
+  const float sum_power_sq = in[2*ipair + 1];
+  const bool empty = (sum_power == 0);
+  out[out_span*ipair] = empty ? 0 : M_fac * (M * (sum_power_sq / (sum_power * sum_power)) - 1);
+}
+
+// Compute SK estimates for each block of M samples, and one for the whole input.
+//   output      receives one estimate per integration, channel and pol (TFP)
+//   output_tscr receives one estimate per channel and pol
+void CUDA::SKComputerEngine::compute (const dsp::TimeSeries* input,
+           dsp::TimeSeries* output, dsp::TimeSeries *output_tscr, unsigned M)
+{
+  if (dsp::Operation::verbose)
+    std::cerr << "CUDA::SKComputerEngine::compute()" << std::endl;
+
+  const uint64_t ndat = output->get_ndat() * M;
+  const unsigned nchan = input->get_nchan ();
+  const unsigned npol  = input->get_npol ();
+  const unsigned nchanpol = nchan * npol;
+
+  if (dsp::Operation::verbose)
+    std::cerr << "CUDA::SKComputerEngine::compute ndat=" << ndat << " nchan="
+              << nchan << " npol=" << npol << " M=" << M << std::endl;
+
+  float * outdat = output->get_dattfp();
+  float * outdat_tscr = output_tscr->get_dattfp();
+  if (dsp::Operation::verbose)
+  {
+    std::cerr << "CUDA::SKComputerEngine::compute outdat=" << (void *) outdat << endl;
+    std::cerr << "CUDA::SKComputerEngine::compute outdat_tscr=" << (void *) outdat_tscr << endl;
+  }
+
+  // TODO: currently only support FPT on GPU due to FoldCUDA
+  switch (input->get_order())
+  {
+    case dsp::TimeSeries::OrderFPT:
+    {
+      if (dsp::Operation::verbose)
+        std::cerr << "CUDA::SKComputerEngine::compute OrderFPT" << std::endl;
+
+      float2 * indat = (float2*) input->get_datptr (0, 0);
+      // launch whole warps: the kernel's warp shuffles must not read inactive lanes
+      unsigned nthreads = 1024;
+      if (M < nthreads)
+        nthreads = ((M + 31) / 32) * 32;
+      dim3 blocks (ndat / M, nchanpol);
+      // this is by design, due to input buffering
+      assert (ndat % M == 0);
+
+      // work buffer for S1 and S2 values for each set of M samples
+      size_t bytes_required = nchanpol * blocks.x * sizeof(float2);
+      if (bytes_required > work_buffer_size)
+      {
+        if (work_buffer)
+          cudaFree(work_buffer);
+        work_buffer = 0;
+        work_buffer_size = 0;
+        cudaError_t err = cudaMalloc (&work_buffer, bytes_required);
+        if (err != cudaSuccess)
+          throw Error (InvalidState, "CUDA::SKComputerEngine::compute",
+                       "cudaMalloc failed: %s", cudaGetErrorString (err));
+        work_buffer_size = bytes_required;
+      }
+
+      if (dsp::Operation::verbose)
+        cerr << "CUDA::SKComputerEngine::compute ndat=" << ndat
+             << " blocks=(" << blocks.x << "," << blocks.y << ")"
+             << " nthreads=" << nthreads << endl; 
+
+      // require an S1 and S2 value for each warp in each block
+      size_t shm_bytes_1 = 32 * sizeof(float2);
+      if (dsp::Operation::verbose)
+        cerr << "CUDA::SKComputerEngine::compute work_buffer=" << (void *) work_buffer << endl;
+
+      uint64_t in_stride;
+      if (npol > 1)
+        in_stride = input->get_datptr (0, 1) - input->get_datptr (0, 0);
+      else
+        in_stride = input->get_datptr (1, 0) - input->get_datptr (0, 0);
+      // for float2
+      in_stride /= 2;
+
+      reduce_sqld_new<<<blocks,nthreads,shm_bytes_1,stream>>> ((float2 *) indat, (float2 *) work_buffer, outdat, in_stride, M);
+      if (dsp::Operation::record_time || dsp::Operation::verbose)
+      {
+        if (stream)
+          check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld_new [first]", stream);
+        else
+          check_error ("CUDA::SKComputerEngine::compute reduce_sqld_new [first]");
+      }
+
+      // compute a tscrunched output SK
+      nthreads = 1024;
+      if (nchanpol < nthreads)
+        nthreads = nchanpol;
+      reduce_sk_estimate_new<<<1,nthreads,0,stream>>>((float2*) work_buffer, outdat_tscr, nchanpol, blocks.x, ndat);
+
+#if 0
+      // TODO consider making ichan a ydim?
+      for (unsigned ichan=0; ichan<nchan; ichan++)
+      {
+        for (unsigned ipol=0; ipol<npol; ipol++)
+        {
+          indat = const_cast<float*>(input->get_datptr (ichan, ipol));
+
+          //cerr << "CUDA::SKComputerEngine::compute ichan=" << ichan << " pol=" << ipol << " indat=" << indat << endl;
+
+          // foreach block reduce to S1, S2 sums [out of place]
+          //cerr << "CUDA::SKComputerEngine::compute [1] [" << ichan << ", " << ipol << "] shm_bytes=" << shm_bytes_1 << endl;
+          reduce_sqld<<<nblocks,block_size,shm_bytes_1, stream>>> (indat, work_buffer, ndat_proc);
+          if (dsp::Operation::record_time || dsp::Operation::verbose)
+            if (stream)
+              check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld [first]", stream);
+            else
+              check_error ("CUDA::SKComputerEngine::compute reduce_sqld [first]");
+
+          // calculate S1, S2 sums for tscr [in place]
+          //cerr << "CUDA::SKComputerEngine::compute [2] [" << ichan << ", " << ipol << "] shm_bytes=" << shm_bytes_2 << endl;
+          reduce_sk_estimate<<<1,nblocks,shm_bytes_2,stream>>> (work_buffer, outdat_tscr, nblocks, ndat_proc, ichan);
+          if (dsp::Operation::record_time || dsp::Operation::verbose)
+            if (stream)
+              check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld [second]", stream);
+            else
+              check_error ("CUDA::SKComputerEngine::compute reduce_sqld [second]");
+
+          // caculate SK estimator for each block in place [out of place]
+          calc_sk_estimate<<<1,nblocks,0,stream>>> (work_buffer, outdat, M_fac, M, nchan*npol);
+          if (dsp::Operation::record_time || dsp::Operation::verbose)
+            if (stream)
+              check_error_stream ("CUDA::SKComputerEngine::compute sk_estimate", stream);
+            else
+              check_error ("CUDA::SKComputerEngine::compute sk_estimate");
+
+          outdat ++;
+          outdat_tscr ++;
+        }
+      }
+#endif
+      break;
+    }
+
+    case dsp::TimeSeries::OrderTFP:
+    {
+      throw Error (InvalidState, "CUDA::SKComputerEngine::compute",
+                   "OrderTFP is unsupported input order");
+    }
+
+    default:
+    {
+      throw Error (InvalidState, "CUDA::SKComputerEngine::compute",
+                   "unsupported input order");
+    }
+  }
+}
+
+
+// Broadcast each SK estimate over the M output samples of its integration.
+// Grid: x spans samples, y channels and z polarisations; estimates are read
+// in TFP order and written to both the real and imaginary parts of the output.
+__global__ void copy1sample ( const float * in_base,
+           float2 * out_base,
+           uint64_t out_stride,
+           uint64_t ndat,
+           unsigned M)
+{
+  const unsigned isample = blockIdx.x * blockDim.x + threadIdx.x;
+  if (isample < ndat)
+  {
+    // position of this thread's channel/pol within one time sample
+    const unsigned chanpol = blockIdx.y * gridDim.z + blockIdx.z;
+    const unsigned nchanpol = gridDim.y * gridDim.z;
+
+    // estimate of the integration that contains this sample
+    const unsigned iblock = isample / M;
+    const float estimate = in_base[iblock * nchanpol + chanpol];
+
+    // consecutive channel/pol series are out_stride complex values apart
+    float2 * series = out_base + chanpol * out_stride;
+    series[isample].x = estimate;
+    series[isample].y = estimate;
+  }
+}
+
+
+
+// Replace every sample of the FPT-ordered output with the SK estimate (read
+// from the TFP-ordered input) of the block of M samples that contains it.
+void CUDA::SKComputerEngine::insertsk (const dsp::TimeSeries* input,
+                                       dsp::TimeSeries* output,
+                                       unsigned M)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKComputerEngine::insertsk M=" << M << endl;
+
+  uint64_t ndat  = output->get_ndat();
+  unsigned nchan = output->get_nchan();
+  unsigned npol  = output->get_npol();
+
+  // estimates are read in TFP order, the output series is addressed per chan/pol
+  const float * in_base = (float *) input->get_dattfp ();
+  float2 * out_base     = (float2 *) output->get_datptr (0, 0);
+
+  // distance between consecutive chan/pol series, converted from floats to float2
+  uint64_t out_stride;
+  if (npol == 1)
+    out_stride = output->get_datptr (1, 0) - output->get_datptr (0, 0);
+  else
+    out_stride = output->get_datptr (0, 1) - output->get_datptr (0, 0);
+  out_stride /= 2;
+
+  unsigned threads = max_threads_per_block;
+  dim3 blocks (ndat / threads, nchan, npol);
+  if (ndat % threads)
+    blocks.x++;
+
+  if (dsp::Operation::verbose)
+  {
+    cerr << "CUDA::SKComputerEngine::insertsk ndat=" << ndat << " nchan=" << nchan << " npol=" << npol << endl;
+    cerr << "CUDA::SKComputerEngine::insertsk out_stride=" << out_stride << endl;
+    cerr << "CUDA::SKComputerEngine::insertsk blocks=(" << blocks.x << ", " << blocks.y << ") threads=" << threads << endl;
+  }
+
+  copy1sample<<<blocks,threads,0,stream>>> (in_base, out_base, out_stride, ndat, M);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error( "CUDA::SKComputerEngine::insertsk" );
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKDetector.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetector.C
--- bl-dspsr-0+git20160405/Signal/General/SKDetector.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetector.C	2018-03-12 23:02:35.000000000 +0000
@@ -14,8 +14,6 @@
 #include <assert.h>
 #include <iomanip>
 
-//#define USE_MEGA_THRESHOLDS 1
-
 using namespace std;
 
 dsp::SKDetector::SKDetector () 
@@ -26,18 +24,11 @@
   n_std_devs = 3;
   upper_thresh = 0;
   lower_thresh = 0;
-#ifdef USE_MEGA_THRESHOLDS
-  mega_upper_thresh = 0;
-  mega_lower_thresh = 0;
-#endif
   debugd = 1;
   s_chan = 0;
   e_chan = 0;
   ndat_zapped = 0;
   ndat_zapped_skfb = 0;
-#ifdef USE_MEGA_THRESHOLDS
-  ndat_zapped_mega = 0;
-#endif
   ndat_zapped_fscr = 0;
   ndat_zapped_tscr = 0;
   ndat_total = 0;
@@ -60,14 +51,17 @@
   cerr << "Zapped: " 
        << " total=" << (100 * (float) ndat_zapped / (float) ndat_total) << "\%" 
        << " skfb=" << (100 * (float) ndat_zapped_skfb / (float) ndat_total) << "\%"
-#ifdef USE_MEGA_THRESHOLDS
-       << " mega=" << (100 * (float) ndat_zapped_mega / (float) ndat_total) << "\%"
-#endif
        << " tscr=" << (100 * (float) ndat_zapped_tscr / (float) ndat_total) << "\%"
        << " fscr=" << (100 * (float) ndat_zapped_fscr / (float) ndat_total) << "\%"
        << endl;
 }
 
+void dsp::SKDetector::set_engine (Engine * _engine)
+{
+  engine = _engine;  // may be null: the detect methods then use their CPU code paths
+  if (engine) engine->setup();
+}
+
 void dsp::SKDetector::set_thresholds (unsigned _M, unsigned _n_std_devs)
 {
   M = _M;
@@ -81,16 +75,6 @@
   upper_thresh = (float) limits.get_upper_threshold();
   lower_thresh = (float) limits.get_lower_threshold();
 
-#ifdef USE_MEGA_THRESHOLDS
-  if (verbose)
-    cerr << "dsp::SKDetector::set_thresholds SKlimits(" << M << ", " << n_std_devs + 3 << ")" << endl;
-  limits.set_std_devs(6);
-  limits.calc_limits();
-
-  mega_upper_thresh = (float) limits.get_upper_threshold();
-  mega_lower_thresh = (float) limits.get_lower_threshold();
-#endif
-
   if (verbose)
     cerr << "dsp::SKDetector::set_thresholds M=" << M << " n_std_devs="
          << n_std_devs  << " [" << lower_thresh << " - " << upper_thresh 
@@ -190,10 +174,12 @@
   unsigned char * outdat = 0;
 
   const unsigned npol    = input->get_npol();
-  assert(npol == 2);
+  unsigned zap_chan;
+  float V;
+  //assert(npol == 2);
 
-  float V_p0 = 0;
-  float V_p1 = 0;
+  //float V_p0 = 0;
+  //float V_p1 = 0;
 
   if (ndat && (tscr_M != M * ndat))
   {
@@ -214,20 +200,27 @@
            << "]" << endl;
   }
 
+  if (engine)
+  {
+    engine->detect_tscr(input, input_tscr, output, tscr_upper, tscr_lower);
+    return;
+  }
+
   for (uint64_t ichan=s_chan; ichan < e_chan; ichan++)
   {
-    // check the tscrunched value for this idat
-    V_p0 = indat[2*ichan];
-    V_p1 = indat[2*ichan+1];
-
-    if ( V_p0 > tscr_upper ||
-         V_p0 < tscr_lower ||
-         V_p1 > tscr_upper ||
-         V_p1 < tscr_lower )
+    zap_chan = 0;
+    for (unsigned ipol=0; ipol < npol; ipol++)
+    {
+      V = indat[ichan*npol + ipol];
+      if (V > tscr_upper || V < tscr_lower)
+        zap_chan = 1;
+
+    }
+    if (zap_chan)
     {
       if (verbose)
-        cerr << "dsp::SKDetector::detect_tscr zap [" << V_p0 << ", " 
-             << V_p1 << "] ichan=" << ichan << endl;
+        cerr << "dsp::SKDetector::detect_tscr zap V=" << V << ", " 
+             << "ichan=" << ichan << endl;
       outdat = output->get_datptr();
       for (unsigned idat=0; idat < ndat; idat++)
       {
@@ -245,66 +238,45 @@
   if (verbose)
     cerr << "dsp::SKDetector::detect_skfb()" << endl;
 
+  if (engine)
+  {
+    engine->detect_ft(input, output, upper_thresh, lower_thresh);
+    return;
+  }
+
   const unsigned nchan   = input->get_nchan();
   const unsigned npol    = input->get_npol();
   uint64_t ndat          = input->get_ndat();
   const float * indat    = input->get_dattfp();
   unsigned char * outdat = output->get_datptr();
-#ifdef USE_MEGA_THRESHOLDS
-  uint64_t zapped_mega = 0;
-#endif
-  float V_p0 = 0;
-  float V_p1 = 0;
+  float V = 0;
+  char zap;
 
   // compare SK estimator for each pol to expected values
   for (uint64_t idat=0; idat < ndat; idat++)
   {
-#ifdef USE_MEGA_THRESHOLDS
-    zapped_mega = 0;
-#endif
-
     // for each channel and pol in the SKFB
     for (unsigned ichan=0; ichan < nchan; ichan++)
     {
-      V_p0 =  indat[ichan*2];
-      V_p1 =  indat[ichan*2+1];
-
-      if ( V_p0 > upper_thresh || 
-           V_p0 < lower_thresh ||
-           V_p1 > upper_thresh || 
-           V_p1 < lower_thresh )
+      zap = 0;
+      for (unsigned ipol=0; ipol < npol; ipol++)
+      {
+        V = indat[npol*ichan + ipol];
+        if (V > upper_thresh || V < lower_thresh)
+        {
+          zap = 1;
+        }
+      }
+      if (zap)
       {
         outdat[ichan] = 1;
 
         // only count skfb zapped channels in the in-band region
         if (ichan > s_chan && ichan < e_chan)
           ndat_zapped_skfb++;
-
-#ifdef USE_MEGA_THRESHOLDS
-        if ( V_p0 > mega_upper_thresh || 
-             V_p0 < mega_lower_thresh ||
-             V_p1 > mega_upper_thresh || 
-             V_p1 < mega_lower_thresh )
-        {
-          zapped_mega ++;
-        }
-#endif
       }
     }
 
-#ifdef USE_MEGA_THRESHOLDS 
-    if (zapped_mega > 10)
-    {
-      if (verbose)
-        cerr << "ZAP mega n_bad_chan=" << zapped_mega << endl;
-      for (unsigned ichan=0; ichan<nchan; ichan++)
-      {
-        outdat[ichan] = 1;
-      }
-      ndat_zapped_mega += nchan;
-    }
-#endif
-    
     indat += nchan * npol;
     outdat += nchan; 
   }
@@ -312,6 +284,12 @@
 
 void dsp::SKDetector::reset_mask()
 {
+  if (engine)
+  {
+    engine->reset_mask(output);
+    return;
+  }
+
   unsigned nchan         = output->get_nchan();
   uint64_t ndat          = output->get_ndat();
   unsigned char * outdat = output->get_datptr();
@@ -332,6 +310,15 @@
   if (verbose)
     cerr << "dsp::SKDetector::count_zapped hits=" << unfiltered_hits << endl;
 
+  int zapped = 0;
+
+  if (engine)
+  {
+    zapped = engine->count_mask (output);
+    ndat_zapped += zapped;
+    return;
+  }
+
   unsigned npol          = input->get_npol();
   const float * indat    = input->get_dattfp();
 
@@ -355,6 +342,7 @@
 
   for (uint64_t idat=0; idat < ndat; idat++)
   {
+    // number of SK idats (same for each channel)
     unfiltered_hits ++;
 
     for (unsigned ichan=s_chan; ichan < e_chan; ichan++)
@@ -362,19 +350,20 @@
       uint64_t index = (idat*nchan + ichan) * npol;
       unsigned outdex = ichan * npol;
 
+      // sum of all SK values 
       unfiltered_sum[outdex] += indat[index];
       if (npol == 2)
-	unfiltered_sum[outdex+1] += indat[index+1];
-	
+        unfiltered_sum[outdex+1] += indat[index+1];
+  
       if (outdat[(idat*nchan) + ichan] == 1)
       {
         ndat_zapped ++;
-	continue;
+        continue;
       }
 
       filtered_sum[outdex] += indat[index];
       if (npol == 2)
-	filtered_sum[outdex+1] += indat[index+1];
+        filtered_sum[outdex+1] += indat[index+1];
 
       filtered_hits[ichan] ++;
     }
@@ -386,66 +375,81 @@
   if (verbose)
     cerr << "dsp::SKDetector::detect_fscr()" << endl;
 
+  float _M = (float) M;
+  float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3));
   unsigned nchan       = input->get_nchan();
+
+  if (engine)
+  {
+    float one_sigma_idat   = sqrt(mu2 / float((e_chan-s_chan)+1));
+    const float upper = 1 + ((1+n_std_devs) * one_sigma_idat);
+    const float lower = 1 - ((1+n_std_devs) * one_sigma_idat);
+    engine->detect_fscr(input, output, lower, upper, s_chan, e_chan);
+    return;
+  }
+
   const unsigned npol  = input->get_npol();
   const uint64_t ndat  = input->get_ndat();
 
   const float * indat  = input->get_dattfp();
   unsigned char * outdat = output->get_datptr();
 
-  float sk_avg_p0 = 0;
-  float sk_avg_p1 = 0;
+  float sk_avg;
   unsigned sk_avg_cnt = 0;
   
+  unsigned zap_idat;
+  uint64_t nzap = 0;
+
   // foreach SK integration
   for (uint64_t idat=0; idat < ndat; idat++)
   {
-    sk_avg_p0 = 0;
-    sk_avg_p1 = 0;
-    sk_avg_cnt = 0;
-
-    if (verbose)
-      cerr << "dsp::SKDetector::detect_fscr idat=" << idat << endl;
-    // accumulate the avg values for p0 and p1 
-    for (unsigned ichan=s_chan; ichan < e_chan; ichan++)
+    zap_idat = 0;
+    for (unsigned ipol=0; ipol < npol; ipol++)
     {
-      if (outdat[ichan] == 0)
+      sk_avg = 0;
+      sk_avg_cnt = 0;
+
+      for (unsigned ichan=s_chan; ichan < e_chan; ichan++)
+      {
+        if (outdat[ichan] == 0)
+        {
+          sk_avg += indat[ichan*npol + ipol];
+          sk_avg_cnt++;
+        }
+      }
+
+      if (sk_avg_cnt > 0)
       {
-        sk_avg_p0 += indat[ichan*2];
-        sk_avg_p1 += indat[ichan*2+1];
-        sk_avg_cnt++;
+        sk_avg /= (float) sk_avg_cnt;
+
+        float one_sigma_idat = sqrt(mu2 / (float) sk_avg_cnt);
+        float avg_upper_thresh = 1 + ((1+n_std_devs) * one_sigma_idat);
+        float avg_lower_thresh = 1 - ((1+n_std_devs) * one_sigma_idat);
+        if ((sk_avg > avg_upper_thresh) || (sk_avg < avg_lower_thresh))
+        {
+          if (verbose)
+            cerr << "Zapping idat=" << idat << " ipol=" << ipol << " sk_avg=" << sk_avg
+                 << " [" << avg_lower_thresh << " - " << avg_upper_thresh
+                 << "] cnt=" << sk_avg_cnt << endl;
+          zap_idat = 1;
+        }
       }
     }
 
-    if (sk_avg_cnt > 0)
+    if (zap_idat)
     {
-      sk_avg_p0 /= (float) sk_avg_cnt;
-      sk_avg_p1 /= (float) sk_avg_cnt;
-
-      float _M = (float) M;
-      float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3));
-      float one_sigma_idat = sqrt(mu2 / (float) sk_avg_cnt);
-
-      float avg_upper_thresh = 1 + ((n_std_devs) * one_sigma_idat);
-      float avg_lower_thresh = 1 - ((n_std_devs) * one_sigma_idat);
-
-      if ((sk_avg_p0 > avg_upper_thresh) || (sk_avg_p0 < avg_lower_thresh) ||
-          (sk_avg_p1 > avg_upper_thresh) || (sk_avg_p1 < avg_lower_thresh))
+      for (unsigned ichan=0; ichan<nchan; ichan++)
       {
-        if (verbose)
-          cerr << "Zapping all p0=" << sk_avg_p0
-               << " p1=" << sk_avg_p1 << " [" << avg_lower_thresh << " - "
-               << avg_upper_thresh << "] cnt=" << sk_avg_cnt << endl;
-        for (unsigned ichan=0; ichan<nchan; ichan++)
-        {
-          outdat[ichan] = 1;
-        }
-        ndat_zapped_fscr += sk_avg_cnt;
+        outdat[ichan] = 1;
       }
+      //ndat_zapped_fscr += sk_avg_cnt;
+      ndat_zapped_fscr += nchan;
+      nzap += nchan;
     }
 
-    indat += nchan * npol; 
+    indat += nchan * npol;
     outdat += nchan;
   }
-  
+  //cerr << "dsp::SKDetector::detect_fscr ZAP=" << nzap << endl;
 }
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKDetectorCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetectorCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/SKDetectorCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetectorCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,399 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+
+#include "dsp/SKDetectorCUDA.h"
+
+#include <iostream>
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/reduce.h>
+#include <thrust/device_ptr.h>
+
+#include <cuComplex.h>
+//#define _DEBUG 1
+
+// TODO consider having schan / echan in mask represented by values other than 0, 1
+
+using namespace std;
+
+void check_error (const char*);
+
+CUDA::SKDetectorEngine::SKDetectorEngine (dsp::Memory * memory)
+{
+  device_memory = dynamic_cast<CUDA::DeviceMemory *>(memory);
+  if (!device_memory) // the cast yields null for any other kind of memory
+    throw Error (InvalidParam, "CUDA::SKDetectorEngine::SKDetectorEngine", "memory is not CUDA::DeviceMemory");
+  stream = device_memory->get_stream();
+  // host-side containers that receive copies of the estimates and of the mask
+  estimates_host = new dsp::TimeSeries();
+  zapmask_host = new dsp::BitSeries();
+  pinned_memory  = new PinnedMemory ();
+  estimates_host->set_memory ((dsp::Memory *) pinned_memory);
+  zapmask_host->set_memory ((dsp::Memory *) pinned_memory);
+  // device-to-host transfers constructed on the engine's stream
+  transfer_estimates = new dsp::TransferCUDA (stream);
+  transfer_estimates->set_kind (cudaMemcpyDeviceToHost);
+  transfer_estimates->set_output( estimates_host );
+  transfer_zapmask = new dsp::TransferBitSeriesCUDA (stream);
+  transfer_zapmask->set_kind (cudaMemcpyDeviceToHost);
+  transfer_zapmask->set_output( zapmask_host );
+}
+
+void CUDA::SKDetectorEngine::setup ()
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKDetectorEngine::setup ()" << endl;
+
+  // cache the active device's per-block thread limit, used to size the kernel launches
+  int active_device = 0;
+  cudaGetDevice (&active_device);
+  cudaDeviceProp props;
+  cudaGetDeviceProperties (&props, active_device);
+  max_threads_per_block = props.maxThreadsPerBlock;
+}
+
+
+// faster kernel for npol=1: one thread per estimate, flags those outside [lower, upper]
+__global__ void detect_one_pol (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower)
+{
+  // index in 64 bits so that the comparison against nval cannot be defeated by wrap-around
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= nval)
+    return;
+  const float V = indat[idat];
+  if (V < lower || V > upper)
+    outdat[idat] = 1;
+}
+
+__global__ void detect_two_pol (const float2 * indat, unsigned char * outdat, uint64_t nval, float upper, float lower)
+{
+  // one thread per sample (64-bit index); the float2 carries the estimates of both polarisations
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= nval)
+    return;
+  const float2 V = indat[idat];
+  const bool p0_bad = (V.x < lower || V.x > upper);
+  const bool p1_bad = (V.y < lower || V.y > upper);
+  if (p0_bad || p1_bad)
+    outdat[idat] = 1;
+}
+
+
+// detect SK limits for N polarisations: a sample is flagged when any of its npol estimates is out of range
+__global__ void detect_one_sample (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower, unsigned npol)
+{
+  // index in 64 bits so that neither the thread index nor idat * npol can wrap
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= nval)
+    return;
+
+  // the npol estimates of one sample are adjacent in memory
+  const float * sample = indat + (idat * npol);
+
+  bool zap = false;
+  for (unsigned ipol=0; ipol<npol; ipol++)
+  {
+    const float V = sample[ipol];
+    if (V < lower || V > upper)
+      zap = true;
+  }
+
+  if (zap)
+    outdat[idat] = 1;
+}
+
+void CUDA::SKDetectorEngine::detect_ft (const dsp::TimeSeries* input,
+      dsp::BitSeries* output, float upper_thresh, float lower_thresh)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKDetectorEngine::detect_ft()" << endl;
+
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+  const int64_t  ndat  = input->get_ndat();
+
+  const float * indat    = input->get_dattfp();   // TFP
+  unsigned char * outdat = output->get_datptr();  // TFP also!
+
+  // one thread per (time sample, channel) element of the mask
+  uint64_t nval   = nchan * ndat;
+  if (nval == 0)
+    return; // a launch with zero blocks is an invalid configuration
+  uint64_t nblocks  = nval / max_threads_per_block;
+  if (nval % max_threads_per_block)
+    nblocks++;
+  dim3 threads (max_threads_per_block);
+  dim3 blocks (nblocks);
+
+  if (dsp::Operation::verbose)
+  {
+    cerr << "CUDA::SKDetectorEngine::detect_ft nval=" << nval << " nblocks=" << nblocks << " max_threads_per_block=" << max_threads_per_block << endl;
+    cerr << "CUDA::SKDetectorEngine::detect_ft thresholds [" << lower_thresh << " - " << upper_thresh << "]" << endl;
+    cerr << "CUDA::SKDetectorEngine::detect_ft npol=" << npol << endl;
+  }
+  // none of these kernels declares dynamic shared memory, so zero bytes are requested
+  if (npol == 1)
+    detect_one_pol<<<blocks,threads,0,stream>>> (indat, outdat, nval, upper_thresh, lower_thresh);
+  else if (npol == 2)
+    detect_two_pol<<<blocks,threads,0,stream>>> ((const float2 *) indat, outdat, nval, upper_thresh, lower_thresh);
+  else
+    detect_one_sample<<<blocks,threads,0,stream>>> (indat, outdat, nval, upper_thresh, lower_thresh, npol);
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error( "CUDA::SKDetectorEngine::detect_ft detect_one_xxx" );
+#ifdef _DEBUG
+  int sum = count_mask(output);
+  cerr << "CUDA::SKDetectorEngine::detect_ft sum now " << sum << endl;
+#endif
+}
+
+// One block per time sample: the threads of the block stride across the channels of that sample,
+// accumulate the SK estimates of channels [schan, echan) and then combine their partial sums in
+// shared memory (the launch must provide blockDim.x floats of dynamic shared memory).
+// Thread 0 flags out[idat] when the resulting mean lies outside [lower, upper].
+__global__ void reduce_sum_fscr_1pol (const float * input, unsigned char * out,
+                                      const unsigned nchan, float lower, float upper, 
+                                      unsigned schan, unsigned echan)
+{
+  extern __shared__ float sdata[];
+
+  const unsigned idat = blockIdx.x;
+  const float * in = input + ((uint64_t) idat * nchan);
+
+  float sum = 0;
+  for (unsigned ichan=threadIdx.x; ichan<nchan; ichan+=blockDim.x)
+  {
+    if (ichan >= schan && ichan < echan)
+      sum += in[ichan];
+  }
+  sdata[threadIdx.x] = sum;
+  __syncthreads();
+
+  // pairwise block-wide sum that is also correct when blockDim.x is not a power of two
+  for (unsigned active = blockDim.x; active > 1; )
+  {
+    const unsigned half = (active + 1) / 2;
+    if (threadIdx.x + half < active)
+      sdata[threadIdx.x] += sdata[threadIdx.x + half];
+    __syncthreads();
+    active = half;
+  }
+
+  if (threadIdx.x == 0)
+  {
+    // NOTE(review): the sum covers echan - schan channels but is divided by one more -- confirm intent
+    float val = sdata[0] / float((echan - schan) + 1);
+    if (val < lower || val > upper)
+      out[idat] = 1;
+  }
+}
+
+__global__ void reduce_sum_fscr_2pol (const float2 * input, unsigned char * out,
+                                      const unsigned nchan, float lower, float upper, 
+                                      unsigned schan, unsigned echan)
+{
+  // two-polarisation variant, one block per time sample: each float2 is summed component-wise
+  extern __shared__ float2 sdata2[];
+
+  const float2 * in = input + ((uint64_t) blockIdx.x * nchan);
+
+  float2 sum = make_cuComplex(0,0);
+  for (unsigned ichan=threadIdx.x; ichan<nchan; ichan+=blockDim.x)
+  {
+    if (ichan >= schan && ichan < echan)
+      sum = cuCaddf(sum, in[ichan]);
+  }
+  sdata2[threadIdx.x] = sum;
+  __syncthreads();
+
+  // pairwise block-wide sum that is also correct when blockDim.x is not a power of two
+  for (unsigned active = blockDim.x; active > 1; )
+  {
+    const unsigned half = (active + 1) / 2;
+    if (threadIdx.x + half < active)
+      sdata2[threadIdx.x] = cuCaddf(sdata2[threadIdx.x], sdata2[threadIdx.x + half]);
+    __syncthreads();
+    active = half;
+  }
+
+  if (threadIdx.x == 0)
+  {
+    // NOTE(review): the sums cover echan - schan channels but are divided by one more -- confirm intent
+    float nvalidchan = float((echan - schan) + 1);
+    float p0 = sdata2[0].x / nvalidchan;
+    float p1 = sdata2[0].y / nvalidchan;
+    if (p0 < lower || p0 > upper || p1 < lower || p1 > upper)
+      out[blockIdx.x] = 1;
+  }
+}
+
+
+void CUDA::SKDetectorEngine::detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output, const float lower, const float upper, unsigned schan, unsigned echan)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKDetectorEngine::detect_fscr()" << endl;
+
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol  = input->get_npol();
+  const int64_t ndat   = input->get_ndat();
+  // nothing to flag, and a launch with zero blocks or threads is an invalid configuration
+  if (ndat <= 0 || nchan == 0)
+    return;
+
+  // one block per time sample, the threads of a block stride over the channels
+  const unsigned nblocks = ndat;
+  unsigned nthreads = max_threads_per_block;
+  if (nchan < nthreads)
+    nthreads = nchan;
+  const size_t shared_bytes = nthreads * npol * sizeof(float);
+
+  // indat is the SK estimates
+  const float * indat    = input->get_dattfp();
+  // outdat is the bitmask
+  unsigned char * outdat = output->get_datptr();
+
+  if (dsp::Operation::verbose)
+  {
+    cerr << "CUDA::SKDetectorEngine::detect_fscr nchan=" << nchan << " ndat=" << ndat << endl;
+    cerr << "CUDA::SKDetectorEngine::detect_fscr nblocks=" << nblocks << " nthreads=" << nthreads << " shared_bytes=" << shared_bytes << endl;
+    cerr << "CUDA::SKDetectorEngine::detect_fscr thresholds [" << lower << " - " << upper << "]" << endl;
+  }
+
+  // NOTE(review): every npol other than 1 takes the float2 path, which assumes npol == 2
+  if (npol == 1)
+    reduce_sum_fscr_1pol<<<nblocks,nthreads,shared_bytes,stream>>>(indat, outdat, nchan, lower, upper, schan, echan);
+  else
+    reduce_sum_fscr_2pol<<<nblocks,nthreads,shared_bytes,stream>>>((const float2*) indat, outdat, nchan, lower, upper, schan, echan);
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error( "CUDA::SKDetectorEngine::detect_fscr_element" );
+#ifdef _DEBUG
+  int sum = count_mask(output);
+  cerr << "CUDA::SKDetectorEngine::detect_fscr mask_sum=" << sum << endl;
+#endif
+}
+
+__global__ void detect_tscr_element (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower, unsigned npol, unsigned nchan)
+{
+  // One thread per mask element: nval counts mask elements (ndat * nchan in the caller), so the
+  // channel owned by a thread is its index modulo nchan.  indat holds npol time-scrunched SK
+  // estimates per channel, polarisation varying fastest.  No barrier or shared memory is needed.
+  const uint64_t idat = ((uint64_t) blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idat >= nval)
+    return;
+  const unsigned ichan = (unsigned) (idat % nchan);
+  const float * sk = indat + (ichan * npol);
+
+  // the channel is zapped when any of its polarisations is outside [lower, upper]
+  bool zap = false;
+  for (unsigned ipol=0; ipol<npol; ipol++)
+    if (sk[ipol] > upper || sk[ipol] < lower)
+      zap = true;
+
+  // only ever set flags, so that elements flagged earlier are left untouched
+  if (zap)
+    outdat[idat] = 1;
+}
+
+
+void CUDA::SKDetectorEngine::detect_tscr (const dsp::TimeSeries* input,
+      const dsp::TimeSeries* input_tscr, dsp::BitSeries* output,
+      float upper_thresh, float lower_thresh)
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKDetectorEngine::detect_tscr()" << endl;
+  const unsigned nchan   = input->get_nchan();
+  const unsigned npol    = input->get_npol();
+  const int64_t ndat     = output->get_ndat();
+
+  // indat holds the time-scrunched SK estimates that are tested against the thresholds
+  const float * indat    = input_tscr->get_dattfp();
+
+  // outdat is the bitmask
+  unsigned char * outdat = output->get_datptr();
+
+  // this kernel is indexed on output rather than input
+  const uint64_t nval = ndat * nchan;
+  if (nval == 0)
+    return; // a launch with zero blocks is an invalid configuration
+  uint64_t nblocks  = nval / max_threads_per_block;
+  if (nval % max_threads_per_block)
+    nblocks++;
+
+  dim3 threads (max_threads_per_block);
+  dim3 blocks (nblocks);
+  unsigned shared_bytes = nchan*npol*sizeof(char);
+
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKDetectorEngine::detect_tscr_element ndat=" << ndat
+         << " nchan=" << nchan << " nval=" << nval
+         << " max_threads=" << max_threads_per_block
+         << " nblocks=" << nblocks << endl;
+
+  detect_tscr_element<<<blocks,threads,shared_bytes,stream>>>(indat, outdat, nval, upper_thresh, lower_thresh, npol, nchan);
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error( "CUDA::SKDetectorEngine::detect_tscr_element" );
+#ifdef _DEBUG
+  int sum = count_mask(output);
+  cerr << "CUDA::SKDetectorEngine::detect_tscr mask_sum=" << sum << endl;
+#endif
+}
+
+
+void CUDA::SKDetectorEngine::reset_mask (dsp::BitSeries* output)
+{
+  unsigned nchan         = output->get_nchan();
+  int64_t ndat           = output->get_ndat();
+  unsigned char * outdat = output->get_datptr();
+
+  // clear nchan * ndat bytes of the mask, asynchronously on the engine's stream
+  size_t nbytes = nchan * ndat;
+  cudaError error = cudaMemsetAsync (outdat, 0, nbytes, stream);
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "CUDA::SKDetectorEngine::reset_mask ",
+                 "cudaMemsetAsync (%p, 0, %lu): %s", outdat, (unsigned long) nbytes,
+                 cudaGetErrorString (error));
+#ifdef _DEBUG
+  int sum = count_mask(output);
+  cerr << "CUDA::SKDetectorEngine::reset_mask sum now " << sum << endl;
+#endif
+}
+
+int CUDA::SKDetectorEngine::count_mask (const dsp::BitSeries* output)
+{
+  unsigned char * outdat = const_cast<unsigned char *>(output->get_datptr());
+  const unsigned nchan   = output->get_nchan();
+  const int64_t ndat     = output->get_ndat();
+  int sum = 0;
+/*
+  const uint64_t nval    = (uint64_t) ndat * nchan;
+  cudaStreamSynchronize(stream);
+  thrust::device_ptr<unsigned char> d = thrust::device_pointer_cast(outdat);
+  int sum = thrust::reduce(thrust::cuda::par.on(stream), d, d+nval, (int) 0, thrust::plus<int>());
+  cudaStreamSynchronize(stream);
+*/
+  // NOTE(review): the thrust reduction above is commented out, so this always returns 0
+  return sum;
+}
+
+float * CUDA::SKDetectorEngine::get_estimates (const dsp::TimeSeries * input)
+{
+  transfer_estimates->set_input (input);   // source of the transfer; its output is estimates_host
+  transfer_estimates->operate ();          // presumably enqueues the copy on stream -- confirm in TransferCUDA
+  cudaStreamSynchronize (stream);          // wait for the work queued on stream to complete
+  return estimates_host->get_dattfp();     // data pointer of the host-side container
+}
+
+unsigned char * CUDA::SKDetectorEngine::get_zapmask (const dsp::BitSeries * input)
+{
+  transfer_zapmask->set_input (input);     // source of the transfer; its output is zapmask_host
+  transfer_zapmask->operate ();            // presumably enqueues the copy on stream -- confirm in TransferBitSeriesCUDA
+  cudaStreamSynchronize (stream);          // wait for the work queued on stream to complete
+  return zapmask_host->get_datptr();       // data pointer of the host-side container
+}
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKFilterbank.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbank.C
--- bl-dspsr-0+git20160405/Signal/General/SKFilterbank.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbank.C	2018-03-12 23:02:35.000000000 +0000
@@ -63,6 +63,13 @@
 void dsp::SKFilterbank::custom_prepare ()
 {
   output->set_order( TimeSeries::OrderTFP );
+  if (engine) // with an engine attached, set it up and let it prepare for nsamp_fft-point transforms
+  {
+    if (verbose) cerr << "dsp::SKFilterbank::custom_prepare engine->setup()" << endl;
+    engine->setup();
+    if (verbose) cerr << "dsp::SKFilterbank::custom_prepare engine->prepare (input, " << nsamp_fft << ")" << endl;
+    engine->prepare (input, nsamp_fft);
+  }
 }
 
 /* 
@@ -71,7 +78,6 @@
  */
 uint64_t dsp::SKFilterbank::get_skfb_inc (uint64_t blocksize)
 {
-
   if (verbose)
     cerr << "dsp::SKFilterbank::get_skfb_inc M=" << tscrunch 
          << " nsamp_fft=" << nsamp_fft << " blocksize=" << blocksize << endl;
@@ -139,101 +145,110 @@
     output_tscr->set_npol(npol);
     output_tscr->set_ndim(1);
     output_tscr->resize(1);
-
-    if (verbose)
-      cerr << "dsp::SKFilterbank::filterbank S?_tscr.resize(" << nchan*npol << ")" << endl;
-    S1_tscr.resize(nchan * npol);
-    S2_tscr.resize(nchan * npol);
   }
 
-  // initialise tscr
-  if (output_tscr)
+  if (engine)
+  {
+    engine->perform (input, output, output_tscr);
+  }
+  else
   {
-    for (unsigned i=0; i<nchan*npol; i++)
+    if (output_tscr && nchan > output_tscr->get_nchan())
+    { 
+      if (verbose)
+        cerr << "dsp::SKFilterbank::filterbank S?_tscr.resize(" << nchan*npol << ")" << endl;
+      S1_tscr.resize(nchan * npol);
+      S2_tscr.resize(nchan * npol);
+    }
+
+    // initialise tscr
+    if (output_tscr)
     {
-      S1_tscr[i]=0;
-      S2_tscr[i]=0;
+      for (unsigned i=0; i<nchan*npol; i++)
+      {
+        S1_tscr[i]=0;
+        S2_tscr[i]=0;
+      }
     }
-  }
-  if (verbose)
-    cerr << "dsp::SKFilterbank::filterbank starting threads" << endl;
+    if (verbose)
+      cerr << "dsp::SKFilterbank::filterbank starting threads" << endl;
 
-  // start SK threads
-  start_threads();
+    // start SK threads
+    start_threads();
 
-  // wait for completion
-  wait_threads();
+    // wait for completion
+    wait_threads();
 
-  if (verbose)
-    cerr << "dsp::SKFilterbank::filterbank threads ended" << endl;
+    if (verbose)
+      cerr << "dsp::SKFilterbank::filterbank threads ended" << endl;
 
-  // now we need to combine the results from each SK Thread. Note
-  // first thread should already be "in place"
+    // now we need to combine the results from each SK Thread. Note
+    // first thread should already be "in place"
 
-  uint64_t decimated_ndat = ndat / (nsamp_fft * tscrunch);
-  uint64_t thread_ndat = decimated_ndat / n_threads;
-  uint64_t ndat_span = decimated_ndat / n_threads;
-  uint64_t out_span = nchan * npol;
-  uint64_t in_span  = nchan * npol * 2 * tscrunch;
+    uint64_t decimated_ndat = ndat / (nsamp_fft * tscrunch);
+    uint64_t thread_ndat = decimated_ndat / n_threads;
+    uint64_t ndat_span = decimated_ndat / n_threads;
+    uint64_t out_span = nchan * npol;
+    uint64_t in_span  = nchan * npol * 2 * tscrunch;
 
-  if (debugd < 1)
-    cerr << "dsp::SKFilterbank::filterbank out_span=" << out_span << " in_span=" << in_span << endl;
+    if (debugd < 1)
+      cerr << "dsp::SKFilterbank::filterbank out_span=" << out_span << " in_span=" << in_span << endl;
 
-  // the 0th thread will operate inplace, only need to memcpy the others
-  for (unsigned ithread=1; ithread<n_threads; ithread++)
-  {
-    // last thread can have additional ndat
-    if (ithread == n_threads - 1)
+    // the 0th thread will operate inplace, only need to memcpy the others
+    for (unsigned ithread=1; ithread<n_threads; ithread++)
     {
-      thread_ndat += decimated_ndat % n_threads;
-    }
-
-    uint64_t out_offset = ithread * ndat_span * out_span;
-    uint64_t in_offset  = ithread * ndat_span * in_span;
+      // last thread can have additional ndat
+      if (ithread == n_threads - 1)
+      {
+        thread_ndat += decimated_ndat % n_threads;
+      }
+
+      uint64_t out_offset = ithread * ndat_span * out_span;
+      uint64_t in_offset  = ithread * ndat_span * in_span;
+
+      float *outdat = output->get_dattfp () + out_offset;
+      float *indat  = output->get_dattfp () + in_offset;
+      size_t size = thread_ndat * out_span * sizeof(float);
+
+      if (debugd < 1)
+        cerr << "dsp::SKFilterbank::filterbank [" << ithread << "] memcpy "
+             << " out_offset=" << out_offset << " in_offset=" << in_offset
+             << " ndat=" << thread_ndat * out_span << " size=" << size << endl;
 
-    float *outdat = output->get_dattfp () + out_offset;
-    float *indat  = output->get_dattfp () + in_offset;
-    size_t size = thread_ndat * out_span * sizeof(float);
+      memcpy (outdat, indat, size);
+    }
 
+    // now compute the SK statisics for the tscr vector, from the S1 and S2 arrays
     if (debugd < 1)
-      cerr << "dsp::SKFilterbank::filterbank [" << ithread << "] memcpy "
-           << " out_offset=" << out_offset << " in_offset=" << in_offset
-           << " ndat=" << thread_ndat * out_span << " size=" << size << endl;
-
-    memcpy (outdat, indat, size);
-  }
-
-  // now compute the SK statisics for the tscr vector, from the S1 and S2 arrays
-  if (debugd < 1)
-    cerr << "dsp::SKFilterbank::filterbank calculating tscrunch SK estimates" << endl;
-
-  if (output_tscr)
-  {
-    float S1 = 0;
-    float S2 = 0;
-    float M = (float) (tscrunch * decimated_ndat);
-    float M_fac = (M+1) / (M-1);
-    float * outdat = output_tscr->get_dattfp();
+      cerr << "dsp::SKFilterbank::filterbank calculating tscrunch SK estimates" << endl;
 
-    if (debugd < 1)
-      cerr << "dsp::SKFilterbank::filterbank tscr M=" << M <<" M_fac=" << M_fac << endl;
-    for (unsigned ichan=0; ichan<nchan; ichan++)
-    { 
-      // pol0 
-      S1 = S1_tscr[2*ichan];
-      S2 = S2_tscr[2*ichan];
-      outdat[2*ichan] = M_fac * (M * (S2 / (S1*S1)) - 1);
-
-      // pol1
-      S1 = S1_tscr[2*ichan+1];
-      S2 = S2_tscr[2*ichan+1];
-      outdat[2*ichan+1] = M_fac * (M * (S2 / (S1*S1)) - 1);
+    if (output_tscr)
+    {
+      float S1 = 0;
+      float S2 = 0;
+      float M = (float) (tscrunch * decimated_ndat);
+      float M_fac = (M+1) / (M-1);
+      float * outdat = output_tscr->get_dattfp();
+
+      if (debugd < 1)
+        cerr << "dsp::SKFilterbank::filterbank tscr M=" << M <<" M_fac=" << M_fac << endl;
+      for (unsigned ichan=0; ichan<nchan; ichan++)
+      { 
+        // pol0 
+        S1 = S1_tscr[2*ichan];
+        S2 = S2_tscr[2*ichan];
+        outdat[2*ichan] = M_fac * (M * (S2 / (S1*S1)) - 1);
+
+        // pol1
+        S1 = S1_tscr[2*ichan+1];
+        S2 = S2_tscr[2*ichan+1];
+        outdat[2*ichan+1] = M_fac * (M * (S2 / (S1*S1)) - 1);
+      }
     }
-  }
- 
+  } 
   if (debugd < 1)
     cerr << "dsp::SKFilterbank::filterbank setting ndat=" << nscrunches << endl;
-  
+    
   output->set_ndat (nscrunches);
   output->set_npol (npol);
   output->set_state (Signal::PPQQ);
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKFilterbankCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbankCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/SKFilterbankCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbankCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,252 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/SKFilterbankCUDA.h"
+#include "dsp/MemoryCUDA.h"
+
+#include "CUFFTError.h"
+#include "Error.h"
+#include "templates.h"
+#include "debug.h"
+
+#include <stdio.h>
+#include <memory>
+#include <string.h>
+
+//#define _DEBUG 1
+
+using namespace std;
+
+void check_error_stream (const char*, cudaStream_t);
+
+/* Square-law detect M consecutive spectra per block and reduce them to S1 and S2 sums and an SK estimate */
+__global__ void reduce_sqld (cufftComplex* input, cufftComplex* output, float * skout, unsigned nchan, unsigned npol, unsigned M)
+{
+  // each block is a tscrunch of M spectra, threads stride over the channels;
+  // skout is interleaved over npol polarisations, hence its nchan * npol stride per block
+  input  += (blockIdx.x * nchan * M);
+  output += (blockIdx.x * nchan);
+  skout  += (blockIdx.x * nchan * npol);
+
+  // M is unsigned: convert before dividing, otherwise the factor is truncated to an integer
+  const float M_fac = float(M+1) / float(M-1);
+
+  cufftComplex val;
+  for (unsigned ichan=threadIdx.x; ichan<nchan; ichan+=blockDim.x)
+  {
+    float s1 = 0;
+    float s2 = 0;
+    cufftComplex* in = input;
+    for (unsigned idat=0; idat<M; idat++)
+    {
+      val = in[ichan];
+      float power = ((val.x * val.x) + (val.y * val.y));
+      s1 += power;
+      s2 += power * power;
+      in += nchan;
+    }
+    val.x = s1;
+    val.y = s2;
+    output[ichan] = val;
+
+    // write out the SK estimate for block of M
+    skout[npol*ichan] = M_fac * (M * (s2 / (s1 * s1)) - 1);
+  }
+}
+
+/* accumulate the S1 (.x) and S2 (.y) sums over ndat rows and form one SK estimate per channel */
+__global__ void reduce_sk_estimate (cufftComplex* input, float * output, unsigned nchan, unsigned npol, unsigned ndat, float M)
+{
+  // input is laid out time-major: ndat rows of nchan (S1, S2) pairs
+  const float scale = (M+1) / (M-1);
+
+  for (unsigned ichan=threadIdx.x; ichan<nchan; ichan+=blockDim.x)
+  {
+    float total_s1 = 0;
+    float total_s2 = 0;
+
+    // walk down this channel's column, one row per interval
+    for (unsigned row=0; row<ndat; row++)
+    {
+      const cufftComplex pair = input[((size_t) row * nchan) + ichan];
+      total_s1 += pair.x;
+      total_s2 += pair.y;
+    }
+    // results are interleaved over npol polarisations in the output
+    const float ratio = total_s2 / (total_s1 * total_s1);
+    output[npol*ichan] = scale * (M * ratio - 1);
+  }
+}
+
+CUDA::SKFilterbankEngine::SKFilterbankEngine (dsp::Memory * _memory, unsigned _tscrunch)
+{
+  memory = dynamic_cast<CUDA::DeviceMemory*>(_memory);
+  stream = memory->get_stream();
+  tscrunch = _tscrunch;
+  buffer = 0; sums = 0; buffer_size = 0; sums_size = 0;  // prepare() tests these before allocating
+  cufftResult result = cufftCreate (&plan);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::SKFilterbankEngine::SKFilterbankEngine",
+                      "cufftCreate(plan)");
+  npt = 0; nbatch = 0;  // nothing planned yet; perform() compares its batch count against nbatch
+}
+
+CUDA::SKFilterbankEngine::~SKFilterbankEngine ()
+{
+  cufftDestroy (plan);  // release the plan handle created by the constructor
+}
+void CUDA::SKFilterbankEngine::setup ()
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SKFilterbankEngine::setup ()" << endl;
+
+  // cache the active device's per-block thread limit
+  int active_device = 0;
+  cudaGetDevice (&active_device);
+  cudaDeviceProp props;
+  cudaGetDeviceProperties (&props, active_device);
+  max_threads_per_block = props.maxThreadsPerBlock;
+}
+
+void CUDA::SKFilterbankEngine::prepare (const dsp::TimeSeries * input, unsigned _npt)
+{
+  // real or complex input; type and nbatch are read by perform(), so they are not shadowed by locals
+  type = CUFFT_C2C;
+  if (input->get_state() == Signal::Nyquist)
+    type = CUFFT_R2C;
+  npt = _npt;
+  unsigned ndim = input->get_ndim();
+  uint64_t ndat = input->get_ndat();
+  nbatch = (ndat / npt);
+  // 1D transform
+  int rank = 1;
+  int inembed[1] = { npt };
+  int onembed[1] = { npt / ndim };
+  // distance between successive elements
+  int istride = 1;
+  int ostride = 1;
+  // distance between successive batches
+  int idist = npt;
+  int odist = npt / ndim;
+  nchan = odist;
+  if (nbatch == 0)
+    return; // nothing to plan yet; perform() calls prepare again when its batch count differs
+  // a cuFFT handle can be planned only once, so start from a fresh handle on every call
+  cufftDestroy (plan);
+  cufftResult result = cufftCreate (&plan);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::SKFilterbankEngine::prepare", "cufftCreate(plan)");
+  size_t work_size;
+  result = cufftMakePlanMany (plan, rank, &npt,
+                              inembed, istride, idist,
+                              onembed, ostride, odist,
+                              type, nbatch, &work_size);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::SKFilterbankEngine::prepare", "cufftMakePlanMany(plan)");
+  result = cufftSetStream (plan, stream);
+  if (result != CUFFT_SUCCESS)
+    throw CUFFTError (result, "CUDA::SKFilterbankEngine::prepare",
+          "cufftSetStream(plan)");
+
+  size_t bytes_required = nbatch * nchan * sizeof (cufftComplex);
+  if (bytes_required > buffer_size)
+  {
+    if (buffer)
+      memory->do_free (buffer);
+    buffer = memory->do_allocate (bytes_required);
+    buffer_size = bytes_required;
+  }
+  bytes_required = (nbatch / tscrunch) * nchan * sizeof(cufftComplex);
+  if (bytes_required > sums_size)
+  {
+    if (sums)
+      memory->do_free (sums);
+    sums = memory->do_allocate (bytes_required); // allocate what is needed, not the previous size
+    sums_size = bytes_required;
+  }
+}
+
+void CUDA::SKFilterbankEngine::perform (const dsp::TimeSeries* input,
+                                        dsp::TimeSeries* output,
+                                        dsp::TimeSeries* output_tscr)
+{
+  if (dsp::Operation::verbose)
+    std::cerr << "CUDA::SKFilterbankEngine::perform()" << std::endl;
+
+  uint64_t ndat  = input->get_ndat();
+  unsigned npol  = input->get_npol ();
+  unsigned npart = (unsigned) (ndat / npt);
+
+  if (input->get_order() != dsp::TimeSeries::OrderFPT)
+    throw Error(InvalidState, "CUDA::SKFilterbankEngine::perform",
+                "Only OrderFPT input order is supported");
+
+  // TODO decide what to do about multi-input channel data
+  // adjust FFT plan if required, TODO work on how npt is passed
+  if (npart != nbatch)
+    prepare (input, npt);
+
+  // only complete groups of tscrunch spectra are reduced; each group is handled by one block
+  const unsigned nscrunch = npart / tscrunch;
+  if (nscrunch == 0)
+    return;
+  // the kernels stride over the channels, so the block size can be capped at the device limit
+  int nthread = nchan;
+  if (max_threads_per_block > 0 && nthread > (int) max_threads_per_block)
+    nthread = (int) max_threads_per_block;
+
+  // FFT output buffer from batched FFT
+  cufftComplex * buf = (cufftComplex *) buffer;
+  // output SK estimates at (1/M) time sampling
+  float * out = (float *) output->get_dattfp();
+  // output SK estimates at block resolution (optional)
+  float * out_tscr = output_tscr ? (float *) output_tscr->get_dattfp() : 0;
+
+  unsigned input_nchan = input->get_nchan ();
+  if (dsp::Operation::verbose)
+    std::cerr << "CUDA::SKFilterbankEngine::perform ndat=" << ndat 
+              << " input_nchan=" << input_nchan << " output_nchan=" << nchan 
+              << " npol=" << npol << " tscrunch=" << tscrunch << std::endl;
+
+  for (unsigned ipol=0; ipol<npol; ipol++)
+  {
+    // batch FFT all the input data of this polarisation; the spectra land in buf in TF format
+    float * in = (float *) input->get_datptr (0, ipol);
+    if (type == CUFFT_R2C)
+      fft_real ((cufftReal *) in, buf);
+    else
+      fft_complex ((cufftComplex *)in, buf);
+
+    // convert the spectra into tscrunched S1 and S2 sums in Re and Im
+    reduce_sqld<<<nscrunch,nthread,0,stream>>> (buf, (cufftComplex *) sums, out + ipol, nchan, npol, tscrunch);
+
+    // compute a tscrunched output SK from all nscrunch sums, i.e. over tscrunch * nscrunch spectra
+    if (out_tscr)
+      reduce_sk_estimate<<<1,nthread,0,stream>>>((cufftComplex *) sums, out_tscr + ipol, nchan, npol, nscrunch, tscrunch * nscrunch);
+  }
+
+  check_error_stream("CUDA::SKFilterBank::perform", stream);
+}
+
+void CUDA::SKFilterbankEngine::fft_real (cufftReal *in, cufftComplex * out)
+{
+  // execute the engine's plan as a real-to-complex transform; a cuFFT failure becomes an exception
+  const cufftResult status = cufftExecR2C (plan, in, out);
+  if (status != CUFFT_SUCCESS)
+    throw CUFFTError (status, "CUDA::SKFilterbankEngine::fft_real", "cufftExecR2C(plan)");
+}
+
+void CUDA::SKFilterbankEngine::fft_complex (cufftComplex *in, cufftComplex * out)
+{
+  // execute the engine's plan as a forward complex-to-complex transform; a cuFFT failure becomes an exception
+  const cufftResult status = cufftExecC2C (plan, in, out, CUFFT_FORWARD);
+  if (status != CUFFT_SUCCESS)
+    throw CUFFTError (status, "CUDA::SKFilterbankEngine::fft_complex", "cufftExecC2C(plan)");
+}
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKMasker.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMasker.C
--- bl-dspsr-0+git20160405/Signal/General/SKMasker.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMasker.C	2018-03-12 23:02:35.000000000 +0000
@@ -63,6 +63,7 @@
   const unsigned ddfb_nchan = input->get_nchan();
   const uint64_t ddfb_ndat  = input->get_ndat();
   const uint64_t ddfb_npol  = input->get_npol();
+  const unsigned ddfb_ndim  = input->get_ndim();
 
   const unsigned mask_nchan = mask_input->get_nchan();
   const unsigned mask_npol  = mask_input->get_npol();
@@ -71,8 +72,6 @@
   const uint64_t ddfb_input_sample = input->get_input_sample();
   const uint64_t mask_input_sample = mask_input->get_input_sample();
 
-  const unsigned output_ndim = output->get_ndim();
-
   if (mask_npol != 1)
     throw Error (InvalidParam, "dsp::SKMasker::transformation",
                  "mask_npol != 1");
@@ -92,8 +91,13 @@
   // indicate the output timeseries contains zeroed data
   output->set_zeroed_data (true);
 
-  // and resize the output to ensure the hits array is reallocated
-  output->resize (output->get_ndat());
+  // resize the output to ensure the hits array is reallocated
+  if (engine)
+  {
+    if (verbose)
+      cerr << "dsp::SKMasker::transformation output->resize(" << output->get_ndat() << ")" << endl;
+    output->resize (output->get_ndat());
+  }
 
   // get base pointer to mask bitseries
   unsigned char * mask = mask_input->get_datptr ();
@@ -162,7 +166,11 @@
   uint64_t ddfb_end_idat;
 
   if (engine)
-    engine->setup (ddfb_nchan, ddfb_npol, output->get_nfloat_span());
+  {
+    if (verbose)
+      cerr << "dsp::SKMasker::transformation engine->setup()" << endl;
+    engine->setup ();
+  }
 
   for (uint64_t idat=0; idat < mask_ndat; idat++)
   {
@@ -208,10 +216,10 @@
     if (engine) 
     {
       unsigned mask_offset = mask_nchan * mask_npol * idat;
-      unsigned offset      = ddfb_start_idat*output_ndim;
-      unsigned end         = ddfb_nsamples*output_ndim;
+      unsigned offset      = ddfb_start_idat*ddfb_ndim;
+      unsigned end         = ddfb_nsamples*ddfb_ndim;
 
-      engine->perform (mask_input, mask_offset, output, offset, end);
+      //engine->perform (mask_input, mask_offset, output, offset, end);
     }
     else
     {
@@ -220,13 +228,11 @@
       {
         if (mask[ichan])
         {
-          float * zerop0 = output->get_datptr(ichan, 0) + (ddfb_start_idat*output_ndim);
-          float * zerop1 = output->get_datptr(ichan, 1) + (ddfb_start_idat*output_ndim);
-
-          for (unsigned j=0; j<ddfb_nsamples*output_ndim; j++)
+          for (unsigned ipol=0; ipol < ddfb_npol; ipol++)
           {
-            zerop0[j] = 0;
-            zerop1[j] = 0;
+            float * zero = output->get_datptr(ichan, ipol) + (ddfb_start_idat*ddfb_ndim);
+            for (unsigned j=0; j<ddfb_nsamples*ddfb_ndim; j++)
+              zero[j] = 0;
           }
         }
       }
diff -Nru bl-dspsr-0+git20160405/Signal/General/SKMaskerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMaskerCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/SKMaskerCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMaskerCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -15,21 +15,14 @@
 
 void check_error (const char*);
 
-CUDA::SKMaskerEngine::SKMaskerEngine (cudaStream_t _stream)
+CUDA::SKMaskerEngine::SKMaskerEngine (dsp::Memory * memory)
 {
-  stream = _stream;
+  device_memory = dynamic_cast<CUDA::DeviceMemory *>(memory);
+  stream = device_memory->get_stream();
 }
 
-void CUDA::SKMaskerEngine::setup (unsigned _nchan, unsigned _npol, unsigned _span)
+void CUDA::SKMaskerEngine::setup ()
 {
-  if (dsp::Operation::verbose)
-    cerr << "CUDA::SKMaskerEngine::setup nchan=" << _nchan << " npol=" << _npol
-         << " span=" << _span << endl;
-
-  nchan = _nchan;
-  npol = _npol;
-  span = _span;
-
   // determine GPU capabilities 
   int device = 0;
   cudaGetDevice(&device);
@@ -38,88 +31,102 @@
   max_threads_per_block = device_properties.maxThreadsPerBlock;
 }
 
-
-/* cuda kernel to mask 1 channel for both polarisations */
-__global__ void mask1chan (unsigned char * mask_base,
-           float * out_base,
-           unsigned npol,
-           unsigned end,
-           unsigned span)
-{
-  // ichan = blockIdx.x * blockDim.x + threadIdx.x
-
-  float * p0 = out_base + span * npol * (blockIdx.x * blockDim.x + threadIdx.x);
-  float * p1 = out_base + span * npol * (blockIdx.x * blockDim.x + threadIdx.x) + span;
-
-  mask_base += (blockIdx.x * blockDim.x + threadIdx.x);
-
-  if (mask_base[0])
-  {
-    for (unsigned j=0; j<end; j++)
-    {
-      p0[j] = 0;
-      p1[j] = 0;
-    }
-  }
-
-}
-
 /*
  *  masks just 1 sample in the DDFB for the given SKFB channel and sample, uses __sync_threads
  *  with a __shared__ mask to improve read performance on access to the common mask value for all
  *  threads in a block
  */
 __global__ void mask1sample (unsigned char * mask_base,
-           float * out_base,
+           const float2 * in_base,
+           float2 * out_base,
+           uint64_t in_stride,   // float2 elements between consecutive (chan,pol) planes of the input
+           uint64_t out_stride,  // float2 elements between consecutive (chan,pol) planes of the output
+           uint64_t ndat,        // samples per plane; threads with idat >= ndat exit immediately
            unsigned npol,
-           unsigned end,
-           unsigned span)
+           unsigned M)           // number of consecutive samples that share one mask entry
 {
-  int ichan = blockIdx.x;
+  const unsigned idat = blockIdx.x * blockDim.x + threadIdx.x; // one thread per time sample
+  if (idat >= ndat)
+    return;
 
-  __shared__ char mask;
+  const unsigned ichan = blockIdx.y; // grid y index selects the channel
+  const unsigned imask = idat / M;   // index of the mask entry covering this sample
 
-  if (threadIdx.x == 0)
-    mask = mask_base[ichan];
+  // load the mask straight from mask_base (no __shared__ staging): gridDim.y entries per imask row
+  const unsigned char mask = mask_base[imask * gridDim.y + ichan];
 
-  __syncthreads();
+  // forward pointer to pol0 for this chan
+  out_base += ichan * npol * out_stride;
+  in_base  += ichan * npol * in_stride;
 
-  // zap if mask 
-  if (mask)
-  {
-    int idat = threadIdx.x;
-    int out_offset = (span * npol * ichan) + idat;
 
-    out_base[out_offset] = 0;         // p0
-    out_base[out_offset +span ] = 0;  // p1
+  for (unsigned ipol=0; ipol<npol; ipol++) // the same mask value is applied to every polarisation
+  {
+    if (mask)
+    {
+      out_base[idat].x = 0;
+      out_base[idat].y = 0;
+    }
+    else
+    {
+      out_base[idat] = in_base[idat]; // unmasked samples are copied from input to output
+    }
+    in_base  += in_stride;
+    out_base += out_stride;
   }
 }
 
 
-void CUDA::SKMaskerEngine::perform (dsp::BitSeries* mask, unsigned mask_offset, 
-           dsp::TimeSeries * output, unsigned offset, unsigned end)
+void CUDA::SKMaskerEngine::perform (dsp::BitSeries* mask, const dsp::TimeSeries * input,
+           dsp::TimeSeries * output, unsigned M)
 {
 
   if (dsp::Operation::verbose)
-    cerr << "CUDA::SKMaskerEngine::perform mask_offset=" << mask_offset << " offset=" << offset << " end=" << end << endl;
+    cerr << "CUDA::SKMaskerEngine::perform M=" << M << endl;
+  
+  // use output, since input may be InputBuffered
+  uint64_t ndat  = output->get_ndat();
+  unsigned nchan = output->get_nchan();
+  unsigned npol  = output->get_npol();
+  unsigned ndim  = output->get_ndim();
+  // TODO assert that ndim == 2 (the float2 casts below rely on it)
 
   // order is FPT
-  float * out_base = output->get_datptr(0, 0) + offset;
-  unsigned char * mask_base = mask->get_datptr() + mask_offset;
+  const float2 * in_base = (const float2 *) input->get_datptr (0, 0);
+  float2 * out_base = (float2 *) output->get_datptr (0, 0);
 
-  if (end > max_threads_per_block)
+  // order is TFP
+  unsigned char * mask_base = mask->get_datptr();
+
+  uint64_t in_stride, out_stride;
+  if (npol == 1) // single pol: measure the distance between channels 0 and 1
   {
-    dim3 threads (128);
-    dim3 blocks (nchan/threads.x);
-    mask1chan<<<blocks,threads,0,stream>>> (mask_base, out_base, npol, end, span);
+    in_stride = input->get_datptr (1, 0) - input->get_datptr (0, 0); // NOTE(review): reads channel 1; presumably needs nchan > 1
+    out_stride = output->get_datptr (1, 0) - output->get_datptr (0, 0);
   }
   else
   {
-    dim3 threads (end);
-    dim3 blocks (nchan);
-    mask1sample<<<blocks,threads,0,stream>>> (mask_base, out_base, npol, end, span);
+    in_stride = input->get_datptr (0, 1) - input->get_datptr (0, 0); // distance between pol 0 and pol 1 of channel 0
+    out_stride = output->get_datptr (0, 1) - output->get_datptr (0, 0);
   }
 
+  // the pointer differences above count floats; divide by ndim to count float2 elements as the kernel expects
+  in_stride /= ndim;
+  out_stride /= ndim;
+
+  unsigned threads = max_threads_per_block;
+  dim3 blocks (ndat / threads, nchan); // x: one thread per sample, y: one grid row per channel
+  if (ndat % threads)
+    blocks.x++; // round up so a partial final block covers the remaining samples
+
+#ifdef _DEBUG
+  cerr << "CUDA::SKMaskerEngine::perform ndat=" << ndat << " nchan=" << nchan << " npol=" << npol << " ndim=" << ndim << endl;
+  cerr << "CUDA::SKMaskerEngine::perform in_stride=" << in_stride << " out_stride=" << out_stride << endl;
+  cerr << "CUDA::SKMaskerEngine::perform blocks=(" << blocks.x << ", " << blocks.y << ") threads=" << threads << endl;
+#endif
+
+  mask1sample<<<blocks,threads,0,stream>>> (mask_base, in_base, out_base, in_stride, out_stride, ndat, npol, M);
+
   if (dsp::Operation::record_time || dsp::Operation::verbose)
     check_error( "CUDA::SKMaskerEngine::perform" );
 }
diff -Nru bl-dspsr-0+git20160405/Signal/General/SpectralKurtosis.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosis.C
--- bl-dspsr-0+git20160405/Signal/General/SpectralKurtosis.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosis.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,790 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/SpectralKurtosis.h"
+#include "dsp/InputBuffering.h"
+#include "dsp/SKLimits.h"
+
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+
+using namespace std;
+
+dsp::SpectralKurtosis::SpectralKurtosis() : Transformation<TimeSeries,TimeSeries>("SpectralKurtosis", outofplace)
+{
+  M = 128;     // default number of input samples integrated per SK estimate
+  debugd = 1;  // extra debug output is only produced while debugd < 1
+
+  estimates = new TimeSeries;
+  estimates_tscr = new TimeSeries;
+  zapmask = new BitSeries;
+
+  // SK Detector
+  std_devs = 3;
+  channels.resize(2);          // [start, end) channel range; end == 0 is replaced by nchan in detect()
+  npart_total = 0;
+  thresholds.resize(2);        // [lower, upper]
+  thresholds_tscr.resize(2);   // [lower, upper]
+  zap_counts.resize(4);
+  detection_flags.resize(3);   // [disable_fscr, disable_tscr, disable_ft], see set_options()
+  std::fill (detection_flags.begin(), detection_flags.end(), false);
+  npart = 0;                   // prepare_output() reads npart before transformation()/reserve() first assign it
+  M_tscr = 0;
+
+  unfiltered_hits = 0;
+
+  prepared = false;
+
+  set_buffering_policy(new InputBuffering(this));
+}
+
+dsp::SpectralKurtosis::~SpectralKurtosis ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::SpectralKurtosis~" << endl;
+
+  float percent_all = 0;
+  float percent_skfb = 0;
+  float percent_tscr = 0;
+  float percent_fscr = 0;
+
+  if (npart_total) // percentages are relative to the npart*nchan total accumulated by detect()
+  {
+    percent_all  = (100 * (float) zap_counts[ZAP_ALL]  / (float) npart_total);
+    percent_skfb = (100 * (float) zap_counts[ZAP_SKFB] / (float) npart_total);
+    percent_tscr = (100 * (float) zap_counts[ZAP_TSCR] / (float) npart_total);
+    percent_fscr = (100 * (float) zap_counts[ZAP_FSCR] / (float) npart_total);
+  }
+
+  cerr << "Zapped: " 
+       << " total=" << percent_all <<  "%" << " skfb=" << percent_skfb << "%"
+       << " tscr=" << percent_tscr << "%" << " fscr=" << percent_fscr << "%"
+       << endl; // '%' needs no escaping in a C++ string literal; "\%" is a non-standard escape sequence
+
+  delete estimates;
+  delete estimates_tscr;
+  delete zapmask;
+}
+
+bool dsp::SpectralKurtosis::get_order_supported (TimeSeries::Order order) const
+{
+  // return a value on every path: true only for FPT and TFP ordered data, false otherwise
+  return (order == TimeSeries::OrderFPT || order == TimeSeries::OrderTFP);
+}
+
+
+void dsp::SpectralKurtosis::set_engine (Engine* _engine)
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::set_engine()" << endl;
+  engine = _engine; // when set, prepare/compute/detect/mask delegate to the engine instead of the CPU paths
+}
+
+
+/*
+ * One-off preparation: caches the input dimensions, hands the input's memory
+ * manager to the internal containers and sizes the tscrunch accumulators
+ */
+void dsp::SpectralKurtosis::prepare ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::prepare()" << endl;
+
+  nchan = input->get_nchan();
+  npol = input->get_npol();
+  ndim = input->get_ndim();
+
+  Memory * memory = const_cast<Memory *>(input->get_memory()); // set_memory() takes a non-const pointer
+  estimates->set_memory (memory);
+  estimates_tscr->set_memory (memory);
+  zapmask->set_memory (memory);
+
+  if (has_buffering_policy())
+  {
+    get_buffering_policy()->set_minimum_samples (M); // at least M samples are needed for one SK estimate
+  }
+
+  if (engine)
+  {
+    engine->setup ();
+  }
+  else
+  {
+    if (!detection_flags[1]) // CPU accumulators are only sized when tscrunch detection is enabled
+    {
+      S1_tscr.resize(nchan * npol);
+      S2_tscr.resize(nchan * npol);
+    }
+  }
+
+  // ensure output containers are configured correctly
+  prepare_output ();
+
+  prepared = true; // transformation() calls prepare() only while this is false
+}
+
+/*! ensure output parameters are configured correctly (estimates, estimates_tscr, zapmask and output) */
+void dsp::SpectralKurtosis::prepare_output ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::prepare_output()" << endl;
+    
+  double mask_rate = input->get_rate() / M; // one SK estimate per M input samples
+
+  estimates->copy_configuration (get_input());
+  estimates->set_ndim (1);                      // SK estimates have only single dimension
+  estimates->set_order (TimeSeries::OrderTFP);  // stored in TFP order
+  estimates->set_scale (1.0);                   // no scaling
+  estimates->set_rate (mask_rate);              // rate is /= M
+
+  if (input->get_npol() == 2)
+    estimates->set_state (Signal::PPQQ);
+  else
+    estimates->set_state (Signal::Intensity);
+
+  double tscrunch_mask_rate = mask_rate;
+  if (npart > 0) // npart is assigned in reserve() and transformation()
+    tscrunch_mask_rate /= npart;
+
+  // tscrunched estimates have same configuration, except number of samples
+  estimates_tscr->copy_configuration (estimates);
+  estimates_tscr->set_order (TimeSeries::OrderTFP);  // stored in TFP order
+  estimates_tscr->set_rate (tscrunch_mask_rate);
+
+  // zap mask has same configuration as estimates with following changes
+  zapmask->copy_configuration (estimates);
+  zapmask->set_nbit (8); // one byte per mask entry
+  zapmask->set_npol (1); // a single mask value covers all polarisations
+
+  // configure output timeseries (out-of-place) to match input
+  output->copy_configuration (get_input()); 
+  output->set_input_sample (input->get_input_sample ());
+}
+
+/* ensure containers have correct dynamic size for the current input ndat */ 
+void dsp::SpectralKurtosis::reserve ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::reserve()" << endl;
+
+  const uint64_t ndat  = input->get_ndat();
+  npart = ndat / M;          // number of complete M-sample blocks in the input
+  output_ndat = npart * M;   // samples beyond the last complete block are not output
+
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::reserve input_ndat=" << ndat 
+         << " npart=" << npart << " output_ndat=" << output_ndat << endl;
+
+  // use resize since out of place operation
+  estimates->resize (npart);
+  estimates_tscr->resize (npart > 0); // 1 if npart != 0
+  zapmask->resize (npart);
+  output->resize (output_ndat);
+}
+
+/* run the full chain on the current input: prepare, size containers, then compute, detect and mask */
+void dsp::SpectralKurtosis::transformation ()
+{
+  if (!prepared)
+    prepare();
+
+  const uint64_t ndat  = input->get_ndat();
+  if (verbose || debugd < 1)
+    cerr << "dsp::SpectralKurtosis::transformation input ndat=" << ndat
+         << " tscrunch=" << M << endl;
+
+  npart = ndat / M;          // number of complete M-sample blocks
+  output_ndat = npart * M;   // number of input samples consumed by this call
+
+  if (verbose || debugd < 1)
+    cerr << "dsp::SpectralKurtosis::transformation input npart=" << npart
+         << " output_ndat=" << output_ndat << endl;
+
+  if (has_buffering_policy())
+  {
+    if (verbose || debugd < 1)
+      cerr << "dsp::SpectralKurtosis::transformation setting next_start_sample="
+           << output_ndat << endl;
+    get_buffering_policy()->set_next_start (output_ndat); // the next start is the first sample not consumed here
+  }
+
+  prepare_output ();
+
+  // ensure output containers are sized correctly
+  reserve ();
+  
+  if ((ndat == 0) || (npart == 0)) // nothing to do without at least one complete block
+    return;
+
+  // perform SK functions
+  compute ();
+  detect ();
+  mask ();
+  //insertsk();
+}
+
+//! Compute the SK estimator of every channel/pol for each block of M samples (estimates) and,
+//! unless tscrunch detection is disabled, for the whole input block (estimates_tscr)
+void dsp::SpectralKurtosis::compute ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::compute" << endl;
+
+  if (engine)
+  {
+    engine->compute (input, estimates, estimates_tscr, M);
+  }
+  else
+  {
+    // initialise tscr
+    if (!detection_flags[1])
+    {
+      std::fill(S1_tscr.begin(), S1_tscr.end(), 0);
+      std::fill(S2_tscr.begin(), S2_tscr.end(), 0);
+    }
+
+    float S1_sum, S2_sum;
+    const float M_fac = (float) (M+1) / (float) (M-1);  // floating point division: integer division truncates to 1
+    float * outdat = estimates->get_dattfp();
+
+    switch (input->get_order())
+    {
+      case dsp::TimeSeries::OrderTFP:
+      {
+        const unsigned int chan_stride = nchan * npol * ndim;  // floats per time sample
+        for (unsigned ipart=0; ipart < npart; ipart++)
+        {
+          float * indat = (float *) input->get_dattfp() + ((uint64_t) M * ipart * chan_stride);
+
+          for (unsigned ichan=0; ichan<nchan; ichan++)
+          {
+            for (unsigned ipol=0; ipol < npol; ipol++)
+            {
+              S1_sum = 0;
+              S2_sum = 0;
+              // Square Law Detect for S1 + S2
+              for (unsigned i=0; i<M; i++)
+              {
+                float re = indat[chan_stride*i];
+                float im = indat[chan_stride*i+1];
+                float sqld = (re * re) + (im * im);
+                S1_sum += sqld;
+                S2_sum += (sqld * sqld);
+              }
+
+              // add the sums to the M timeseries (only sized when tscr detection is enabled)
+              if (!detection_flags[1])
+              {
+                S1_tscr [ichan*npol + ipol] += S1_sum;
+                S2_tscr [ichan*npol + ipol] += S2_sum;
+              }
+
+              // calculate the SK estimator
+              if (S1_sum == 0)
+                outdat[ichan*npol + ipol] = 0;
+              else
+                outdat[ichan*npol + ipol] = M_fac * (M * (S2_sum / (S1_sum * S1_sum)) - 1);
+
+              indat += ndim;
+            }
+          }
+          outdat += nchan * npol;
+        }
+        break;
+      }
+
+      case dsp::TimeSeries::OrderFPT:
+      {
+        const unsigned int nfloat = M * ndim;
+        // foreach input channel
+        for (unsigned ipart=0; ipart < npart; ipart++)
+        {
+          for (unsigned ichan=0; ichan<nchan; ichan++)
+          {
+            for (unsigned ipol=0; ipol < npol; ipol++)
+            {
+              // input pointer for channel pol
+              const float* indat = input->get_datptr (ichan, ipol) + ipart * nfloat;
+
+              S1_sum = 0;
+              S2_sum = 0;
+
+              // Square Law Detect for S1 + S2
+              for (unsigned i=0; i<nfloat; i+=2)
+              {
+                float sqld = (indat[i] * indat[i]) + (indat[i+1] * indat[i+1]);
+                S1_sum += sqld;
+                S2_sum += (sqld * sqld);
+              }
+
+              // add the sums to the M timeseries
+              if (!detection_flags[1])
+              {
+                S1_tscr [ichan*npol + ipol] += S1_sum;
+                S2_tscr [ichan*npol + ipol] += S2_sum;
+              }
+
+              // calculate the SK estimator
+              if (S1_sum == 0)
+                outdat[ichan*npol + ipol] = 0;
+              else
+                outdat[ichan*npol + ipol] = M_fac * (M * (S2_sum / (S1_sum * S1_sum)) - 1);
+            }
+          }
+          outdat += nchan * npol;
+        }
+        break;
+      }
+
+      default:
+        throw Error (InvalidState, "dsp::SpectralKurtosis::compute", "unsupported input order");
+    }
+
+    // calculate the SK Estimator for the whole block of data
+    if (!detection_flags[1])
+    {
+      float M_t = (float) (M * npart);
+      float M_fac = (M_t+1) / (M_t-1);
+      float * outdat = estimates_tscr->get_dattfp();
+      if (verbose || debugd < 1)
+        cerr << "dsp::SpectralKurtosis::compute tscr M=" << M_t <<" M_fac=" << M_fac << endl;
+      for (unsigned ichan=0; ichan<nchan; ichan++)
+      {
+        for (unsigned ipol=0; ipol<npol; ipol++)
+        {
+          S1_sum = S1_tscr[ichan*npol + ipol];
+          S2_sum = S2_tscr[ichan*npol + ipol];
+          if (S1_sum == 0)
+            outdat[ichan*npol + ipol] = 0;
+          else
+            outdat[ichan*npol + ipol] = M_fac * (M_t * (S2_sum / (S1_sum * S1_sum)) - 1);
+        }
+      }
+    }
+  }
+
+  if (verbose || debugd < 1)
+    cerr << "dsp::SpectralKurtosis::compute done" << endl;
+  if (debugd < 1)
+    debugd++;
+}
+
+void dsp::SpectralKurtosis::set_thresholds (unsigned _M, unsigned _std_devs)
+{
+  M = _M;                // samples integrated per SK estimate
+  std_devs = _std_devs;  // passed to SKLimits together with M
+
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::set_thresholds SKlimits(" << M << ", " << std_devs << ")" << endl;
+  dsp::SKLimits limits(M, std_devs);
+  limits.calc_limits();
+
+  thresholds[0] = (float) limits.get_lower_threshold(); // index 0 holds the lower limit
+  thresholds[1] = (float) limits.get_upper_threshold(); // index 1 holds the upper limit
+
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::set_thresholds M=" << M << " std_devs="
+         << std_devs  << " [" << thresholds[0] << " - " << thresholds[1]
+         << "]" << endl;
+}
+
+void dsp::SpectralKurtosis::set_channel_range (unsigned start, unsigned end)
+{
+  channels[0] = start; // first channel of the range iterated by the detection loops
+  channels[1] = end;   // one past the last channel; 0 is replaced by nchan in detect()
+}
+
+void dsp::SpectralKurtosis::set_options (bool _disable_fscr, 
+    bool _disable_tscr, bool _disable_ft)
+{
+  detection_flags[0] = _disable_fscr; // true skips detect_fscr()
+  detection_flags[1] = _disable_tscr; // true skips detect_tscr() and the S1/S2 tscrunch accumulation
+  detection_flags[2] = _disable_ft;   // true skips detect_skfb()
+}
+
+void dsp::SpectralKurtosis::detect ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::detect" << endl;
+
+  // if no end channel was specified, do them all
+  if (channels[1] == 0)
+    channels[1] = nchan;
+
+  if (verbose || debugd < 1)
+  {
+    cerr << "dsp::SpectralKurtosis::detect npart= " << npart  
+         << " nchan=" << nchan << " nbit=" << input->get_nbit() 
+         << " npol=" << npol << " ndim=" << ndim << endl;
+
+    cerr << "dsp::SpectralKurtosis::detect OUTPUT ndat="
+         << zapmask->get_ndat() << " nchan=" << zapmask->get_nchan()
+         << " nbit=" << zapmask->get_nbit() << " npol=" << zapmask->get_npol() 
+         << " ndim=" << zapmask->get_ndim() << endl;
+  }
+
+  npart_total += (npart * nchan); // denominator of the percentages printed by the destructor
+
+  // reset the mask to all 0 (no zapping)
+  reset_mask();
+
+  // apply the tscrunched SKFB estimates to the mask
+  if (!detection_flags[1])
+    detect_tscr ();
+
+  // apply the SKFB estimates to the mask
+  if (!detection_flags[2])
+    detect_skfb ();
+
+  if (!detection_flags[0]) // runs last: detect_fscr() averages only channels not already masked
+    detect_fscr ();
+
+  count_zapped ();
+
+  if (debugd < 1)
+    debugd++;
+}
+
+/*
+ * Use the tscrunched SK statistic from the SKFB to detect RFI on each channel
+ */
+void dsp::SpectralKurtosis::detect_tscr ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::detect_tscr(" << npart << ")" << endl;
+
+  const float * indat    = estimates_tscr->get_dattfp();
+  unsigned char * outdat = 0;
+  unsigned zap_chan;
+  float V;
+
+  if (npart && (M_tscr != M * npart)) // recompute the limits only when the integration length changes
+  {
+    M_tscr = (float) (M * npart);
+
+    if (verbose)
+      cerr << "dsp::SpectralKurtosis::detect_tscr SKlimits(" << M_tscr << ", " << std_devs << ")" << endl;
+
+    dsp::SKLimits limits(M_tscr, std_devs);
+    limits.calc_limits();
+
+    thresholds_tscr[0] = (float) limits.get_lower_threshold();
+    thresholds_tscr[1] = (float) limits.get_upper_threshold();
+
+    if (verbose)
+      cerr << "dsp::SpectralKurtosis::detect_tscr M=" << M_tscr << " std_devs="
+           << std_devs  << " [" << thresholds_tscr[0] << " - " << thresholds_tscr[1]
+           << "]" << endl;
+  }
+
+  if (engine)
+  {
+    engine->detect_tscr (estimates, estimates_tscr, zapmask, thresholds_tscr[1], thresholds_tscr[0]);
+    return;
+  }
+
+  for (uint64_t ichan=channels[0]; ichan < channels[1]; ichan++)
+  {
+    zap_chan = 0;
+    for (unsigned ipol=0; ipol < npol; ipol++) // a channel is zapped if any polarisation is out of range
+    {
+      V = indat[ichan*npol + ipol];
+      if (V > thresholds_tscr[1] || V < thresholds_tscr[0])
+        zap_chan = 1;
+    }
+
+    if (zap_chan)
+    {
+      if (verbose) // V holds the value of the last polarisation examined, not necessarily the offending one
+        cerr << "dsp::SpectralKurtosis::detect_tscr zap V=" << V << ", " 
+             << "ichan=" << ichan << endl;
+      outdat = zapmask->get_datptr();
+      for (unsigned ipart=0; ipart < npart; ipart++) // flag this channel in every mask row
+      {
+        outdat[ichan] = 1;
+        zap_counts[ZAP_TSCR]++;
+        outdat += nchan;
+      }
+    }
+  }
+}
+
+void dsp::SpectralKurtosis::detect_skfb ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::detect_skfb(" << npart << ")" << endl;
+
+  if (engine)
+  {
+    engine->detect_ft (estimates, zapmask, thresholds[1], thresholds[0]);
+    return;
+  }
+
+  const float * indat    = estimates->get_dattfp();
+  unsigned char * outdat = zapmask->get_datptr();
+  float V = 0;
+  char zap;
+
+  // compare SK estimator for each pol to expected values
+  for (uint64_t ipart=0; ipart < npart; ipart++)
+  {
+    // for each channel and pol in the SKFB
+    for (unsigned ichan=0; ichan < nchan; ichan++)
+    {
+      zap = 0;
+      for (unsigned ipol=0; ipol < npol; ipol++)
+      {
+        V = indat[npol*ichan + ipol];
+        if (V > thresholds[1] || V < thresholds[0])
+        {
+          zap = 1;
+        }
+      }
+      if (zap)
+      {
+        outdat[ichan] = 1;
+
+        // only count skfb zapped channels in the in-band region [channels[0], channels[1]), as the other loops do
+        if (ichan >= channels[0] && ichan < channels[1])
+          zap_counts[ZAP_SKFB]++;
+      }
+    }
+
+    indat += nchan * npol; // advance to the next row of estimates
+    outdat += nchan;       // advance to the next row of the mask
+  }
+}
+
+void dsp::SpectralKurtosis::reset_mask ()
+{
+  if (engine)
+  {
+    engine->reset_mask (zapmask);
+    return;
+  }
+
+  // CPU path: clear all npart rows of nchan mask bytes (0 means no zapping)
+  unsigned char * row = zapmask->get_datptr();
+  for (uint64_t ipart=0; ipart < npart; ipart++)
+  {
+    for (unsigned ichan=0; ichan < nchan; ichan++)
+      row[ichan] = 0;
+
+    row += nchan;
+  }
+}
+
+void dsp::SpectralKurtosis::count_zapped ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::count_zapped hits=" << unfiltered_hits << endl;
+
+  int zapped = 0; // NOTE(review): never read; shadowed by the local of the same name in the engine branch
+
+  const float * indat;
+  unsigned char * outdat;
+
+  if (engine)
+  {
+    int zapped = engine->count_mask (zapmask);
+    indat = engine->get_estimates (estimates);
+    outdat = engine->get_zapmask(zapmask);
+    zap_counts[ZAP_ALL] += zapped; // NOTE(review): the loop below also increments ZAP_ALL per flagged entry; confirm this is not double counting
+  }
+  else
+  {
+    indat    = estimates->get_dattfp();
+    outdat = zapmask->get_datptr();
+  }
+
+  assert (npart == estimates->get_ndat());
+  if (unfiltered_hits == 0) // first call: size and zero the running sums
+  {
+    filtered_sum.resize (npol * nchan);
+    std::fill (filtered_sum.begin(), filtered_sum.end(), 0);
+
+    filtered_hits.resize (nchan);
+    std::fill (filtered_hits.begin(), filtered_hits.end(), 0);
+
+    unfiltered_sum.resize (npol * nchan);
+    std::fill (unfiltered_sum.begin(), unfiltered_sum.end(), 0);
+  }
+
+  for (uint64_t ipart=0; ipart < npart; ipart++)
+  {
+    unfiltered_hits ++;
+
+    for (unsigned ichan=channels[0]; ichan < channels[1]; ichan++)
+    {
+      uint64_t index = (ipart*nchan + ichan) * npol; // position of (ipart, ichan, pol 0) in the estimates
+      unsigned outdex = ichan * npol;                // position of (ichan, pol 0) in the running sums
+
+      unfiltered_sum[outdex] += indat[index];
+      if (npol == 2)
+        unfiltered_sum[outdex+1] += indat[index+1];
+  
+      if (outdat[(ipart*nchan) + ichan] == 1) // flagged entries count as zapped and are left out of the filtered sums
+      {
+        zap_counts[ZAP_ALL] ++;
+        continue;
+      }
+
+      filtered_sum[outdex] += indat[index];
+      if (npol == 2)
+        filtered_sum[outdex+1] += indat[index+1];
+
+      filtered_hits[ichan] ++;
+    }
+  }
+}
+
+void dsp::SpectralKurtosis::detect_fscr ()
+{
+  if (verbose)
+    cerr << "dsp::SpectralKurtosis::detect_fscr()" << endl;
+
+  float _M = (float) M;
+  float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3)); // presumably the variance of the SK estimator for M samples; TODO confirm
+
+  if (engine)
+  {
+    float one_sigma_idat   = sqrt(mu2 / (float) nchan); // engine path divides by nchan, not by the count of unmasked channels
+    const float upper = 1 + ((1+std_devs) * one_sigma_idat);
+    const float lower = 1 - ((1+std_devs) * one_sigma_idat);
+    engine->detect_fscr (estimates, zapmask, lower, upper, channels[0], channels[1]);
+    return;
+  }
+
+  const uint64_t ndat  = estimates->get_ndat();
+
+  const float * indat  = estimates->get_dattfp();
+  unsigned char * outdat = zapmask->get_datptr();
+
+  float sk_avg;
+  unsigned sk_avg_cnt = 0;
+  
+  unsigned zap_ipart;
+  uint64_t nzap = 0; // only referenced by the commented-out diagnostic at the end
+
+  // foreach SK integration
+  for (uint64_t ipart=0; ipart < npart; ipart++)
+  {
+    zap_ipart = 0;
+    for (unsigned ipol=0; ipol < npol; ipol++)
+    {
+      sk_avg = 0;
+      sk_avg_cnt = 0;
+
+      for (unsigned ichan=channels[0]; ichan < channels[1]; ichan++)
+      {
+        if (outdat[ichan] == 0) // average only channels not already masked in this row
+        {
+          sk_avg += indat[ichan*npol + ipol];
+          sk_avg_cnt++;
+        }
+      }
+
+      if (sk_avg_cnt > 0)
+      {
+        sk_avg /= (float) sk_avg_cnt;
+
+        float one_sigma_idat = sqrt(mu2 / (float) sk_avg_cnt);
+        float avg_upper_thresh = 1 + ((1+std_devs) * one_sigma_idat);
+        float avg_lower_thresh = 1 - ((1+std_devs) * one_sigma_idat);
+        if ((sk_avg > avg_upper_thresh) || (sk_avg < avg_lower_thresh))
+        {
+          if (verbose)
+            cerr << "Zapping ipart=" << ipart << " ipol=" << ipol << " sk_avg=" << sk_avg
+                 << " [" << avg_lower_thresh << " - " << avg_upper_thresh
+                 << "] cnt=" << sk_avg_cnt << endl;
+          zap_ipart = 1;
+        }
+      }
+    }
+
+    if (zap_ipart) // either polarisation out of range masks all nchan channels of this row
+    {
+      for (unsigned ichan=0; ichan<nchan; ichan++)
+      {
+        outdat[ichan] = 1;
+      }
+      zap_counts[ZAP_FSCR] += nchan;
+      nzap += nchan;
+    }
+
+    indat += nchan * npol;
+    outdat += nchan;
+  }
+  //cerr << "dsp::SpectralKurtosis::detect_fscr ZAP=" << nzap << endl;
+}
+
+
+//! Apply the zap mask: copy input to output, zeroing every M-sample block whose mask entry is set
+void dsp::SpectralKurtosis::mask ()
+{
+  // indicate the output timeseries contains zeroed data
+  output->set_zeroed_data (true);
+
+  // resize the output to ensure the hits array is reallocated
+  if (engine)
+  {
+    if (verbose)
+      cerr << "dsp::SpectralKurtosis::transformation output->resize(" << output->get_ndat() << ")" << endl;
+    output->resize (output->get_ndat());
+  }
+
+  // get base pointer to mask bitseries
+  unsigned char * mask = zapmask->get_datptr ();
+
+  if (engine)
+  {
+    if (verbose) // NOTE(review): the message below says engine->setup although engine->mask is what gets called
+      cerr << "dsp::SpectralKurtosis::transformation engine->setup(" << nchan << ")" << endl;
+    engine->mask (zapmask, input, output, M);
+  }
+  else
+  {
+    // mask is a TFP ordered bit series, output is FPT order Timeseries
+    const unsigned nfloat = M * ndim;      
+    for (unsigned ichan=0; ichan < nchan; ichan++)
+    {
+      for (unsigned ipol=0; ipol < npol; ipol++)
+      {
+        const float * indat  = input->get_datptr(ichan, ipol); // NOTE(review): per-(chan,pol) pointers presumably require FPT ordered input
+        float * outdat = output->get_datptr(ichan, ipol);
+        for (uint64_t ipart=0; ipart < npart; ipart++)
+        {
+          if (mask[ipart*nchan+ichan]) // one mask byte per (ipart, ichan), shared by all polarisations
+          {
+            for (unsigned j=0; j<nfloat; j++)
+              outdat[j] = 0;
+          }
+          else
+          {
+            for (unsigned j=0; j<nfloat; j++)
+              outdat[j] = indat[j];
+          }
+
+          indat += nfloat;
+          outdat += nfloat;
+        }
+      }
+    }
+  }
+
+  if (debugd < 1)
+    debugd++;
+}
+
+//! Forward estimates, output and M to the engine's insertsk(); does nothing when no engine is set
+void dsp::SpectralKurtosis::insertsk ()
+{
+  if (engine)
+    engine->insertsk (estimates, output, M);
+}
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/SpectralKurtosisCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosisCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/SpectralKurtosisCUDA.cu	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosisCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,104 @@
+//-*-C++-*-
+
+/***************************************************************************
+ *
+ *   Copyright (C) 2016 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/SpectralKurtosisCUDA.h"
+
+using namespace std;
+
+CUDA::SpectralKurtosisEngine::SpectralKurtosisEngine (dsp::Memory * memory)
+{
+  work_buffer_size = 0;
+  work_buffer = 0;
+  
+  device_memory = dynamic_cast<CUDA::DeviceMemory*>(memory); // yields 0 when memory is not a CUDA::DeviceMemory
+  stream = device_memory->get_stream (); // NOTE(review): device_memory is dereferenced without a null check
+
+  // sub-engines, each constructed from the same memory manager
+  computer = new CUDA::SKComputerEngine (memory);
+  detector = new CUDA::SKDetectorEngine (memory);
+  masker   = new CUDA::SKMaskerEngine (memory);
+}
+
+void CUDA::SpectralKurtosisEngine::setup ()
+{
+  if (dsp::Operation::verbose)
+    cerr << "CUDA::SpectralKurtosisEngine::setup ()" << endl;
+
+  // determine GPU capabilities (the return codes of the CUDA calls below are not checked)
+  int device = 0;
+  cudaGetDevice(&device);
+  struct cudaDeviceProp device_properties;
+  cudaGetDeviceProperties (&device_properties, device);
+  max_threads_per_block = device_properties.maxThreadsPerBlock;
+
+  computer->setup (); // each sub-engine performs its own setup
+  detector->setup ();
+  masker->setup ();
+}
+
+void CUDA::SpectralKurtosisEngine::compute ( const dsp::TimeSeries* input,
+           dsp::TimeSeries* output, dsp::TimeSeries *output_tscr, unsigned tscrunch)
+{
+  computer->compute (input, output, output_tscr, tscrunch); // forwarded unchanged to the SKComputerEngine
+}
+
+void CUDA::SpectralKurtosisEngine::detect_ft (const dsp::TimeSeries* input,
+      dsp::BitSeries* output, float upper_thresh, float lower_thresh)
+{
+  detector->detect_ft (input, output, upper_thresh, lower_thresh); // forwarded unchanged to the SKDetectorEngine
+}
+
+void CUDA::SpectralKurtosisEngine::detect_fscr (const dsp::TimeSeries* input, 
+                                                dsp::BitSeries* output, 
+                                                const float lower, const float upper,
+                                                unsigned schan, unsigned echan)
+
+{
+  detector->detect_fscr(input, output, upper, lower, schan, echan); // NOTE(review): received as (lower, upper) but forwarded as (upper, lower); confirm against the detector's signature
+}
+
+void CUDA::SpectralKurtosisEngine::detect_tscr (const dsp::TimeSeries* input,
+      const dsp::TimeSeries* input_tscr, dsp::BitSeries* output,
+      float upper_thresh, float lower_thresh)
+{
+  detector->detect_tscr( input, input_tscr, output, upper_thresh, lower_thresh); // forwarded unchanged to the SKDetectorEngine
+}
+
+void CUDA::SpectralKurtosisEngine::reset_mask (dsp::BitSeries* output)
+{
+  detector->reset_mask(output); // forwarded unchanged to the SKDetectorEngine
+}
+
+int CUDA::SpectralKurtosisEngine::count_mask (const dsp::BitSeries* output)
+{
+  // hand the value reported by the detector sub-engine straight back to the caller
+  return detector->count_mask (output);
+}
+
+float * CUDA::SpectralKurtosisEngine::get_estimates (const dsp::TimeSeries* estimates_device)
+{ 
+  return detector->get_estimates (estimates_device); // forwarded unchanged to the SKDetectorEngine
+} 
+
+unsigned char * CUDA::SpectralKurtosisEngine::get_zapmask (const dsp::BitSeries* zapmask_device)
+{ 
+  return detector->get_zapmask (zapmask_device); // forwarded unchanged to the SKDetectorEngine
+} 
+
+void CUDA::SpectralKurtosisEngine::mask (dsp::BitSeries* mask, const dsp::TimeSeries * input,
+           dsp::TimeSeries * output, unsigned M)
+{
+  masker->perform (mask, input, output, M); // forwarded unchanged to SKMaskerEngine::perform
+}
+
+void CUDA::SpectralKurtosisEngine::insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* out, unsigned M)
+{
+  computer->insertsk (input, out, M); // forwarded unchanged to the SKComputerEngine
+}
+
diff -Nru bl-dspsr-0+git20160405/Signal/General/stokes_detect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/stokes_detect.h
--- bl-dspsr-0+git20160405/Signal/General/stokes_detect.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/stokes_detect.h	2018-03-12 23:02:35.000000000 +0000
@@ -4,10 +4,7 @@
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
-/* $Source: /cvsroot/dspsr/dspsr/Signal/General/stokes_detect.h,v $
-   $Revision: 1.1 $
-   $Date: 2006/10/15 19:09:24 $
-   $Author: straten $ */
+// dspsr/Signal/General/stokes_detect.h
 
 #ifndef __stokes_detect_h
 #define __stokes_detect_h
diff -Nru bl-dspsr-0+git20160405/Signal/General/TransferCUDA.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/TransferCUDA.C
--- bl-dspsr-0+git20160405/Signal/General/TransferCUDA.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/TransferCUDA.C	2018-03-12 23:02:35.000000000 +0000
@@ -34,9 +34,14 @@
   {
     cerr << "dsp::TransferCUDA::transformation input ndat="
          << input->get_ndat() << " ndim=" << input->get_ndim();
-    if (input->get_npol() > 1)
-      cerr << " span=" << input->get_datptr (0,1) - input->get_datptr(0,0);
-    cerr << " offset=" << input->get_datptr(0,0) - (float*)input->internal_get_buffer() << endl;
+    if (input->get_order() == TimeSeries::OrderFPT)
+    {
+      if (input->get_npol() > 1)
+        cerr << " span=" << input->get_datptr (0,1) - input->get_datptr(0,0);
+      cerr << " offset=" << input->get_datptr(0,0) - (float*)input->internal_get_buffer() << endl;
+    }
+    else
+      cerr << endl;
   }
 
   cudaError error;
@@ -58,10 +63,14 @@
   {
     cerr << "dsp::TransferCUDA::transformation output ndat=" 
        << output->get_ndat() << " ndim=" << output->get_ndim();
-    if (output->get_npol() > 1)
-      cerr << " span=" << output->get_datptr (0, 1) - output->get_datptr(0,0);
-
-    cerr << " offset=" << output->get_datptr(0,0) - (float*)output->internal_get_buffer() << endl;
+    if (output->get_order() == TimeSeries::OrderFPT)
+    {
+      if (output->get_npol() > 1)
+        cerr << " span=" << output->get_datptr (0, 1) - output->get_datptr(0,0);
+      cerr << " offset=" << output->get_datptr(0,0) - (float*)output->internal_get_buffer() << endl;
+    }
+    else
+      cerr << endl;
   }
 }
 
diff -Nru bl-dspsr-0+git20160405/Signal/General/TScrunchCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/TScrunchCUDA.cu
--- bl-dspsr-0+git20160405/Signal/General/TScrunchCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/TScrunchCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -9,6 +9,7 @@
 
 #include "dsp/TScrunchCUDA.h"
 
+#include <cuComplex.h>
 #include "Error.h"
 #include "debug.h"
 
@@ -50,6 +51,70 @@
   *out_base = result;
 }
 
+// Sum each run of sfactor consecutive complex samples into one output sample.
+// blockIdx.x/threadIdx.x index output samples, blockIdx.y is the channel index
+// and blockIdx.z is the polarization index (the thread block is 1-dimensional).
+__global__ void fpt_ndim2_ndim2_shm (float2* in_base, float2* out_base,
+    unsigned in_Fstride, unsigned in_Pstride,
+    unsigned out_Fstride, unsigned out_Pstride,
+    unsigned ndat_out, unsigned sfactor)
+{
+  // shared memory for coalesced reads; one element per thread of the block
+  extern __shared__ cuFloatComplex shm[];
+
+  unsigned ndat_in = ndat_out * sfactor;
+
+  // index of the first input sample consumed by this block
+  const unsigned block_offset = blockIdx.x * blockDim.x * sfactor;
+
+  // absolute input sample that this thread copies into shm on each pass
+  unsigned isamp_thr = block_offset + threadIdx.x;
+
+  // select channel and polarization only: isamp_thr already contains
+  // block_offset, so adding it to the base pointer would count it twice
+  in_base += (blockIdx.y*in_Fstride) + (blockIdx.z*in_Pstride);
+
+  cuFloatComplex result = make_cuComplex(0,0);
+  // block-relative range of input samples summed by this thread
+  unsigned isamp = threadIdx.x * sfactor;
+  unsigned esamp = isamp + sfactor;
+
+  // block-relative range of input samples currently held in shm
+  unsigned shm_start = 0;
+  unsigned shm_end = blockDim.x;
+
+  for (unsigned j=0; j<sfactor; j++)
+  {
+    // whole block coalesces a read into shm, never reading past ndat_in
+    if (isamp_thr < ndat_in)
+      shm[threadIdx.x] = in_base[isamp_thr];
+
+    __syncthreads();
+
+    // add every sample of this thread's range that is now located in shm
+    while (isamp >= shm_start && isamp < shm_end && isamp < esamp)
+    {
+      result = cuCaddf (result, shm[isamp-shm_start]);
+      isamp++;
+    }
+
+    // every thread must finish reading before the next pass overwrites shm
+    __syncthreads();
+
+    isamp_thr += blockDim.x;
+    shm_start += blockDim.x;
+    shm_end   += blockDim.x;
+  }
+
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= ndat_out)
+    return;
+
+  out_base += (blockIdx.y*out_Fstride) + (blockIdx.z*out_Pstride) + i;
+  *out_base = result;
+}
+
+
 void CUDA::TScrunchEngine::fpt_tscrunch(const dsp::TimeSeries *in,
     dsp::TimeSeries* out, unsigned sfactor)
 {
@@ -85,21 +150,41 @@
     throw Error (InvalidParam, "CUDA::TScrunchEngine::fpt_scrunch",
 		 "only out-of-place transformation implemented");
 
+  // no output samples: nothing to do, and threads.x below would become zero
+  if (in->get_ndat() == 0 || out->get_ndat() == 0)
+    return;
   uint64_t in_Fstride = (in->get_datptr(1)-in->get_datptr(0)) / 2;
   uint64_t in_Pstride = (in->get_datptr(0,1)-in->get_datptr(0,0)) / 2;
   uint64_t out_Fstride = (out->get_datptr(1)-out->get_datptr(0)) / 2;
   uint64_t out_Pstride = (out->get_datptr(0,1)-out->get_datptr(0,0)) / 2;
   // use a 2-dimensional thread block to eliminate 3rd grid dimension
-  dim3 threads (128, in->get_npol());
-  dim3 blocks (out->get_ndat()/threads.x, in->get_nchan() );
 
+#define USE_SHARED
+#ifdef USE_SHARED
+  // set number of threads to be number of output samples, cap at 512
+  dim3 threads (512);
+  if (out->get_ndat() < 512)
+    threads.x = out->get_ndat();
+  dim3 blocks (out->get_ndat()/threads.x, in->get_nchan(), in->get_npol());
   if (out->get_ndat() % threads.x)
     blocks.x ++;
 
+  size_t shm_bytes = threads.x * sizeof(float2);
+  fpt_ndim2_ndim2_shm<<<blocks,threads,shm_bytes,stream>>> (
+    (float2*)(in->get_datptr(0)), (float2*)(out->get_datptr(0)), 
+    in_Fstride, in_Pstride, out_Fstride, out_Pstride, 
+    out->get_ndat(), sfactor);
+#else
+  dim3 threads (128, in->get_npol());
+  dim3 blocks (out->get_ndat()/threads.x, in->get_nchan(), in->get_npol());
+  if (out->get_ndat() % threads.x)
+    blocks.x ++;
   fpt_ndim2_ndim2<<<blocks,threads,0,stream>>> (
     (float2*)(in->get_datptr(0)), (float2*)(out->get_datptr(0)), 
     in_Fstride, in_Pstride, out_Fstride, out_Pstride, 
     out->get_ndat(), sfactor);
+#endif
+
 
   if (dsp::Operation::record_time || dsp::Operation::verbose)
     check_error ("CUDA::TScrunchEngine::fpt_scrunch");
diff -Nru bl-dspsr-0+git20160405/Signal/General/UnderSamplingBench.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/UnderSamplingBench.C
--- bl-dspsr-0+git20160405/Signal/General/UnderSamplingBench.C	1970-01-01 00:00:00.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/UnderSamplingBench.C	2018-03-12 23:02:35.000000000 +0000
@@ -0,0 +1,79 @@
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 by Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/UnderSamplingBench.h"
+#include "debug.h"
+
+#include <fstream>
+#include <math.h>
+
+using namespace std;
+
+bool dsp::UnderSamplingBench::verbose = false;
+//! Construct a benchmark for the named library, with the channel count at zero
+dsp::UnderSamplingBench::UnderSamplingBench (const std::string& name)
+{
+  library = name;  // load() uses this to build the benchmark file name
+  nchan = 0;       // set_nchan() and the entry filter in load() compare against this
+}
+
+//! Set the number of channels, calling reset() first if the value changes
+void dsp::UnderSamplingBench::set_nchan (unsigned _chan)
+{
+  if (_chan != nchan)
+    reset ();
+
+  nchan = _chan;
+}
+//! Load the entries for this library from its benchmark file under path
+void dsp::UnderSamplingBench::load () const
+{
+  max_nfft = 0;  // raised by load(library, filename) as matching entries are added
+  // NOTE(review): the file name uses the "filterbank_bench_" prefix - confirm this is intended
+  string filename = path + "/filterbank_bench_" + library + ".dat";
+
+  if (verbose)
+    cerr << "dsp::UnderSamplingBench::load filename=" << filename << endl;
+
+  load (library, filename);
+  loaded = true;
+}
+
+//! Append to entries every record of filename whose channel count equals nchan.
+//! Fields are read as: _chan nfft cost log2nchan log2nfft mflops (last three unused)
+void dsp::UnderSamplingBench::load (const std::string& library,
+                                    const std::string& filename) const
+{
+  ifstream in (filename.c_str());
+  if (!in)
+    throw Error (FailedSys, "dsp::UnderSamplingBench::load",
+                 "std::ifstream (" + filename + ")");
+
+  Entry entry;
+  double log2nchan, log2nfft, mflops;
+  unsigned _chan;
+
+  // Test the extraction itself rather than eof(): a malformed record sets
+  // failbit without eofbit (so an eof() loop never terminates), and a final
+  // record without a trailing newline sets eofbit although it is still valid.
+  while (in >> _chan >> entry.nfft >> entry.cost >> log2nchan >> log2nfft >> mflops)
+  {
+    entry.library = library;
+
+    DEBUG(library << " " << _chan << " " << entry.nfft << " " << entry.cost);
+
+    if (_chan != nchan)
+      continue;
+
+    DEBUG("ADD nchan=" << nchan << " nfft=" << entry.nfft);
+    entries.push_back (entry);
+
+    if (entry.nfft > max_nfft)
+      max_nfft = entry.nfft;
+  }
+}
+
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/ArchiverExtensions.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/ArchiverExtensions.C
--- bl-dspsr-0+git20160405/Signal/Pulsar/ArchiverExtensions.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/ArchiverExtensions.C	2018-03-12 23:02:35.000000000 +0000
@@ -16,7 +16,7 @@
 #include "dsp/Convolution.h"
 #include "dsp/Dedispersion.h"
 #include "dsp/TScrunch.h"
-#include "dsp/SKDetector.h"
+#include "dsp/SpectralKurtosis.h"
 #include "dsp/OperationThread.h"
 
 #include "Pulsar/dspReduction.h"
@@ -195,10 +195,10 @@
         set_coherent_dedispersion (input->get_state(), response);
 
         if (input->get_state() == Signal::Nyquist)
-	{
-	  nsamp_fft *= 2;
-	  nsamp_overlap_pos *= 2;
-	  nsamp_overlap_neg *= 2;
+        {
+          nsamp_fft *= 2;
+          nsamp_overlap_pos *= 2;
+          nsamp_overlap_neg *= 2;
         }
 
         dspR->set_nsamp_fft ( nsamp_fft );
@@ -208,7 +208,7 @@
 
       // save it for the Passband Extension
       if ( convolution->has_passband() )
-	passband = convolution->get_passband();
+        passband = convolution->get_passband();
     }
 
     // ////////////////////////////////////////////////////////////////////
@@ -224,52 +224,52 @@
     //
     // Spectral Kurtosis RFI mitigation extension
     //
-    SKDetector* skdetect = dynamic_cast<SKDetector*>( operation );
+    SpectralKurtosis* skestimator = dynamic_cast<SpectralKurtosis*>( operation );
 
-    if (skdetect)
+    if (skestimator)
     {
       if (verbose > 2)
-        cerr << "dsp::Archiver::set SKDetector in use" << endl;
+        cerr << "dsp::Archiver::set SpectralKurtosis in use" << endl;
 
       unsigned nsubint = archive->get_nsubint();
       Integration* subint = archive->get_Integration(nsubint - 1);
 
-      SpectralKurtosis* ext = subint -> getadd<SpectralKurtosis>();
+      Pulsar::SpectralKurtosis* ext = subint -> getadd<Pulsar::SpectralKurtosis>();
 
-      unsigned nchan = skdetect->get_input()->get_nchan();
+      unsigned nchan = skestimator->get_input()->get_nchan();
       ext->set_nchan( nchan );
 
-      unsigned npol = skdetect->get_input()->get_npol();
+      unsigned npol = skestimator->get_input()->get_npol();
       ext->set_npol( npol );
 
-      ext->set_M( skdetect->get_M() );
-      ext->set_excision_threshold( skdetect->get_excision_threshold() );
+      ext->set_M( skestimator->get_M() );
+      ext->set_excision_threshold( skestimator->get_excision_threshold() );
 
       vector<float> data;
-      skdetect->get_filtered_sum (data);
+      skestimator->get_filtered_sum (data);
       for (unsigned ichan = 0; ichan < nchan; ichan++)
-	for (unsigned ipol = 0; ipol < npol; ipol++)
-	  ext->set_filtered_sum (ichan, ipol, data[ichan*npol + ipol]);
+        for (unsigned ipol = 0; ipol < npol; ipol++)
+          ext->set_filtered_sum (ichan, ipol, data[ichan*npol + ipol]);
 
       vector<uint64_t> hits;
-      skdetect->get_filtered_hits (hits);
+      skestimator->get_filtered_hits (hits);
       for (unsigned ichan = 0; ichan < nchan; ichan++)
-	ext->set_filtered_hits (ichan, hits[ichan]);
+        ext->set_filtered_hits (ichan, hits[ichan]);
 
-      skdetect->get_unfiltered_sum (data);
+      skestimator->get_unfiltered_sum (data);
       for (unsigned ichan = 0; ichan < nchan; ichan++)
-	for (unsigned ipol = 0; ipol < npol; ipol++)
-	  ext->set_unfiltered_sum (ichan, ipol, data[ichan*npol + ipol]);
+        for (unsigned ipol = 0; ipol < npol; ipol++)
+          ext->set_unfiltered_sum (ichan, ipol, data[ichan*npol + ipol]);
 
-      ext->set_unfiltered_hits( skdetect->get_unfiltered_hits() );
+      ext->set_unfiltered_hits( skestimator->get_unfiltered_hits() );
 
-      skdetect->reset_count();
+      skestimator->reset_count();
     }
   }
 
 
 void dsp::Archiver::set_coherent_dedispersion (Signal::State state,
-					       const Response* response)
+                                               const Response* response)
 {
   if (verbose > 2)
     cerr << "dsp::Archiver::set_coherent_dedispersion" << endl;
@@ -320,7 +320,7 @@
     for (unsigned ichan_output=0; ichan_output<nchan_output; ichan_output++)
     {
       CoherentDedispersion::OutputChannel& output 
-	= input.get_output( ichan_output );
+        = input.get_output( ichan_output );
 
       output.set_centre_frequency( dedisp->frequency_output[ichan_total] );
       output.set_bandwidth( dedisp->bandwidth_output[ichan_total] );
@@ -343,7 +343,7 @@
   {
     if (verbose > 2)
       cerr << "dsp::Archiver::set Pulsar::TwoBitStats no ExcisionUnpacker"
-	   << endl;
+           << endl;
     return;
   }
 
@@ -460,7 +460,7 @@
   
   if (passband->get_ndim() != 1)
     throw Error (InvalidState, "dsp::Archiver::set_passband",
-		 "Passband Response ndim != 1");
+                 "Passband Response ndim != 1");
   
   for (unsigned ipol=0; ipol<npol; ipol++)
     for (unsigned iband=0; iband<nband; iband++)
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/Archiver.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/Archiver.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/Archiver.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/Archiver.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/Archiver.h,v $
-   $Revision: 1.39 $
-   $Date: 2012/02/24 20:05:54 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/Archiver.h
 
 
 #ifndef __Archiver_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/FoldCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/FoldCUDA.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/FoldCUDA.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/FoldCUDA.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/FoldCUDA.h,v $
-   $Revision: 1.11 $
-   $Date: 2011/08/04 21:06:43 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/FoldCUDA.h
 
 #ifndef __baseband_cuda_Fold_h
 #define __baseband_cuda_Fold_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/Fold.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/Fold.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/Fold.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/Fold.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/Fold.h,v $
-   $Revision: 1.70 $
-   $Date: 2011/11/14 19:43:34 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/Fold.h
 
 #ifndef __baseband_dsp_Fold_h
 #define __baseband_dsp_Fold_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFold1.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFold1.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFold1.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFold1.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/LoadToFold1.h,v $
-   $Revision: 1.32 $
-   $Date: 2011/09/20 20:49:14 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/LoadToFold1.h
 
 #ifndef __dspsr_LoadToFold_h
 #define __dspsr_LoadToFold_h
@@ -33,6 +30,7 @@
 
   class OperationThread;
   class SKFilterbank;
+  class SpectralKurtosis;
   class Resize;
   class SampleDelay;
   class PhaseLockedFilterbank;
@@ -115,7 +113,10 @@
     Reference::To<Response> passband;
 
     //! Optional SK filterbank
-    Reference::To<SKFilterbank> skfilterbank;
+    // Reference::To<SKFilterbank> skfilterbank;
+
+    //! Optional Spectral Kurtosis (for convolution)
+    Reference::To<SpectralKurtosis> skestimator;
 
     //! Optional SK Resizer 
     Reference::To<Resize> skresize;
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldConfig.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldConfig.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldConfig.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/LoadToFoldConfig.h,v $
-   $Revision: 1.43 $
-   $Date: 2011/09/20 21:25:28 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/LoadToFoldConfig.h
 
 #ifndef __baseband_dsp_LoadToFoldConfig_h
 #define __baseband_dsp_LoadToFoldConfig_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldN.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldN.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldN.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldN.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/LoadToFoldN.h,v $
-   $Revision: 1.14 $
-   $Date: 2011/09/09 02:38:14 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/LoadToFoldN.h
 
 #ifndef __baseband_dsp_LoadToFoldN_h
 #define __baseband_dsp_LoadToFoldN_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseLockedFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseLockedFilterbank.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseLockedFilterbank.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseLockedFilterbank.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseLockedFilterbank.h,v $
-   $Revision: 1.5 $
-   $Date: 2011/04/28 23:30:12 $
-   $Author: demorest $ */
+// dspsr/Signal/Pulsar/dsp/PhaseLockedFilterbank.h
 
 #ifndef __baseband_dsp_PhaseLockedFilterbank_h
 #define __baseband_dsp_PhaseLockedFilterbank_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeries.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeries.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeries.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeries.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseSeries.h,v $
-   $Revision: 1.41 $
-   $Date: 2011/08/04 21:07:02 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/PhaseSeries.h
 
 #ifndef __PhaseSeries_h
 #define __PhaseSeries_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeriesUnloader.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeriesUnloader.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeriesUnloader.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeriesUnloader.h	2018-03-12 23:02:35.000000000 +0000
@@ -7,10 +7,7 @@
  ***************************************************************************/
 //-*-C++-*-
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseSeriesUnloader.h,v $
-   $Revision: 1.24 $
-   $Date: 2011/08/31 20:46:04 $
-   $Author: demorest $ */
+// dspsr/Signal/Pulsar/dsp/PhaseSeriesUnloader.h
 
 #ifndef __PhaseSeriesUnloader_h
 #define __PhaseSeriesUnloader_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/SubFold.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/SubFold.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/SubFold.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/SubFold.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/SubFold.h,v $
-   $Revision: 1.20 $
-   $Date: 2010/11/13 01:42:50 $
-   $Author: demorest $ */
+// dspsr/Signal/Pulsar/dsp/SubFold.h
 
 #ifndef __SubFold_h
 #define __SubFold_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/TimeDivide.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/TimeDivide.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/TimeDivide.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/TimeDivide.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/TimeDivide.h,v $
-   $Revision: 1.17 $
-   $Date: 2009/06/17 10:32:32 $
-   $Author: straten $ */
+// dspsr/Signal/Pulsar/dsp/TimeDivide.h
 
 #ifndef __baseband_dsp_TimeDivide_h
 #define __baseband_dsp_TimeDivide_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/UnloaderShare.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/UnloaderShare.h
--- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/UnloaderShare.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/UnloaderShare.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/UnloaderShare.h,v $
-   $Revision: 1.24 $
-   $Date: 2010/11/16 01:43:21 $
-   $Author: demorest $ */
+// dspsr/Signal/Pulsar/dsp/UnloaderShare.h
 
 #ifndef __UnloaderShare_h
 #define __UnloaderShare_h
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dspsr.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dspsr.C
--- bl-dspsr-0+git20160405/Signal/Pulsar/dspsr.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dspsr.C	2018-03-12 23:02:35.000000000 +0000
@@ -25,12 +25,14 @@
 
 #include "load_factory.h"
 #include "dirutil.h"
+#include "strutil.h"
 
 #include <iostream>
-
+#include <sstream>     
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
+#include <stdio.h>
 
 using namespace std;
 
@@ -73,10 +75,20 @@
 
   Reference::To<dsp::Pipeline> engine;
 
-  if (config->get_total_nthread() > 1)
+  if (config->get_total_nthread() > 1){
+
+    if(dsp::Observation::verbose)
+      cerr << "using dsp::LoadToFoldN" << endl;
+
     engine = new dsp::LoadToFoldN (config);
-  else
+  }
+  else{
+
+    if(dsp::Observation::verbose)
+      cerr << "using dsp::LoadToFold" << endl;
+
     engine = new dsp::LoadToFold (config);
+  }
 
   bool time_prep = dsp::Operation::record_time || config->get_cuda_ndevice();
 
@@ -412,6 +424,10 @@
   arg = menu.add (predictor, 'P', "file");
   arg->set_help ("phase predictor used for folding");
 
+  string predictors_file;
+  arg = menu.add (predictors_file, 'w', "file");
+  arg->set_help ("phase predictors used for folding.");
+
   arg = menu.add (config->additional_pulsars, 'X', "name");
   arg->set_help ("additional pulsar to be folded");
 
@@ -577,6 +593,118 @@
       ( factory<Pulsar::Predictor> (predictor[i]) );
   }
 
+  // Load extra phase predictors from a table: the first line that is neither
+  // empty nor a comment lists the keys; each later data line supplies one value
+  // per key and is handed to the predictor factory as "key: value" lines.
+  if(!predictors_file.empty()) {
+
+    cerr << "dspsr: Loading phase models from " << predictors_file << endl;
+
+    vector<char> buffer (10240);
+    char* buf = &buffer[0];
+
+    FILE* fptr = fopen (predictors_file.c_str(), "r");
+    if (!fptr)
+      throw Error (FailedSys, "parse_options",
+                   "fopen (%s)", predictors_file.c_str());
+
+    string key_string;
+    // choose first non commented and non empty line and attempt to parse header.
+    while( fgets (buf, buffer.size(), fptr) ==buf ){
+
+      string temp  = buf;
+      temp = stringtok ( temp, "#\n", false);  // get rid of comments and empty lines
+
+      if(temp.empty())
+        continue;
+
+      key_string = temp;
+      break;
+    }
+    if(key_string.empty())
+      throw Error(InvalidState,"parse_options","Bad input file to -w flag.");
+
+    string delim = " \t\n";
+
+    vector<string> keys;
+    string key_next;
+    string key_rest;
+
+    cerr << " read header string: " << key_string << endl;
+
+    do {
+      string_split_on_any( key_string, key_next, key_rest, delim );
+
+      if(key_next.empty() && !key_rest.empty())
+        throw Error (InvalidState,"dspsr", "Key in candiate file was empty.");
+
+      // both parts empty: take the whole remaining string as the last key
+      if(key_next.empty() && key_rest.empty())
+        key_next = key_string;
+
+      cerr<< "Considering Key = '" << key_next << "'"<<endl;
+      keys.push_back(key_next);
+
+      key_string = key_rest;
+
+    }while(!key_rest.empty());
+
+    int nkeys = keys.size();
+    int nline = 1;
+
+    cerr << "loaded " << nkeys << " keys." << endl;
+
+    while ( fgets (buf, buffer.size(), fptr) == buf ) {
+
+      nline++;
+
+      string value_string = buf;
+      value_string = stringtok (value_string, "#\n", false);  // get rid of comments and empty lines
+
+      if(value_string.empty())
+        continue;
+
+      stringstream lines;
+      string value_next;
+      string value_rest;
+
+      for(int i=0; i< nkeys; i++ ) {
+        string_split_on_any( value_string, value_next, value_rest, delim );
+
+        if(value_next.empty() && !value_rest.empty()){
+          // compose the message in err so that the Error actually carries it
+          stringstream err;
+          err <<  "Value in candiate file was empty on line " << nline;
+          throw Error (InvalidState,"dspsr", err.str().c_str());
+        }
+
+        if(value_next.empty() && value_rest.empty())
+          value_next = value_string;
+
+        if(dsp::Observation::verbose)
+          cerr<< "Considering Key = '" << keys.at(i) << "' value='" << value_next << "'" <<endl;
+
+        lines << keys.at(i) << ": " << value_next << endl;
+        value_string = value_rest;
+      }
+
+      // keep a named copy: lines.str() returns a temporary, so a pointer taken
+      // from lines.str().c_str() dangles once that statement has finished
+      string line_text = lines.str();
+      cerr << line_text;
+
+#if HAVE_FMEMOPEN
+      FILE* virtual_ptr = fmemopen( &line_text[0], line_text.size(), "r" );
+      if (!virtual_ptr)
+        throw Error (FailedSys, "parse_options", "fmemopen");
+      config->predictors.push_back ( factory<Pulsar::Predictor> ( virtual_ptr ));
+#endif
+    }
+
+    fclose (fptr);
+  }
+
+
   for (unsigned i=0; i<jobs.size(); i++)
     separate (jobs[i], config->jobs, ",");
 
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/Fold.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Fold.C
--- bl-dspsr-0+git20160405/Signal/Pulsar/Fold.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Fold.C	2018-03-12 23:02:35.000000000 +0000
@@ -821,7 +821,7 @@
     {
     	if (verbose) {
     		cerr << "Fold::fold finishing fold w/ engine. zeroed_samples was true so correcting integration length from:" << result->integration_length
-    				<< " by:" << (engine->get_ndat_folded() / get_input()->get_rate()) <<endl;
+    				<< " by:" << (engine->get_ndat_folded() / get_input()->get_rate()) << endl;
     	}
       result->integration_length += engine->get_ndat_folded() / get_input()->get_rate();
     }
@@ -867,7 +867,7 @@
         } // for each idat
       } // for each pol
 
-      if (zeroed_samples && ichan < nchan-1)
+      if (zeroed_samples && ichan < nchan-1 && output->get_hits_nchan() > 1)
         hits += folding_nbin;
     } // for each chan 
   }
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/FoldCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/FoldCUDA.cu
--- bl-dspsr-0+git20160405/Signal/Pulsar/FoldCUDA.cu	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/FoldCUDA.cu	2018-03-12 23:02:35.000000000 +0000
@@ -7,21 +7,30 @@
  *
  ***************************************************************************/
 
-// #define _DEBUG 1
-
 #include "dsp/FoldCUDA.h"
 #include "dsp/MemoryCUDA.h"
 
+//#define _DEBUG
+
 #include "Error.h"
 #include "debug.h"
 
+#include <cuComplex.h>
 #include <memory>
 
+#ifdef __CUDA_ARCH__
+    #if (__CUDA_ARCH__ >= 300)
+        #define HAVE_SHFL
+    #else
+        #define NO_SHFL
+    #endif
+#endif
+
 using namespace std;
 
 CUDA::FoldEngine::FoldEngine (cudaStream_t _stream, bool _hits_on_gpu)
 {
-	use_set_bins = false;
+  use_set_bins = false;
   d_bin = 0;
   d_bin_size = 0;
 
@@ -104,11 +113,11 @@
 }
 
 uint64_t CUDA::FoldEngine::get_bin_hits (int ibin){
-	return 0; // Fix this
+  return 0; // Fix this
 }
 uint64_t CUDA::FoldEngine::set_bins (double phi, double phase_per_sample, uint64_t _ndat, uint64_t idat_start)
 {
-	return 0;
+  return 0;
 }
 dsp::PhaseSeries* CUDA::FoldEngine::get_profiles ()
 {
@@ -178,7 +187,7 @@
 
   if (stream)
     error = cudaMemcpyAsync (d_bin, binplan, mem_size,
-			     cudaMemcpyHostToDevice, stream);
+                             cudaMemcpyHostToDevice, stream);
   else
     error = cudaMemcpy (d_bin, binplan, mem_size, cudaMemcpyHostToDevice);
 
@@ -186,93 +195,384 @@
     throw Error (InvalidState, "CUDA::FoldEngine::set_binplan",
                  "cudaMemcpy%s %s", 
                  stream?"Async":"", cudaGetErrorString (error));
-
-//  cudaThreadSynchronize();
 }
 
-
-/* 
- * CUDA Folding Kernels
- *   ipol = threadIdx.y
- *   npol = blockDim.y
+/* All CUDA folding kernels utilise the dimensionality:
+ *   ipol = blockIdx.z
+ *   npol = gridDim.z
  *   ichan = blockIdx.y
  *   nchan = gridDim.y
- *   idim = threadIdx.z
  */
 
-__global__ void fold1bin (const float* in_base,
+// 2dim data: fold1bin2dim sums the samples of each binplan entry and atomically adds the sum to out_base
+__global__ void fold1bin2dim (const cuFloatComplex * in_base,
            unsigned in_span,
-           float* out_base,
+           cuFloatComplex * out_base,
+           unsigned out_span,
+           unsigned nbin,  // not referenced by this kernel
+           unsigned binplan_size,
+           const CUDA::bin* binplan)
+{
+  in_base  += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);  // span * (ichan * npol + ipol)
+  out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);  // same chan/pol offset for the output
+
+  for (unsigned ibin=threadIdx.x; ibin<binplan_size; ibin+=blockDim.x)  // thread t handles entries t, t+blockDim.x, ...
+  {
+    const cuFloatComplex * input = in_base + binplan[ibin].offset;  // first sample of this binplan entry
+    cuFloatComplex total = make_cuComplex (0,0);
+    for (unsigned i=0; i < binplan[ibin].hits; i++)  // sum the entry's hits consecutive samples
+      total = cuCaddf (total, input[i]);
+    const unsigned output_ibin = binplan[ibin].ibin;  // output phase bin for this entry
+    atomicAdd(&(out_base[output_ibin].x), total.x);  // atomic: presumably several entries can map to one output bin
+    atomicAdd(&(out_base[output_ibin].y), total.y);  // real and imaginary parts are accumulated separately
+  }
+}
+
+__global__ void fold1bin2dim_shared (const cuFloatComplex * in_base,
+           unsigned in_span,
+           cuFloatComplex * out_base,
+           unsigned out_span,
+           unsigned nbin,
+           unsigned binplan_size,
+           const CUDA::bin* binplan)
+{
+  // one shared memory bin for each output phase bin for this chanpol
+  extern __shared__ cuFloatComplex f1b2d_shared[];
+
+  // pointers for the current channel and polarisation
+  in_base  += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+
+  // coalesced read the existing phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+    f1b2d_shared[ibin] = out_base[ibin];
+
+  __syncthreads();
+
+  for (unsigned ibin=threadIdx.x; ibin<binplan_size; ibin+=blockDim.x)
+  {
+    // input pointer for this phase bin
+    const cuFloatComplex * input = in_base + binplan[ibin].offset;
+    cuFloatComplex total = make_cuComplex (0,0);
+    for (unsigned i=0; i < binplan[ibin].hits; i++)
+      total = cuCaddf (total, input[i]);
+    const unsigned output_ibin = binplan[ibin].ibin;
+    atomicAdd(&(f1b2d_shared[output_ibin].x), total.x);
+    atomicAdd(&(f1b2d_shared[output_ibin].y), total.y);
+  }
+
+  __syncthreads();
+
+  // coalesced write the new phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+    out_base[ibin] = f1b2d_shared[ibin];
+}
+
+// each warp will fold a single binplan bin
+__global__ void fold1bin2dim_warp (const float2* in_base,
+           unsigned in_span,
+           float2* out_base,
            unsigned out_span,
-           unsigned ndim,
            unsigned nbin,
            unsigned binplan_size,
            CUDA::bin* binplan)
 {
+  extern __shared__ cuFloatComplex warp_fold[];
 
-  unsigned ibin = blockIdx.x * blockDim.x + threadIdx.x;
+  const int warps_per_block = blockDim.x / 32;
+  const int warp_idx = threadIdx.x & 0x1F;      // % 32
+  const int warp_num = threadIdx.x / 32;
+  
+  // the ibin that threads in this warp will add up together
+  const int ibin = blockIdx.x * warps_per_block + warp_num;
 
-  if (ibin >= binplan_size)
-    return;
+  cuFloatComplex total = make_cuComplex (0,0);
 
-  unsigned output_ibin = binplan[ibin].ibin;
+  // only add up bins that we have
+  if (ibin < binplan_size)
+  {
+    in_base += in_span * (blockIdx.y * gridDim.z + blockIdx.z);
 
-  in_base  += in_span  * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z;
-  out_base += out_span * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z;
+    // start/end sample for this input bin
+    const int sbin = binplan[ibin].offset;
+    const int ebin = sbin + binplan[ibin].hits;
 
-  float total = 0;
+    // each thread of a warp will load samples for this ibin
+    for (int i=sbin+warp_idx; i<ebin; i+=32)
+    {
+      total = cuCaddf (total, in_base[i]);
+    }
+  }
+
+  // now add totals together
+#ifdef HAVE_SHFL
+  total.x += __shfl_down (total.x, 16);
+  total.x += __shfl_down (total.x, 8);
+  total.x += __shfl_down (total.x, 4);
+  total.x += __shfl_down (total.x, 2);
+  total.x += __shfl_down (total.x, 1);
+
+  total.y += __shfl_down (total.y, 16);
+  total.y += __shfl_down (total.y, 8);
+  total.y += __shfl_down (total.y, 4);
+  total.y += __shfl_down (total.y, 2);
+  total.y += __shfl_down (total.y, 1);
+
+  // copy to shm for warp 0 to write out to gmem
+  if (warp_idx == 0)
+    warp_fold[warp_num] = total; 
+  __syncthreads();
+
+  if (warp_num == 0)
+  {
+    out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+    const int ibin = blockIdx.x * warps_per_block + warp_idx;
+    if (ibin >= binplan_size)
+      return;
+    int output_ibin = binplan[ibin].ibin;
+    out_base[ output_ibin ] = cuCaddf (out_base[ output_ibin ], warp_fold[warp_idx]);
+  }
+#endif
+#ifdef NO_SHFL
+  int last_offset = 16;
+  warp_fold[threadIdx.x] = total;
+  __syncthreads();
+  for (int offset = last_offset; offset > 0;  offset >>= 1)
+  {
+    if (warp_idx < offset)
+      warp_fold[threadIdx.x] = cuCaddf(warp_fold[threadIdx.x], warp_fold[threadIdx.x + offset]);
+    __syncthreads();
+  }
 
-  for (; ibin < binplan_size; ibin += nbin)
+  if (warp_idx == 0)
   {
-    const float* input = in_base + binplan[ibin].offset * ndim;
+    if (ibin < binplan_size)
+    {
+      out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+      int output_ibin = binplan[ibin].ibin;
+      out_base[ output_ibin ] = cuCaddf (out_base[ output_ibin ], warp_fold[threadIdx.x]);
+    }
+  }
+#endif
+}
+
 
+// 1dim kernels 
+__global__ void fold1bin1dim_shared (const float* in_base, unsigned in_span,
+           float* out_base, unsigned out_span, 
+           unsigned nbin, unsigned binplan_size,
+           CUDA::bin* binplan)
+{
+  // one shared memory bin for each output phase bin for this chanpol
+  extern __shared__ float f1b1d_shared[];
+
+  // pointers for the current channel and polarisation
+  in_base  += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+
+  // coalesced read the existing phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+    f1b1d_shared[ibin] = out_base[ibin];
+
+  __syncthreads();
+
+  for (unsigned ibin=threadIdx.x; ibin<binplan_size; ibin+=blockDim.x)
+  {
+    // input pointer for this phase bin
+    const float * input = in_base + binplan[ibin].offset;
+    float total = 0;
     for (unsigned i=0; i < binplan[ibin].hits; i++)
-      total += input[i*ndim];
+      total += input[i];
+    const unsigned obin = binplan[ibin].ibin;
+    atomicAdd(&(f1b1d_shared[obin]), total);
   }
 
-  out_base[ output_ibin * ndim ] += total;
+  __syncthreads();
+
+  // coalesced write all the new phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+    out_base[ibin] = f1b1d_shared[ibin];
 }
 
+__global__ void fold1bin1dim (const float* in_base, unsigned in_span,  // 1dim fold, accumulating directly in out_base
+           float* out_base, unsigned out_span,
+           unsigned nbin, unsigned binplan_size,  // nbin is not referenced by this kernel
+           CUDA::bin* binplan)
+{
+  in_base  += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);  // span * (ichan * npol + ipol)
+  out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z);  // same chan/pol offset for the output
+
+  for (unsigned ibin=threadIdx.x; ibin<binplan_size; ibin+=blockDim.x)  // thread t handles entries t, t+blockDim.x, ...
+  {
+    const float* input = in_base + binplan[ibin].offset;  // first sample of this binplan entry
+    float total = 0;
+    for (unsigned i=0; i < binplan[ibin].hits; i++)  // sum the entry's hits consecutive samples
+      total += input[i];
+    const unsigned obin = binplan[ibin].ibin;  // output phase bin for this entry
+    atomicAdd(&(out_base[obin]), total);  // atomic: presumably several entries can map to one output bin
+  }
+}
 
-__global__ void fold1binhits (const float* in_base,
-			     unsigned in_span,
-			     float* out_base,
-			     unsigned out_span,
+// 2dim kernel hits
+__global__ void fold1bin2dimhits_shared (const float2* in_base, unsigned in_span,
+           float2* out_base, unsigned out_span,
            unsigned* hits_base,
-			     unsigned ndim,
-			     unsigned nbin,
-			     unsigned binplan_size,
-			     CUDA::bin* binplan)
+           unsigned nbin, unsigned binplan_size, CUDA::bin* binplan)
 {
-  unsigned ibin = blockIdx.x * blockDim.x + threadIdx.x;
+  // one shared memory bin for each output phase bin for this chanpol
+  extern __shared__ float2 f1b2dh_shared[];
+  unsigned * hits_shared = (unsigned *) (f1b2dh_shared + nbin);
+
+  //           stride   * (  ichan    *  npol      + ipol     )
+  in_base   += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base  += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+  hits_base += nbin * blockIdx.y;
 
-  if (ibin >= binplan_size)
-    return;
+  // coalesced read the existing phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+  {
+    f1b2dh_shared[ibin] = out_base[ibin];
+    if (blockIdx.z == 0)
+      hits_shared[ibin] = hits_base[ibin];
+  }
+  __syncthreads();
+
+  for (unsigned ibin=threadIdx.x; ibin < binplan_size; ibin += blockDim.x)
+  {
+    const float2* input = in_base + binplan[ibin].offset;
+    float2 total = make_cuComplex (0,0);
+    unsigned hits = 0;
+    for (unsigned i=0; i < binplan[ibin].hits; i++)
+    {
+      total = cuCaddf( total, input[i]);
+      if (blockIdx.z == 0)
+        hits += (input[i].x != 0);
+    }
+    const unsigned obin = binplan[ibin].ibin;
+    atomicAdd(&(f1b2dh_shared[obin].x), total.x);
+    atomicAdd(&(f1b2dh_shared[obin].y), total.y);
+    if (blockIdx.z == 0)
+      atomicAdd(&(hits_shared[obin]), hits);
+  }
+
+  __syncthreads();
 
-  unsigned output_ibin = binplan[ibin].ibin;
+  // coalesced write all the new phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+  {
+    out_base[ibin] = f1b2dh_shared[ibin];
+    if (blockIdx.z == 0)
+      hits_base[ibin] = hits_shared[ibin];
+  }
+}
 
-  in_base  += in_span  * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z;
-  out_base += out_span * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z;
+__global__ void fold1bin2dimhits (const float2* in_base, unsigned in_span,
+           float2* out_base, unsigned out_span,
+           unsigned* hits_base,
+           unsigned nbin, unsigned binplan_size, CUDA::bin* binplan)
+{
+  //           stride   * (  ichan    *  npol      + ipol     )
+  in_base   += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base  += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
   hits_base += nbin * blockIdx.y;
 
-  float total = 0;
-  unsigned hits = 0;
+  for (unsigned ibin=threadIdx.x; ibin < binplan_size; ibin += blockDim.x)
+  {
+    const float2* input = in_base + binplan[ibin].offset;
+    float2 total = make_cuComplex (0,0);
+    unsigned hits = 0;
+    for (unsigned i=0; i < binplan[ibin].hits; i++)
+    {
+      total = cuCaddf( total, input[i]);
+      if (blockIdx.z == 0)
+        hits += (input[i].x != 0);
+    }
+    const unsigned obin = binplan[ibin].ibin;
+    atomicAdd(&(out_base[obin].x), total.x);
+    atomicAdd(&(out_base[obin].y), total.y);
+    if (blockIdx.z == 0)
+      atomicAdd(&(hits_base[obin]), hits);
+  }
+}
+
+__global__ void fold1bin1dimhits_shared (const float* in_base, unsigned in_span,
+           float* out_base, unsigned out_span,
+           unsigned* hits_base,
+           unsigned nbin, unsigned binplan_size, CUDA::bin* binplan)
+{
+  // one shared memory bin for each output phase bin for this chanpol
+  extern __shared__ float f1b1dh_shared[];
+  unsigned * hits_shared = (unsigned *) (f1b1dh_shared + nbin);
+
+  //           stride   * (  ichan    *  npol      + ipol     )
+  in_base   += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base  += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+  hits_base += nbin * blockIdx.y;
 
-  for (; ibin < binplan_size; ibin += nbin)
+  // coalesced read the existing phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
   {
-    const float* input = in_base + binplan[ibin].offset * ndim;
+    f1b1dh_shared[ibin] = out_base[ibin];
+    if (blockIdx.z == 0)
+      hits_shared[ibin] = hits_base[ibin];
+  }
+  __syncthreads();
 
+  for (unsigned ibin=threadIdx.x; ibin < binplan_size; ibin += blockDim.x)
+  {
+    const float* input = in_base + binplan[ibin].offset;
+    float total = 0;
+    unsigned hits = 0;
     for (unsigned i=0; i < binplan[ibin].hits; i++)
     {
-      total += input[i*ndim];
-      hits += (input[i*ndim] != 0);
+      total += input[i];
+      if (blockIdx.z == 0)
+        hits += (input[i] != 0);
     }
+    const unsigned obin = binplan[ibin].ibin;
+    atomicAdd(&(f1b1dh_shared[obin]), total);
+    // for ipol == 0 only
+    if (blockIdx.z == 0)
+      atomicAdd(&(hits_shared[obin]), hits);
   }
 
-  out_base[ output_ibin * ndim ] += total;
-  // if ipol and idim both equal 0
-  if ((threadIdx.y + threadIdx.z) == 0)
-    hits_base[ output_ibin ] += hits;
+  __syncthreads();
+
+  // coalesced write all the new phase bin values
+  for (unsigned ibin=threadIdx.x; ibin<nbin; ibin+=blockDim.x)
+  {
+    out_base[ibin] = f1b1dh_shared[ibin];
+    if (blockIdx.z == 0)
+      hits_base[ibin] = hits_shared[ibin];
+  }
+}
+
+__global__ void fold1bin1dimhits (const float* in_base, unsigned in_span,
+			     float* out_base, unsigned out_span,
+           unsigned* hits_base,
+			     unsigned nbin, unsigned binplan_size, CUDA::bin* binplan)
+{
+  //           stride   * (  ichan    *  npol      + ipol     )
+  in_base   += in_span  * (blockIdx.y * gridDim.z + blockIdx.z);
+  out_base  += out_span * (blockIdx.y * gridDim.z + blockIdx.z);
+  hits_base += nbin * blockIdx.y;
+  
+  for (unsigned ibin=threadIdx.x; ibin < binplan_size; ibin += blockDim.x)
+  {
+    const float* input = in_base + binplan[ibin].offset;
+    float total = 0;
+    unsigned hits = 0;
+    for (unsigned i=0; i < binplan[ibin].hits; i++)
+    {
+      total += input[i];
+      hits += (input[i] != 0);
+    }
+    const unsigned obin = binplan[ibin].ibin;
+    atomicAdd(&(out_base[obin]), total);
+    // ipol == 0 only
+    if (blockIdx.z == 0)
+      atomicAdd(&(hits_base[obin]), hits);
+  }
 }
 
 std::ostream& operator<< (std::ostream& ostr, const dim3& v)
@@ -288,46 +588,102 @@
   setup ();
   send_binplan ();
 
+  // total number of input phase bins to be operated on (capped at folding_nbin)
   unsigned bin_dim = folding_nbin;
   if (binplan_nbin < folding_nbin)
     bin_dim = binplan_nbin;
 
-  unsigned bin_threads = 128;
-  if (bin_threads > bin_dim)
-    bin_threads = 32;
-
-  unsigned bin_blocks = bin_dim / bin_threads;
-  if (bin_dim % bin_threads)
-    bin_blocks ++;
+  // number of threads in the block: one per input phase bin, capped at the
+  // 1024 max TPB and kept >= 1 so the kernel launch configuration stays valid
+  unsigned bin_threads = (bin_dim > 1024) ? 1024 : bin_dim;
+  if (bin_threads == 0) bin_threads = 1;
+
+  // a single block per chan/pol, to ensure block coherency
+  unsigned bin_blocks = 1;
 
-  dim3 blockDim (bin_threads, npol, ndim);
-  dim3 gridDim (bin_blocks, nchan, 1);
+  dim3 blockDim (bin_threads, 1, 1);
+  dim3 gridDim (bin_blocks, nchan, npol);
 
 #if 0
+  cerr << "bin_dim=" << bin_dim << endl;
   cerr << "blockDim=" << blockDim << endl;
   cerr << "gridDim=" << gridDim << endl;
 #endif
 
+  DEBUG("bin_dim=" << bin_dim);
+  DEBUG("bin_threads=" << bin_threads << " bin_blocks=" << bin_blocks);
+  DEBUG("input=" << (void *) input << " output=" << (void *) output);
   DEBUG("input span=" << input_span << " output span=" << output_span);
   DEBUG("ndim=" << ndim << " nbin=" << folding_nbin << " binplan_nbin=" << binplan_nbin);
+  DEBUG("hits_on_gpu=" << hits_on_gpu << " zeroed_samples=" << zeroed_samples << " hits_nchan=" << hits_nchan);
 
-  //cudaThreadSynchronize();
-
+  size_t shared_max = 32768;
+  size_t shared_bytes = folding_nbin * sizeof(float) * ndim;
   if (hits_on_gpu && zeroed_samples && hits_nchan == nchan)
   {
-    fold1binhits<<<gridDim,blockDim,0,stream>>> (input, input_span,
-	  				   output, output_span, hits,
-		  			   ndim, folding_nbin,
-			  		   binplan_nbin, d_bin);
-
+    shared_bytes += folding_nbin * sizeof(unsigned);
+    if (ndim == 2)
+    {
+      if (shared_bytes <= shared_max)
+        fold1bin2dimhits_shared<<<gridDim,blockDim,shared_bytes,stream>>> ((float2*)input, input_span/2,
+                   (float2*) output, output_span/2, hits,
+                   folding_nbin, binplan_nbin, d_bin);
+      else
+        fold1bin2dimhits<<<gridDim,blockDim,0,stream>>> ((float2*)input, input_span/2,
+                   (float2*) output, output_span/2, hits,
+                   folding_nbin, binplan_nbin, d_bin);
+    }
+    else
+    {
+      if (shared_bytes <= shared_max)
+        fold1bin1dimhits_shared<<<gridDim,blockDim,shared_bytes,stream>>> (input, input_span,
+                   output, output_span, hits,
+                   folding_nbin, binplan_nbin, d_bin);
+      else
+        fold1bin1dimhits<<<gridDim,blockDim,0,stream>>> (input, input_span,
+                   output, output_span, hits,
+                   folding_nbin, binplan_nbin, d_bin);
+    }
   }
   else
   {
-    fold1bin<<<gridDim,blockDim,0,stream>>> (input, input_span,
-               output, output_span,
-               ndim, folding_nbin,
-               binplan_nbin, d_bin);
-
+    if (ndim == 2)
+    {
+      if (shared_bytes <= shared_max)
+      {
+        fold1bin2dim_shared<<<gridDim,blockDim,shared_bytes,stream>>> ((cuFloatComplex *) input, input_span/2,
+                 (cuFloatComplex *) output, output_span/2,
+                 folding_nbin, binplan_nbin, d_bin);
+      }
+      else
+      {
+        fold1bin2dim<<<gridDim,blockDim,0,stream>>> ((cuFloatComplex *) input, input_span/2,
+                   (cuFloatComplex *) output, output_span/2,
+                   folding_nbin, binplan_nbin, d_bin);
+      }
+/*
+      dim3 threads(1024, 1, 1);
+      unsigned nwarps = threads.x / 32;
+      dim3 blocks (binplan_nbin/nwarps, nchan, npol);
+      if (binplan_nbin % nwarps)
+        blocks.x++;
+      size_t sbytes = threads.x * sizeof(float2);
+      fold1bin2dim_warp<<<blocks,threads,sbytes,stream>>> ((cuFloatComplex *) input, input_span/2,
+                 (cuFloatComplex *) output, output_span/2,
+                 folding_nbin, binplan_nbin, d_bin);
+*/
+    }
+    else
+    {
+      if (shared_bytes <= shared_max)
+        fold1bin1dim_shared<<<gridDim,blockDim,shared_bytes,stream>>> (input, input_span,
+                   output, output_span,
+                   folding_nbin, binplan_nbin, d_bin);
+      else 
+        fold1bin1dim<<<gridDim,blockDim,0,stream>>> (input, input_span,
+                   output, output_span,
+                   folding_nbin, binplan_nbin, d_bin);
+    }
   }
 
   // profile on the device is no longer synchronized with the one on the host
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/LoadToFold1.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/LoadToFold1.C
--- bl-dspsr-0+git20160405/Signal/Pulsar/LoadToFold1.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/LoadToFold1.C	2018-03-12 23:02:35.000000000 +0000
@@ -27,26 +27,30 @@
 
 #include "dsp/Filterbank.h"
 #include "dsp/FilterbankEngine.h"
-#include "dsp/SKFilterbank.h"
-#include "dsp/SKDetector.h"
-#include "dsp/SKMasker.h"
+#include "dsp/SpectralKurtosis.h"
 #include "dsp/OptimalFFT.h"
 #include "dsp/Resize.h"
 
 #if HAVE_CFITSIO
+#if HAVE_fits
 #include "dsp/FITSFile.h"
+#include "dsp/MultiFile.h"
 #include "dsp/FITSUnpacker.h"
 #endif
+#endif
 
 #if HAVE_CUDA
+#include "dsp/ConvolutionCUDA.h"
+#include "dsp/ConvolutionCUDASpectral.h"
 #include "dsp/FilterbankCUDA.h"
 #include "dsp/OptimalFilterbank.h"
 #include "dsp/TransferCUDA.h"
+#include "dsp/TimeSeriesCUDA.h"
 #include "dsp/TransferBitSeriesCUDA.h"
 #include "dsp/DetectionCUDA.h"
 #include "dsp/FoldCUDA.h"
 #include "dsp/MemoryCUDA.h"
-#include "dsp/SKMaskerCUDA.h"
+#include "dsp/SpectralKurtosisCUDA.h"
 #include "dsp/CyclicFoldEngineCUDA.h"
 #endif
 
@@ -131,18 +135,43 @@
     }
 
 #if HAVE_CFITSIO
+#if HAVE_fits
     // Use callback to handle scales/offsets for read-in
     if (manager->get_info()->get_machine() == "FITS")
     {
       if (Operation::verbose)
-        std::cout << "Using callback to read PSRFITS file." << std::endl;
+        cerr << "Using callback to read PSRFITS file." << endl;
       // connect a callback
-      FITSFile* tmp = dynamic_cast<FITSFile*> (manager->get_input());
-      tmp->update.connect (
-          dynamic_cast<FITSUnpacker*> ( manager->get_unpacker() ),
-          &FITSUnpacker::set_parameters);
+      bool success = false;
+      FITSUnpacker* funp = dynamic_cast<FITSUnpacker*> (
+          manager->get_unpacker());
+      FITSFile* ffile = dynamic_cast<FITSFile*> (manager->get_input());
+      if (funp && ffile)
+      {
+        ffile->update.connect ( funp, &FITSUnpacker::set_parameters );
+        success = true;
+      }
+      else 
+      {
+        MultiFile* mfile = dynamic_cast<MultiFile*> (manager->get_input());
+        if (mfile)
+        {
+          for (unsigned i=0; i < mfile->nfiles(); ++i)
+          {
+            ffile = dynamic_cast<FITSFile*> (mfile->get_files()[i].get());
+            if (funp && ffile) {
+              ffile->update.connect (
+                  funp, &FITSUnpacker::set_parameters );
+              success = true;
+            }
+          }
+        }
+      }
+      if (not success)  // no FITSFile/FITSUnpacker pair was connected above
+        cerr << "dspsr: WARNING: FITS input but unable to apply scales and offsets." << endl;
     }
 #endif
+#endif
 
     config->coherent_dedispersion = false;
     prepare_interchan (unpacked);
@@ -259,103 +288,18 @@
     }
   }
 
-  // only the Filterbank must be out-of-place
-  TimeSeries* convolved = unpacked;
-
-  TimeSeries* skoutput = 0;
-  BitSeries * skzapmask = 0;
-  Reference::To<OperationThread> skthread;
-
-  if (config->sk_zap)
-  {
-    // put the SK signal path into a separate thread
-    skthread = new OperationThread();
-
-    TimeSeries* skfilterbank_input = unpacked;
-
-#if HAVE_CUDA
-    if (run_on_gpu) 
-    {
-      Unpacker* unpack_on_cpu = 0;
-      unpack_on_cpu = manager->get_unpacker()->clone();
-      unpack_on_cpu->set_device (Memory::get_manager());
-
-      unpack_on_cpu->set_input( manager->get_unpacker()->get_input() );
-      unpack_on_cpu->set_output( skfilterbank_input = new_time_series() );
-
-      skthread->append_operation( unpack_on_cpu );
-      manager->set_post_load_operation( skthread.get() );
-    }
-#endif
-
-    skoutput = new_time_series ();
-
-    // Spectral Kurtosis filterbank constructor
-    if (!skfilterbank)
-      skfilterbank = new SKFilterbank (config->sk_nthreads);
-
-    if (!config->input_buffering)
-      skfilterbank->set_buffering_policy (NULL);
-
-    skfilterbank->set_input ( skfilterbank_input );
-
-    skfilterbank->set_output ( skoutput );
-    skfilterbank->set_nchan ( config->filterbank.get_nchan() );
-    skfilterbank->set_M ( config->sk_m );
-
-    // SKFB also maintains trscunched SK stats
-    TimeSeries* skoutput_tscr = new_time_series();
-
-    skfilterbank->set_output_tscr (skoutput_tscr);
-
-    skthread->append_operation (skfilterbank.get());
-
-    // SK Mask Generator
-    skzapmask = new BitSeries;
-    skzapmask->set_nbit (8);
-    skzapmask->set_npol (1);
-    skzapmask->set_nchan (config->filterbank.get_nchan());
-
-    SKDetector * skdetector = new SKDetector;
-    skdetector->set_input (skoutput);
-    skdetector->set_input_tscr (skoutput_tscr);
-    skdetector->set_output (skzapmask);
-
-    skdetector->set_thresholds (config->sk_m, config->sk_std_devs);
-    if (config->sk_chan_start > 0 && config->sk_chan_end < config->filterbank.get_nchan())
-      skdetector->set_channel_range (config->sk_chan_start, config->sk_chan_end);
-    skdetector->set_options (config->sk_no_fscr, config->sk_no_tscr, config->sk_no_ft); 
-
-    skthread->append_operation (skdetector);
-
-#if HAVE_CUDA
-    if (!run_on_gpu)
-#endif
-    {
-      operations.push_back (skthread.get());
-      OperationThread::Wait * skthread_wait = skthread->get_wait();
-      operations.push_back (skthread_wait);
-    }
-
-    // since the blocksize is artificially increased for the SKFB,
-    // we must return it to the required size for the SKFB
-    if (!skresize)
-      skresize = new Resize;
-    
-    skresize->set_input(unpacked);
-    skresize->set_output(unpacked);
-    operations.push_back (skresize.get());
-
-  }
+  // convolved and filterbank are out of place
+  TimeSeries* filterbanked = unpacked;
 
+  // filterbank is performing channelisation
   if (config->filterbank.get_nchan() > 1)
   {
     // new storage for filterbank output (must be out-of-place)
-    convolved = new_time_series ();
+    filterbanked = new_time_series ();
 
 #if HAVE_CUDA
     if (run_on_gpu)
-      convolved->set_memory (device_memory);
+      filterbanked->set_memory (device_memory);
 #endif
 
     config->filterbank.set_device( device_memory.ptr() );
@@ -369,7 +313,7 @@
       filterbank->set_buffering_policy (NULL);
 
     filterbank->set_input (unpacked);
-    filterbank->set_output (convolved);
+    filterbank->set_output (filterbanked);
     
     if (config->filterbank.get_convolve_when() == Filterbank::Config::During)
     {
@@ -383,6 +327,9 @@
       operations.push_back (filterbank.get());
   }
 
+  // output of convolved will be filterbanked|unpacked
+  TimeSeries* convolved = filterbanked;
+
   bool filterbank_after_dedisp
     = config->filterbank.get_convolve_when() == Filterbank::Config::Before;
 
@@ -392,29 +339,44 @@
     if (!convolution)
       convolution = new Convolution;
     
+    if (!config->input_buffering)
+      convolution->set_buffering_policy (NULL);
+
     convolution->set_response (response);
     if (!config->integration_turns)
       convolution->set_passband (passband);
     
+    convolved = new_time_series();
+
     if (filterbank_after_dedisp)
     {
-      convolution->set_input  (unpacked);  
-      convolution->set_output (unpacked);  // inplace
+      convolution->set_input  (filterbanked);  
+      convolution->set_output (convolved);    // out of place
     }
     else
     {
-      convolution->set_input  (convolved);  
-      convolution->set_output (convolved);  // inplace
+      convolution->set_input  (filterbanked);  
+      convolution->set_output (convolved);  // out of place
     }
 
-    if (!config->input_buffering)
-      convolution->set_buffering_policy (NULL);
-
+#if HAVE_CUDA
+    if (run_on_gpu)
+    {
+      convolved->set_memory (device_memory);
+      convolution->set_device (device_memory.ptr());
+      unsigned nchan = manager->get_info()->get_nchan() * config->filterbank.get_nchan();
+      if (nchan >= 16)
+        convolution->set_engine (new CUDA::ConvolutionEngineSpectral (stream));
+      else
+        convolution->set_engine (new CUDA::ConvolutionEngine (stream));
+    }
+#endif
+    
     operations.push_back (convolution.get());
   }
 
   if (filterbank_after_dedisp)
-    prepare_interchan (unpacked);
+    prepare_interchan (convolved);
   else
     prepare_interchan (convolved);
 
@@ -491,15 +453,16 @@
 
     return;
     // the phase-locked filterbank does its own detection and folding
-    
   }
 
   Reference::To<Fold> presk_fold;
   Reference::To<Archiver> presk_unload;
 
+  TimeSeries * cleaned = convolved;
+
   // peform zapping based on the results of the SKFilterbank
   if (config->sk_zap)
-  { 
+  {
     if (config->nosk_too)
     {
       Detection* presk_detect = new Detection;
@@ -538,64 +501,49 @@
       operations.push_back (presk_fold.get());
     }
 
-#if HAVE_CUDA
-    if (run_on_gpu)
-    {
-      OperationThread::Wait * skthread_wait = skthread->get_wait();
-      operations.push_back (skthread_wait);
-    }
-#endif
+    cleaned = new_time_series();
+
+    if (!skestimator)
+      skestimator = new SpectralKurtosis();
 
-    SKMasker * skmasker = new SKMasker;
     if (!config->input_buffering)
-      skmasker->set_buffering_policy (NULL);
+      skestimator->set_buffering_policy (NULL);
+
+    skestimator->set_input (convolved);
+    skestimator->set_output (cleaned);
+    skestimator->set_M (config->sk_m);
 
 #if HAVE_CUDA
     if (run_on_gpu)
     {
-      // transfer the zap mask to the GPU
-      BitSeries * skzapmask_on_gpu = new BitSeries();
-      skzapmask_on_gpu->set_nbit (8);
-      skzapmask_on_gpu->set_npol (1);
-      skzapmask_on_gpu->set_nchan (config->filterbank.get_nchan());
-      skzapmask_on_gpu->set_memory (device_memory);
-
-      TransferBitSeriesCUDA* transfer = new TransferBitSeriesCUDA(stream);
-      transfer->set_kind( cudaMemcpyHostToDevice );
-      transfer->set_input( skzapmask );
-      transfer->set_output( skzapmask_on_gpu );
-      operations.push_back (transfer);
-
-      skmasker->set_mask_input (skzapmask_on_gpu);
-      skmasker->set_engine (new CUDA::SKMaskerEngine (stream));
+      // for input buffering
+      convolved->set_engine (new CUDA::TimeSeriesEngine (device_memory));
+      cleaned->set_memory (device_memory);
+      skestimator->set_engine (new CUDA::SpectralKurtosisEngine (device_memory));
     }
-    else
-      skmasker->set_mask_input (skzapmask);
-#else
-    skmasker->set_mask_input (skzapmask);
 #endif
 
-    skmasker->set_input (convolved);
-    skmasker->set_output (convolved);
-    skmasker->set_M (config->sk_m);
-
-    operations.push_back (skmasker);
+    skestimator->set_thresholds (config->sk_m, config->sk_std_devs);
+    if (config->sk_chan_start > 0 && config->sk_chan_end < config->filterbank.get_nchan())
+      skestimator->set_channel_range (config->sk_chan_start, config->sk_chan_end);
+    skestimator->set_options (config->sk_no_fscr, config->sk_no_tscr, config->sk_no_ft);
 
+    operations.push_back (skestimator.get());
   }
 
   // Cyclic spectrum also detects and folds
   if (config->cyclic_nchan) 
   {
-    build_fold(convolved);
+    build_fold(cleaned);
     return;
   }
 
   if (!detect)
     detect = new Detection;
 
-  TimeSeries* detected = convolved;
-  detect->set_input (convolved);
-  detect->set_output (convolved);
+  TimeSeries* detected = cleaned;
+  detect->set_input (cleaned);
+  detect->set_output (cleaned);
 
   configure_detection (detect, noperations);
 
@@ -636,7 +584,7 @@
     Reference::To<Fold> skfold;
     build_fold (skfold, unload);
 
-    skfold->set_input( skoutput );
+    skfold->set_input( cleaned);
     skfold->prepare( manager->get_info() );
     skfold->reset();
 
@@ -856,7 +804,7 @@
 
     minimum_samples = convolution->get_minimum_samples () * fb_factor;
     if (report_vitals)
-      cerr << "dspsr: convolution requires at least " 
+      cerr << "dspsr: convolution requires at least "
            << minimum_samples << " samples" << endl;
 
     if (!config->input_buffering)
@@ -882,41 +830,41 @@
   uint64_t ram = manager->set_block_size( block_size );
 
 #if HAVE_CFITSIO
+#if HAVE_fits
   // if PSRFITS input, set block to exact size of FITS row
   // this is needed to keep in sync with the callback
   if (manager->get_info()->get_machine() == "FITS")
   {
     FITSFile* tmp = dynamic_cast<FITSFile*> (manager->get_input());
-    unsigned samples_per_row = tmp->get_samples_in_row();
-    uint64_t current_bytes = manager->set_block_size (samples_per_row);
-    uint64_t new_max_ram = current_bytes / tmp->get_block_size() * samples_per_row;
-    if (new_max_ram > config->get_maximum_RAM ())
-      throw Error (InvalidState, "prepare", "Maximum RAM smaller than PSRFITS row.");
-    manager->set_maximum_RAM (new_max_ram);
-    manager->set_block_size (samples_per_row);
-  }
-#endif
-
-  // add the increased block size if the SKFB is being used
-  if (skfilterbank)
-  {
-    block_size = manager->get_input()->get_block_size();
-    int64_t skfb_increment = (int64_t) skfilterbank->get_skfb_inc (block_size);
-
-    block_size += skfb_increment;
-    block_overlap += skfb_increment;
-
-    if (block_overlap)
-      manager->set_overlap( block_overlap );
-    ram = manager->set_block_size( block_size );
-
-    skfb_increment *= -1;
-    skresize->set_resize_samples (skfb_increment);
+    uint64_t block_size;
 
-    if (Operation::verbose)
-      cerr << "dsp::LoadToFold::prepare block_size will be adjusted by " 
-          << skfb_increment << " samples for SKFB" << endl;
+    if (!tmp)
+    {
+      MultiFile* mfile = dynamic_cast<MultiFile*> (manager->get_input());
+      if (mfile)
+      {
+        block_size = mfile->get_block_size();
+        tmp = dynamic_cast<FITSFile*> ( mfile->get_loader() );
+      }
+    }
+    else
+      block_size = tmp->get_block_size();
+    if (tmp)
+    {
+      unsigned samples_per_row = tmp->get_samples_in_row();
+      uint64_t current_bytes = manager->set_block_size (samples_per_row);
+      uint64_t new_max_ram = current_bytes / block_size * samples_per_row;
+      if (new_max_ram > config->get_maximum_RAM ())
+        throw Error (InvalidState, "LoadToFold::prepare", 
+            "Maximum RAM smaller than PSRFITS row.");
+      manager->set_maximum_RAM (new_max_ram);
+      manager->set_block_size (samples_per_row);
+    }
+    else
+      cerr << "dspsr: WARNING have FITS input but cannot set block size properly." << endl;
   }
+#endif
+#endif
 
   if (report_vitals)
   {
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Makefile.am
--- bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Makefile.am	2018-03-12 23:02:35.000000000 +0000
@@ -43,7 +43,13 @@
 	$(top_builddir)/Signal/General/libdspdsp.la \
 	$(top_builddir)/Kernel/libdspbase.la \
 	$(top_builddir)/Signal/Statistics/libdspstats.la \
-	@CUFFT_LIBS@ @CUDA_LIBS@
+	@CUDA_LIBS@ 
+
+if HAVE_CUFFT_CALLBACKS
+LDADD += $(top_builddir)/Signal/General/ConvolutionCUDACallbacks_DC.o -lcufft_static -lculibos
+else
+LDADD += @CUFFT_LIBS@
+endif
 
 AM_CPPFLAGS += @CUFFT_CFLAGS@
 if HAVE_CFITSIO
diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/TransferPhaseSeriesCUDA.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/TransferPhaseSeriesCUDA.C
--- bl-dspsr-0+git20160405/Signal/Pulsar/TransferPhaseSeriesCUDA.C	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/TransferPhaseSeriesCUDA.C	2018-03-12 23:02:35.000000000 +0000
@@ -26,11 +26,6 @@
 {
   prepare ();
 
-  if (stream)
-    cudaStreamSynchronize(stream);
-  else
-    cudaThreadSynchronize();
-
   if (verbose)
     cerr << "dsp::TransferPhaseSeriesCUDA::transformation input ndat="
          << input->get_ndat() << " ndim=" << input->get_ndim()
@@ -75,6 +70,12 @@
       throw Error (InvalidState, "dsp::TransferPhaseSeriesCUDA::transformation hits",
                    cudaGetErrorString (error));
   }
+
+  if (stream)
+    cudaStreamSynchronize(stream);
+  else
+    cudaThreadSynchronize();
+
 }
 
 void dsp::TransferPhaseSeriesCUDA::prepare ()
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/MidPoint.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/MidPoint.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/MidPoint.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/MidPoint.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/MidPoint.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/MidPoint.h
 
 #ifndef __MidPointMethod
 #define __MidPointMethod
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Neville.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Neville.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Neville.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Neville.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Neville.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/Neville.h
 
 #ifndef __NevilleMethod
 #define __NevilleMethod
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/NewtonRaphson.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/NewtonRaphson.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/NewtonRaphson.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/NewtonRaphson.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/NewtonRaphson.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/NewtonRaphson.h
 
 #ifndef __NewtonRaphsonMethod
 #define __NewtonRaphsonMethod
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Romberg.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Romberg.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Romberg.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Romberg.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Romberg.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/Romberg.h
 
 #ifndef __RombergMethod
 #define __RombergMethod
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Trapezoid.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Trapezoid.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Trapezoid.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Trapezoid.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Trapezoid.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/Trapezoid.h
 
 #ifndef __TrapezoidMethod
 #define __TrapezoidMethod
diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/VolumeIntegral.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/VolumeIntegral.h
--- bl-dspsr-0+git20160405/Signal/Statistics/dsp/VolumeIntegral.h	2018-03-12 08:31:57.000000000 +0000
+++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/VolumeIntegral.h	2018-03-12 23:02:35.000000000 +0000
@@ -6,10 +6,7 @@
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/VolumeIntegral.h,v $
-   $Revision: 1.2 $
-   $Date: 2011/08/04 21:03:22 $
-   $Author: straten $ */
+// dspsr/Signal/Statistics/dsp/VolumeIntegral.h
 
 #ifndef __Volume_Integral
 #define __Volume_Integral