diff -Nru bl-dspsr-0+git20160405/Benchmark/filterbank_bench.csh bl-dspsr-0.0~git20180312.50ea209/Benchmark/filterbank_bench.csh --- bl-dspsr-0+git20160405/Benchmark/filterbank_bench.csh 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Benchmark/filterbank_bench.csh 2018-03-12 23:02:35.000000000 +0000 @@ -19,7 +19,7 @@ echo -n "Testing nchan=$nchan nfft=$nfft " - time filterbank_speed -c$nchan -n$nfft -cuda >> filterbank_bench.out + time ../Signal/General/filterbank_speed -c$nchan -n$nfft -cuda >> filterbank_bench.out nfft=`expr $nfft '*' 2` diff -Nru bl-dspsr-0+git20160405/config/ax_hdf5.m4 bl-dspsr-0.0~git20180312.50ea209/config/ax_hdf5.m4 --- bl-dspsr-0+git20160405/config/ax_hdf5.m4 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/config/ax_hdf5.m4 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,183 @@ +dnl +dnl NOTE: this file has been modified from its original form on 9/22/2015. +dnl +dnl ###################################################################### +dnl +dnl File: hdf5.m4 +dnl +dnl Purpose: Determine the locations of hdf5 includes and libraries +dnl +dnl Version: $Id: hdf5.m4,v 1.26 2003/09/15 20:36:26 cary Exp $ +dnl +dnl Tech-X configure system +dnl +dnl Copyright Tech-X Corporation +dnl +dnl ###################################################################### +dnl + +dnl +dnl NOTE: this file was retrieved from: +dnl +dnl https://www.hdfgroup.org/ftp/HDF5/contrib/autoconf-macros/hdf5.m4 +dnl + +dnl +dnl Copyright Notice and License Terms for +dnl HDF5 (Hierarchical Data Format 5) Software Library and Utilities +dnl ----------------------------------------------------------------------------- +dnl +dnl HDF5 (Hierarchical Data Format 5) Software Library and Utilities +dnl Copyright 2006-2015 by The HDF Group. +dnl +dnl NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities +dnl Copyright 1998-2006 by the Board of Trustees of the University of Illinois. +dnl +dnl All rights reserved. 
+dnl +dnl Redistribution and use in source and binary forms, with or without +dnl modification, are permitted for any purpose (including commercial purposes) +dnl provided that the following conditions are met: +dnl +dnl 1. Redistributions of source code must retain the above copyright notice, +dnl this list of conditions, and the following disclaimer. +dnl +dnl 2. Redistributions in binary form must reproduce the above copyright notice, +dnl this list of conditions, and the following disclaimer in the documentation +dnl and/or materials provided with the distribution. +dnl +dnl 3. In addition, redistributions of modified forms of the source or binary +dnl code must carry prominent notices stating that the original code was +dnl changed and the date of the change. +dnl +dnl 4. All publications or advertising materials mentioning features or use of +dnl this software are asked, but not required, to acknowledge that it was +dnl developed by The HDF Group and by the National Center for Supercomputing +dnl Applications at the University of Illinois at Urbana-Champaign and +dnl credit the contributors. +dnl +dnl 5. Neither the name of The HDF Group, the name of the University, nor the +dnl name of any Contributor may be used to endorse or promote products derived +dnl from this software without specific prior written permission from +dnl The HDF Group, the University, or the Contributor, respectively. +dnl +dnl DISCLAIMER: +dnl THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS +dnl "AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED. In no +dnl event shall The HDF Group or the Contributors be liable for any damages +dnl suffered by the users arising out of the use of this software, even if +dnl advised of the possibility of such damage. + +AC_DEFUN([AX_HDF5], [ + +dnl ###################################################################### +dnl +dnl Allow the user to specify an overall hdf5 directory. 
If specified, +dnl we look for include and lib under this. +dnl +dnl ###################################################################### + +AC_ARG_WITH(hdf5,[ --with-hdf5= ],HDF5_DIR="$withval",HDF5_DIR="") + +dnl ###################################################################### +dnl +dnl Find hdf5 includes - looking in include location if present, +dnl otherwise in dir/include if present, otherwise in default locations, +dnl first parallel, then serial. +dnl +dnl ###################################################################### + +AC_ARG_WITH(hdf5-incdir,[ --with-hdf5-incdir= ], +HDF5_INCDIR="$withval",HDF5_INCDIR="") +if test "x$HDF5_DIR" != xno; then +if test -n "$HDF5_INCDIR"; then + HDF5_INCPATH=$HDF5_INCDIR +elif test -n "$HDF5_DIR"; then + HDF5_INCPATH=$HDF5_DIR/include +elif test "$MPI" = yes; then + HDF5_INCPATH=$HOME/hdf5mpi/include:/usr/local/hdf5mpi/include:/loc/hdf5mpi/include:$HOME/hdf5/include:/usr/local/hdf5/include:/loc/hdf5/include:/usr/common/usg/hdf5/default/parallel/include:/usr/local/include +else + HDF5_INCPATH=$HOME/hdf5/include:/usr/local/hdf5/include:/loc/hdf5/include:$HOME/hdf5mpi/include:/usr/local/hdf5mpi/include:/loc/hdf5mpi/include:/usr/common/usg/hdf5/default/serial/include +fi +saveCPPFLAGS=$CPPFLAGS +CPPFLAGS="-I$HDF5_INCPATH $CPPFLAGS" +AC_CHECK_HEADER(hdf5.h, [HDF5_H=y], [HDF5_H=""]) +CPPFLAGS=$saveCPPFLAGS +if test -z "$HDF5_H"; then + AC_MSG_WARN(hdf5.h not found in $HDF5_INCPATH. 
Set with --with-hdf5-incdir=) + HDF5_INC=" " + ac_cv_have_hdf5=no +else + HDF5_INCDIR=$HDF5_INCPATH + AC_SUBST(HDF5_INCDIR) + HDF5_INC=-I$HDF5_INCDIR + HDF5_CPPFLAGS=$HDF5_INC + AC_SUBST(HDF5_INC) + AC_SUBST(HDF5_CPPFLAGS) + HDF5_DIR=`dirname $HDF5_INCDIR` + ac_cv_have_hdf5=yes +fi +fi +dnl ###################################################################### +dnl +dnl See if built parallel +dnl +dnl ###################################################################### + +if test $ac_cv_have_hdf5 = yes; then + if test -f $HDF5_INCDIR/H5config.h; then + hdf5par=`grep "HAVE_PARALLEL 1" $HDF5_INCDIR/H5config.h` + elif test -f $HDF5_INCDIR/H5pubconf.h; then + hdf5par=`grep "HAVE_PARALLEL 1" $HDF5_INCDIR/H5pubconf.h` + fi +fi + +dnl ###################################################################### +dnl +dnl Find hdf5 libraries +dnl +dnl ###################################################################### + +AC_ARG_WITH(hdf5-libdir,[ --with-hdf5-libdir= ], +HDF5_LIBDIR="$withval",HDF5_LIBDIR="") +if test $ac_cv_have_hdf5 = yes; then + if test -n "$HDF5_LIBDIR"; then + HDF5_LIBPATH=$HDF5_LIBDIR + else + HDF5_LIBPATH=$HDF5_DIR/lib + fi + + saveLDFLAGS=$LDFLAGS + LDFLAGS="-L$HDF5_LIBPATH $LDFLAGS" + AC_CHECK_LIB([hdf5],[H5open],[LIBHDF5_A=y],[LIBHDF5_A=""]) + LDFLAGS=$saveLDFLAGS + + if test -z "$LIBHDF5_A"; then + AC_MSG_WARN(libhdf5.a not found. 
Set with --with-hdf5-libdir=) + ac_cv_have_hdf5=no + HDF5_LDFLAGS=" " + HDF5_LIBS=" " + else + HDF5_LIBDIR=$HDF5_LIBPATH + AC_SUBST(HDF5_LIBDIR) + HDF5_LDFLAGS="-L$HDF5_LIBDIR" + HDF5_LIBS="-lhdf5" + AC_SUBST(HDF5_LDFLAGS) + AC_SUBST(HDF5_LIBS) + fi +fi + +dnl ###################################################################### +dnl +dnl Define for whether hdf5 found +dnl +dnl ###################################################################### + +if test $ac_cv_have_hdf5 = yes; then + AC_DEFINE(HAVE_HDF5, [1], [Define if we have libhdf5]) + AM_CONDITIONAL(HAVE_HDF5, true) +else + AM_CONDITIONAL(HAVE_HDF5, false) +fi + +]) dnl End of DEFUN diff -Nru bl-dspsr-0+git20160405/config/cuda.m4 bl-dspsr-0.0~git20180312.50ea209/config/cuda.m4 --- bl-dspsr-0+git20160405/config/cuda.m4 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/config/cuda.m4 2018-03-12 23:02:35.000000000 +0000 @@ -6,6 +6,9 @@ SWIN_PACKAGE_OPTIONS([cuda]) + AC_ARG_ENABLE([cufft_callbacks], + AC_HELP_STRING([--enable-cufft-callbacks],[Use CUFFT callbacks if CUDA enabled, EXPERIMENTAL])) + CUDA_CFLAGS="" CUDA_LIBS="" @@ -87,6 +90,35 @@ fi + have_cufft_callbacks="no" + + if test x"$enable_cufft_callbacks" = xyes; then + + if test "$have_cufft" = "yes" ; then + + AC_MSG_CHECKING([for CUDA FFT Callbacks]) + + SWIN_PACKAGE_FIND([cufft_callbacks],[cufftXt.h]) + SWIN_PACKAGE_TRY_COMPILE([cufft_callbacks],[#include + #include ],[],[$swin_cuda_include_dir]) + + SWIN_PACKAGE_FIND([cufft_callbacks],[libcufft_static.*]) + SWIN_PACKAGE_TRY_LINK([cufft_callbacks],[#include + #include ], + [cufftPlan1d (0, 1024, CUFFT_C2C, 1);],[-lcudart -lcufft]) + + AC_MSG_RESULT([$have_cufft_callbacks]) + + if test "$have_cufft_callbacks" = "yes"; then + AC_DEFINE([HAVE_CUFFT_CALLBACKS],[1],[Define if the CUFFT Callbacks library is present]) + [$1] + else + AC_MSG_WARN([CUFFT Callbacks will not be compiled]) + [$2] + fi + fi + fi + AC_SUBST(CUDA_NVCC) CUDA_LIBS="$cuda_LIBS" @@ -103,4 +135,10 @@ 
AC_SUBST(CUFFT_CFLAGS) AM_CONDITIONAL(HAVE_CUFFT,[test "$have_cufft" = "yes"]) + CUFFT_CALLBACKS_LIBS="${cufft_callbacks_LIBS}_static -lculibos" + CUFFT_CALLBACKS_CFLAGS="$cufft_callbacks_CFLAGS" + + AC_SUBST(CUFFT_CALLBACKS_LIBS) + AC_SUBST(CUFFT_CALLBACKS_CFLAGS) + AM_CONDITIONAL(HAVE_CUFFT_CALLBACKS,[test "$have_cufft_callbacks" = "yes"]) ]) diff -Nru bl-dspsr-0+git20160405/config/guppi_daq.m4 bl-dspsr-0.0~git20180312.50ea209/config/guppi_daq.m4 --- bl-dspsr-0+git20160405/config/guppi_daq.m4 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/config/guppi_daq.m4 2018-03-12 23:02:35.000000000 +0000 @@ -17,7 +17,7 @@ if test x"$GUPPI_DIR" != x; then GUPPI_DAQ_CFLAGS="-I$GUPPI_DIR/src" - GUPPI_DAQ_LIBS="-L$GUPPI_DIR/src -lguppi_daq -lsla -lvdifio -lm" + GUPPI_DAQ_LIBS="-L$GUPPI_DIR/src -lsla -lm" have_guppi_daq="yes" fi diff -Nru bl-dspsr-0+git20160405/config/mark5access.m4 bl-dspsr-0.0~git20180312.50ea209/config/mark5access.m4 --- bl-dspsr-0+git20160405/config/mark5access.m4 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/config/mark5access.m4 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,67 @@ +# SWIN_LIB_MARK5ACCESS([ACTION-IF-FOUND [,ACTION-IF-NOT-FOUND]]) +# ---------------------------------------------------------- +AC_DEFUN([SWIN_LIB_MARK5ACCESS], +[ + AC_PROVIDE([SWIN_LIB_MARK5ACCESS]) + + AC_ARG_WITH([mark5access-dir], + AC_HELP_STRING([--with-mark5access-dir=DIR], + [MARK5ACCESS is installed in DIR])) + + MARK5ACCESS_CFLAGS="" + MARK5ACCESS_LIBS="" + + if test x"$with_mark5access_dir" = xno; then + # user disabled mark5access. Leave cache alone. + have_mark5access="User disabled mark5access." 
+ else + + AC_MSG_CHECKING([for mark5access installation]) + + # "yes" is not a specification + if test x"$with_mark5access_dir" = xyes; then + with_mark5access_dir= + fi + + have_mark5access="not found" + + ac_save_CPPFLAGS="$CPPFLAGS" + ac_save_LIBS="$LIBS" + + CPPFLAGS="`pkg-config --cflags mark5access` $CPPFLAGS" + LIBS="`pkg-config --libs mark5access` $LIBS" + + # TESTPKG="`pkg-config --cflags mark5access`" + # AC_MSG_NOTICE([pkg-config returns $TESTPKG]) + AC_TRY_LINK([#include ], [new_mark5_stream(0,0);], + have_mark5access=yes, have_mark5access=no) + + if test $have_mark5access = yes; then + MARK5ACCESS_CFLAGS="`pkg-config --cflags mark5access`" + MARK5ACCESS_LIBS="`pkg-config --libs mark5access`" + fi + + LIBS="$ac_save_LIBS" + CPPFLAGS="$ac_save_CPPFLAGS" + + fi + + AC_MSG_RESULT([$have_mark5access]) + + if test "$have_mark5access" = "yes"; then + AC_DEFINE([HAVE_MARK5ACCESS], [1], [Define if the mark5access library is present]) + [$1] + else + AC_MSG_NOTICE([Ensure that the PKG_CONFIG_PATH environment variable points to]) + AC_MSG_NOTICE([the lib/pkgconfig sub-directory of the root directory where]) + AC_MSG_NOTICE([the mark5access library was installed.]) + AC_MSG_NOTICE([Alternatively, use the --with-mark5access-dir option.]) + [$2] + fi + + AC_SUBST(MARK5ACCESS_LIBS) + AC_SUBST(MARK5ACCESS_CFLAGS) + AM_CONDITIONAL(HAVE_MARK5ACCESS,[test "$have_mark5access" = "yes"]) + +]) + diff -Nru bl-dspsr-0+git20160405/configure.ac bl-dspsr-0.0~git20180312.50ea209/configure.ac --- bl-dspsr-0+git20160405/configure.ac 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/configure.ac 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,223 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. 
+ +AC_PREREQ(2.57) +AC_INIT([DSPSR],[2016-06+],[dspsr-developers@lists.sourceforge.net]) + +AC_CONFIG_AUX_DIR([config]) +AC_CONFIG_SRCDIR([Kernel/Makefile.am]) + +AC_ARG_VAR([PSRHOME], [Standard pulsar home directory]) +AC_ARG_VAR([LOGIN_ARCH], [Architecture-dependent sub-directory of PSRHOME]) +AC_ARG_VAR([PACKAGES], [Root directories of third-party packages]) +AC_ARG_VAR([CUDA_NVCC_FLAGS], [CUDA nvcc flags (e.g. -arch, etc)]) + +AC_PREFIX_DEFAULT([${PSRHOME:-"/usr/local"}${PSRHOME:+"/$LOGIN_ARCH"}]) + +if test x"$PSRHOME" = x; then + AC_MSG_WARN([The PSRHOME environment variable is not set]) +else + if test x"$LOGIN_ARCH" = x; then + AC_MSG_WARN([The LOGIN_ARCH environment variable is not set]) + fi +fi + +# +# +# + +# Check if the user has set compiler options +SWIN_OPTIONS_SET + +# Enable convenience compiler selection +SWIN_COMPILER + +AM_INIT_AUTOMAKE([foreign subdir-objects]) + +# +# Check for selected formats in backends.list +# + +DSPSR_FORMATS + +# Create static libraries by default +AC_DISABLE_SHARED + +# Enable large-file support +AC_SYS_LARGEFILE + +# Determine the machine endian +AC_C_BIGENDIAN + +# Checks for programs. +AC_PROG_CXX +AC_PROG_CXXCPP +AC_PROG_CC +AC_PROG_CPP +AM_PROG_CC_C_O + +AC_PROG_F77 + +AC_PROG_INSTALL +AC_PROG_LIBTOOL + +# +# Disable the debugging information option, -g +# +SWIN_DEBUG +SWIN_LOCAL + +# +# Checks for essential libraries +# +AC_CHECK_LIB([m], [sin]) +SWIN_LIB_PSRCHIVE([],AC_MSG_ERROR([PSRCHIVE Library not found. 
+ +Please see http://psrchive.sourceforge.net + +])) + +PSRCHIVE_ACLOCAL=`$psrchive_config --aclocal` +AC_SUBST(PSRCHIVE_ACLOCAL) + +PSRCHIVE_INCLUDE=`$psrchive_config --cflags | sed s/-pthread//` +AC_SUBST(PSRCHIVE_INCLUDE) + +# +# Checks for optional libraries +# +SWIN_LIB_PSRDADA +SWIN_LIB_CFITSIO +SWIN_LIB_MPI +SWIN_LIB_CUDA +SWIN_LIB_GUPPI_DAQ +SWIN_LIB_MARK5ACCESS + +AX_OPENMP +AC_SUBST(OPENMP_CFLAGS) + +# +# Checks for graphics libraries +# +SWIN_LIB_PGPLOT + +# +# For developers: preserve file modification times +# +INSTALL_DATA="${INSTALL} -m 644 -p" +install_sh="CPPROG='cp -p' ${install_sh}" + +# +# Checks for header files. +# +AC_CHECK_HEADERS([malloc.h]) + +# Check for openssl +MJK_LIB_CRYPTO + +# Check for psrxml io library +MJK_LIB_PSRXML + +# find HDF5 (needed for LOFAR) +AX_HDF5 + +# +# Checks for library functions. +# +SWIN_FUNC_GETOPT_LONG +SWIN_FUNC_AFFINITY + +# +# Generate python module if --enable-shared is used +# +if test x"$enable_shared" == xyes; then + AM_PATH_PYTHON(,, [:]) + AC_PROG_SWIG + SWIG_ENABLE_CXX + SWIG_PYTHON +else + PYTHON=":" +fi + +AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :]) + +# +# Initialize variables used by Makefile.include +# +INCLUDE_CPPFLAGS="$PSRCHIVE_CPPFLAGS" +AC_SUBST(INCLUDE_CPPFLAGS) + +INCLUDE_LDFLAGS="$PSRCHIVE_LIBS" +AC_SUBST(INCLUDE_LDFLAGS) + +AM_CONFIG_HEADER([config.h]) +AC_CONFIG_FILES([Makefile + config/Makefile + Kernel/Makefile + Kernel/Classes/Makefile + Kernel/Applications/Makefile + Kernel/Formats/Makefile + Kernel/Formats/apsr/Makefile + Kernel/Formats/asp/Makefile + Kernel/Formats/bcpm/Makefile + Kernel/Formats/bpsr/Makefile + Kernel/Formats/caspsr/Makefile + Kernel/Formats/cpsr/Makefile + Kernel/Formats/cpsr2/Makefile + Kernel/Formats/dada/Makefile + Kernel/Formats/dummy/Makefile + Kernel/Formats/emerlin/Makefile + Kernel/Formats/fadc/Makefile + Kernel/Formats/fits/Makefile + Kernel/Formats/gmrt/Makefile + Kernel/Formats/guppi/Makefile + Kernel/Formats/kat/Makefile + 
Kernel/Formats/lbadr/Makefile + Kernel/Formats/lbadr64/Makefile + Kernel/Formats/lofar_dal/Makefile + Kernel/Formats/lump/Makefile + Kernel/Formats/lwa/Makefile + Kernel/Formats/spda1k/Makefile + Kernel/Formats/mark4/Makefile + Kernel/Formats/mark5/Makefile + Kernel/Formats/mark5b/Makefile + Kernel/Formats/maxim/Makefile + Kernel/Formats/mopsr/Makefile + Kernel/Formats/mwa/Makefile + Kernel/Formats/pmdaq/Makefile + Kernel/Formats/pdev/Makefile + Kernel/Formats/puma/Makefile + Kernel/Formats/puma2/Makefile + Kernel/Formats/s2/Makefile + Kernel/Formats/sigproc/Makefile + Kernel/Formats/ska1/Makefile + Kernel/Formats/spigot/Makefile + Kernel/Formats/vdif/Makefile + Kernel/Formats/wapp/Makefile + Signal/Makefile + Signal/Statistics/Makefile + Signal/General/Makefile + Signal/Pulsar/Makefile + Management/Makefile + Management/dspsr_ldflags + Management/dspsr_cflags + Management/release.csh + More/Makefile + More/Plotting/Makefile + More/Applications/Makefile + python/Makefile]) + +AC_OUTPUT + +echo + +if test x"$selected_formats" = x; then + echo "WARNING: no file formats have been selected" + echo "Please see http://dspsr.sourceforge.net/formats for details" +else + echo "DSPSR will support these formats: $selected_formats" +fi + +echo +echo "DSPSR is now ready to be compiled. Please run 'make'" +echo diff -Nru bl-dspsr-0+git20160405/configure.in bl-dspsr-0.0~git20180312.50ea209/configure.in --- bl-dspsr-0+git20160405/configure.in 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/configure.in 1970-01-01 00:00:00.000000000 +0000 @@ -1,213 +0,0 @@ -# -*- Autoconf -*- -# Process this file with autoconf to produce a configure script. 
- -AC_PREREQ(2.57) -AC_INIT([DSPSR],[2.0],[dspsr-developers@lists.sourceforge.net]) - -AC_CONFIG_AUX_DIR([config]) -AC_CONFIG_SRCDIR([Kernel/Makefile.am]) - -AC_ARG_VAR([PSRHOME], [Standard pulsar home directory]) -AC_ARG_VAR([LOGIN_ARCH], [Architecture-dependent sub-directory of PSRHOME]) -AC_ARG_VAR([PACKAGES], [Root directories of third-party packages]) -AC_ARG_VAR([CUDA_NVCC_FLAGS], [CUDA nvcc flags (e.g. -arch, etc)]) - -AC_PREFIX_DEFAULT([${PSRHOME:-"/usr/local"}${PSRHOME:+"/$LOGIN_ARCH"}]) - -if test x"$PSRHOME" = x; then - AC_MSG_WARN([The PSRHOME environment variable is not set]) -else - if test x"$LOGIN_ARCH" = x; then - AC_MSG_WARN([The LOGIN_ARCH environment variable is not set]) - fi -fi - -# -# -# - -# Check if the user has set compiler options -SWIN_OPTIONS_SET - -# Enable convenience compiler selection -SWIN_COMPILER - -AM_INIT_AUTOMAKE([foreign subdir-objects]) - -# -# Check for selected formats in backends.list -# - -DSPSR_FORMATS - -# Create static libraries by default -AC_DISABLE_SHARED - -# Enable large-file support -AC_SYS_LARGEFILE - -# Determine the machine endian -AC_C_BIGENDIAN - -# Checks for programs. -AC_PROG_CXX -AC_PROG_CXXCPP -AC_PROG_CC -AC_PROG_CPP -AM_PROG_CC_C_O - -AC_PROG_INSTALL -AC_PROG_LIBTOOL - -# -# Disable the debugging information option, -g -# -SWIN_DEBUG -SWIN_LOCAL - -# -# Checks for essential libraries -# -AC_CHECK_LIB([m], [sin]) -SWIN_LIB_PSRCHIVE([],AC_MSG_ERROR([PSRCHIVE Library not found. 
- -Please see http://psrchive.sourceforge.net - -])) - -PSRCHIVE_ACLOCAL=`$psrchive_config --aclocal` -AC_SUBST(PSRCHIVE_ACLOCAL) - -PSRCHIVE_INCLUDE=`$psrchive_config --cflags | sed s/-pthread//` -AC_SUBST(PSRCHIVE_INCLUDE) - -# -# Checks for optional libraries -# -SWIN_LIB_PSRDADA -SWIN_LIB_CFITSIO -SWIN_LIB_MPI -SWIN_LIB_CUDA -SWIN_LIB_GUPPI_DAQ - -AX_OPENMP -AC_SUBST(OPENMP_CFLAGS) - -# -# Checks for graphics libraries -# -SWIN_LIB_PGPLOT - -# -# For developers: preserve file modification times -# -INSTALL_DATA="${INSTALL} -m 644 -p" -install_sh="CPPROG='cp -p' ${install_sh}" - -# -# Checks for header files. -# -AC_CHECK_HEADERS([malloc.h]) - -# Check for openssl -MJK_LIB_CRYPTO - -# Check for psrxml io library -MJK_LIB_PSRXML - -# -# Checks for library functions. -# -SWIN_FUNC_GETOPT_LONG -SWIN_FUNC_AFFINITY - -# -# Generate python module if --enable-shared is used -# -if test x"$enable_shared" == xyes; then - AM_PATH_PYTHON(,, [:]) - AC_PROG_SWIG - SWIG_ENABLE_CXX - SWIG_PYTHON -else - PYTHON=":" -fi - -AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :]) - -# -# Initialize variables used by Makefile.include -# -INCLUDE_CPPFLAGS="$PSRCHIVE_CPPFLAGS" -AC_SUBST(INCLUDE_CPPFLAGS) - -INCLUDE_LDFLAGS="$PSRCHIVE_LIBS" -AC_SUBST(INCLUDE_LDFLAGS) - -AM_CONFIG_HEADER([config.h]) -AC_CONFIG_FILES([Makefile - config/Makefile - Kernel/Makefile - Kernel/Classes/Makefile - Kernel/Applications/Makefile - Kernel/Formats/Makefile - Kernel/Formats/apsr/Makefile - Kernel/Formats/asp/Makefile - Kernel/Formats/bcpm/Makefile - Kernel/Formats/bpsr/Makefile - Kernel/Formats/caspsr/Makefile - Kernel/Formats/cpsr/Makefile - Kernel/Formats/cpsr2/Makefile - Kernel/Formats/dada/Makefile - Kernel/Formats/dummy/Makefile - Kernel/Formats/fadc/Makefile - Kernel/Formats/fits/Makefile - Kernel/Formats/gmrt/Makefile - Kernel/Formats/guppi/Makefile - Kernel/Formats/kat/Makefile - Kernel/Formats/lbadr/Makefile - Kernel/Formats/lbadr64/Makefile - Kernel/Formats/lofar_dal/Makefile - 
Kernel/Formats/lump/Makefile - Kernel/Formats/lwa/Makefile - Kernel/Formats/spda1k/Makefile - Kernel/Formats/mark4/Makefile - Kernel/Formats/mark5/Makefile - Kernel/Formats/maxim/Makefile - Kernel/Formats/mwa/Makefile - Kernel/Formats/pmdaq/Makefile - Kernel/Formats/pdev/Makefile - Kernel/Formats/puma/Makefile - Kernel/Formats/puma2/Makefile - Kernel/Formats/s2/Makefile - Kernel/Formats/sigproc/Makefile - Kernel/Formats/spigot/Makefile - Kernel/Formats/vdif/Makefile - Kernel/Formats/wapp/Makefile - Signal/Makefile - Signal/Statistics/Makefile - Signal/General/Makefile - Signal/Pulsar/Makefile - Management/Makefile - Management/dspsr_ldflags - Management/dspsr_cflags - Management/release.csh - More/Makefile - More/Plotting/Makefile - More/Applications/Makefile - python/Makefile]) - -AC_OUTPUT - -echo - -if test x"$selected_formats" = x; then - echo "WARNING: no file formats have been selected" - echo "Please see http://dspsr.sourceforge.net/formats for details" -else - echo "DSPSR will support these formats: $selected_formats" -fi - -echo -echo "DSPSR is now ready to be compiled. 
Please run 'make'" -echo diff -Nru bl-dspsr-0+git20160405/debian/changelog bl-dspsr-0.0~git20180312.50ea209/debian/changelog --- bl-dspsr-0+git20160405/debian/changelog 2018-03-13 10:20:03.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/changelog 2018-08-20 13:41:36.000000000 +0000 @@ -1,3 +1,13 @@ +bl-dspsr (0.0~git20180312.50ea209-1) bionic; urgency=medium + + [ Gijs Molenaar ] + * add watch file + * New upstream version 0.0~git20180312.50ea209 + * remove old patch + * disable tests since broken + + -- KERN packaging Mon, 20 Aug 2018 15:41:36 +0200 + bl-dspsr (0+git20160405-1) bionic; urgency=medium * Initial release diff -Nru bl-dspsr-0+git20160405/debian/clean bl-dspsr-0.0~git20180312.50ea209/debian/clean --- bl-dspsr-0+git20160405/debian/clean 2018-03-13 10:19:45.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/clean 2018-08-20 13:41:36.000000000 +0000 @@ -89,3 +89,57 @@ Kernel/Formats/fits/FITSOutputFile.o Kernel/Formats/fits/FITSUnpacker.o Kernel/Formats/fits/GUPPIFITSUnpacker.o +Kernel/Classes/Makefile.in +Kernel/Formats/Makefile.in +Kernel/Formats/apsr/Makefile.in +Kernel/Formats/asp/Makefile.in +Kernel/Formats/bcpm/Makefile.in +Kernel/Formats/bpsr/Makefile.in +Kernel/Formats/caspsr/Makefile.in +Kernel/Formats/cpsr/Makefile.in +Kernel/Formats/cpsr2/Makefile.in +Kernel/Formats/dada/Makefile.in +Kernel/Formats/dummy/Makefile.in +Kernel/Formats/emerlin/Makefile.in +Kernel/Formats/fadc/Makefile.in +Kernel/Formats/fits/Makefile.in +Kernel/Formats/gmrt/Makefile.in +Kernel/Formats/guppi/Makefile.in +Kernel/Formats/kat/Makefile.in +Kernel/Formats/lbadr/Makefile.in +Kernel/Formats/lbadr64/Makefile.in +Kernel/Formats/lofar_dal/Makefile.in +Kernel/Formats/lump/Makefile.in +Kernel/Formats/lwa/Makefile.in +Kernel/Formats/mark4/Makefile.in +Kernel/Formats/mark5/Makefile.in +Kernel/Formats/mark5b/Makefile.in +Kernel/Formats/maxim/Makefile.in +Kernel/Formats/mopsr/Makefile.in +Kernel/Formats/mwa/Makefile.in +Kernel/Formats/pdev/Makefile.in 
+Kernel/Formats/pmdaq/Makefile.in +Kernel/Formats/puma/Makefile.in +Kernel/Formats/puma2/Makefile.in +Kernel/Formats/s2/Makefile.in +Kernel/Formats/sigproc/Makefile.in +Kernel/Formats/ska1/Makefile.in +Kernel/Formats/spda1k/Makefile.in +Kernel/Formats/spigot/Makefile.in +Kernel/Formats/vdif/Makefile.in +Kernel/Formats/wapp/Makefile.in +Kernel/Makefile.in +Makefile.in +Management/Makefile.in +More/Applications/Makefile.in +More/Makefile.in +More/Plotting/Makefile.in +Signal/General/Makefile.in +Signal/Makefile.in +Signal/Pulsar/Makefile.in +Signal/Statistics/Makefile.in +config/Makefile.in +python/Makefile.in +config/fontutil.m4 +config/mysql.m4 + diff -Nru bl-dspsr-0+git20160405/debian/control bl-dspsr-0.0~git20180312.50ea209/debian/control --- bl-dspsr-0+git20160405/debian/control 2018-03-13 10:20:03.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/control 2018-08-20 13:41:36.000000000 +0000 @@ -2,8 +2,16 @@ Section: science Priority: optional Maintainer: KERN packaging -Build-Depends: debhelper (>=9), psrchive, liblapack-dev, libfftw3-dev, - libcfitsio-dev, dh-autoreconf, gfortran +Build-Depends: + debhelper (>=9), + psrchive, + psrchive-dev, + liblapack-dev, + libfftw3-dev, + libcfitsio3-dev|libcfitsio-dev, + libgsl-dev, + dh-autoreconf, + gfortran Standards-Version: 3.9.6 Homepage: https://github.com/UCBerkeleySETI/dspsr diff -Nru bl-dspsr-0+git20160405/debian/patches/add_missing_dep bl-dspsr-0.0~git20180312.50ea209/debian/patches/add_missing_dep --- bl-dspsr-0+git20160405/debian/patches/add_missing_dep 2018-03-13 10:19:45.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/patches/add_missing_dep 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -Description: lib dependencies missing - ---- bl-dspsr-0+git20160405.orig/Signal/Pulsar/Makefile.am -+++ bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am -@@ -43,6 +43,8 @@ LDADD = libdspsr.la \ - $(top_builddir)/Signal/General/libdspdsp.la \ - $(top_builddir)/Kernel/libdspbase.la \ - 
$(top_builddir)/Signal/Statistics/libdspstats.la \ -+ $(top_builddir)/Kernel/Formats/fits/libfits.la \ -+ $(top_builddir)/Kernel/Classes/libClasses.la \ - @CUFFT_LIBS@ @CUDA_LIBS@ - - AM_CPPFLAGS += @CUFFT_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/debian/patches/series bl-dspsr-0.0~git20180312.50ea209/debian/patches/series --- bl-dspsr-0+git20160405/debian/patches/series 2018-03-13 10:19:45.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/patches/series 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -add_missing_dep diff -Nru bl-dspsr-0+git20160405/debian/rules bl-dspsr-0.0~git20180312.50ea209/debian/rules --- bl-dspsr-0+git20160405/debian/rules 2018-03-13 10:19:45.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/rules 2018-08-20 13:40:06.000000000 +0000 @@ -11,3 +11,4 @@ dh_auto_build -- -C Kernel/Formats/fits dh_auto_build +override_dh_auto_test: diff -Nru bl-dspsr-0+git20160405/debian/watch bl-dspsr-0.0~git20180312.50ea209/debian/watch --- bl-dspsr-0+git20160405/debian/watch 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/debian/watch 2018-08-17 14:32:40.000000000 +0000 @@ -0,0 +1,5 @@ +version=4 +opts="mode=git, pgpmode=none, pretty=0.0~git%cd.%h" \ +https://github.com/UCBerkeleySETI/dspsr \ +heads/directio-branch debian uupdate + diff -Nru bl-dspsr-0+git20160405/.gitignore bl-dspsr-0.0~git20180312.50ea209/.gitignore --- bl-dspsr-0+git20160405/.gitignore 2018-03-12 08:32:59.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/.gitignore 2018-03-12 23:02:35.000000000 +0000 @@ -52,6 +52,7 @@ Management/release.csh More/Applications/searchplot Signal/General/Makefile +Signal/General/cufft_callback_bench Signal/General/digifil Signal/General/digifits Signal/General/digihist @@ -59,9 +60,11 @@ Signal/General/digistat Signal/General/digitxt Signal/General/dmsmear +Signal/General/fftbatch_speed Signal/General/filterbank_speed Signal/General/passband Signal/General/the_decimator +Signal/General/undersampling_speed 
Signal/Makefile Signal/Pulsar/Makefile Signal/Pulsar/dspsr diff -Nru bl-dspsr-0+git20160405/Kernel/Applications/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Applications/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Applications/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Applications/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -15,9 +15,11 @@ ############################################################################# -INCLUDES = -I$(top_builddir)/local_include -LDADD = $(top_builddir)/Kernel/libdspbase.la +LDADD = $(top_builddir)/Kernel/libdspbase.la @CUDA_LIBS@ @CUFFT_LIBS@ include $(top_srcdir)/config/Makefile.include +AM_CPPFLAGS += -I$(top_builddir)/local_include +AM_CPPFLAGS += @CUDA_CFLAGS@ @CUFFT_CFLAGS@ + diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/ASCIIObservation.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/ASCIIObservation.C --- bl-dspsr-0+git20160405/Kernel/Classes/ASCIIObservation.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/ASCIIObservation.C 2018-03-12 23:02:35.000000000 +0000 @@ -340,6 +340,18 @@ // ////////////////////////////////////////////////////////////////////// // + // PICOSECONDS offset from UTC_START second + // + uint64_t offset_picoseconds = 0; + if (ascii_header_check (header, "PICOSECONDS", UI64, &offset_picoseconds) >= 0) + { + double offset_seconds = double(offset_picoseconds) / 1e12; + recording_start_time += offset_seconds; + } + + + // ////////////////////////////////////////////////////////////////////// + // // OBS_OFFSET // offset_bytes = 0; diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/BitSeries.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/BitSeries.C --- bl-dspsr-0+git20160405/Kernel/Classes/BitSeries.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/BitSeries.C 2018-03-12 23:02:35.000000000 +0000 @@ -102,7 +102,7 @@ const unsigned char* from = 
bitseries.get_rawptr(); unsigned char* to = get_rawptr(); - memcpy(to,from,size_t(bitseries.get_nbytes())); + memory->do_copy (to, from, size_t(bitseries.get_nbytes())); return *this; } @@ -154,7 +154,7 @@ unsigned char *into = get_rawptr(); const unsigned char *from = copy->get_rawptr() + offset; - memcpy (into, from, size_t(bytes)); + memory->do_copy (into, from, size_t(bytes)); } input_sample = copy->input_sample + idat_start; @@ -189,7 +189,7 @@ const unsigned char* from = little->get_datptr(0); unsigned char* to = get_datptr(get_ndat()); - memcpy(to,from,size_t(get_nbytes())); + memory->do_copy (to, from, size_t(get_nbytes())); set_ndat( get_ndat() + little->get_ndat() ); } diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/DADAFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DADAFile.C --- bl-dspsr-0+git20160405/Kernel/Classes/DADAFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DADAFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -15,6 +15,7 @@ #include "FilePtr.h" #include "Error.h" +#include "strutil.h" #include #include @@ -23,11 +24,13 @@ dsp::DADAFile::DADAFile (const char* filename) : File ("DADA") { + separate_header_file = false; + if (filename) open (filename); } -string dsp::DADAFile::get_header (const char* filename) +string dsp::DADAFile::get_header (const char* filename) const { FilePtr fptr = fopen (filename, "r"); if (!fptr) @@ -35,7 +38,7 @@ "fopen (%s)", filename); // default DADA header size - unsigned hdr_size = 4096; + long hdr_size = 4096; vector buffer; char* header = 0; @@ -55,13 +58,51 @@ /* Get the header size */ if (ascii_header_get (header, "HDR_SIZE", "%u", &hdr_size) != 1) - throw Error (InvalidState, "dsp::DADAFile::get_header", - "could not parse HDR_SIZE"); + hdr_size = 0; /* Ensure that the incoming header fits in the client header buffer */ } while (hdr_size > buffer.size()); + if (hdr_size == 0) + { + // search for a matching .hdr file + string hdr_ext = ".hdr"; + string 
hdr_fname = replace_extension (filename, hdr_ext); + FilePtr hdr_ptr = fopen (hdr_fname.c_str(), "r"); + if (!fptr) + { + hdr_fname = filename + hdr_ext; + hdr_ptr = fopen (hdr_fname.c_str(), "r"); + } + + if (!hdr_ptr) + throw Error (InvalidState, "dsp::DADAFile::get_header", + "file has no header and no matching header file found"); + + if (fseek (hdr_ptr, 0, SEEK_END) < 0) + throw Error (FailedSys, "dsp::DADAFile::get_header", + "could not fseek to end of header file"); + + hdr_size = ftell (hdr_ptr); + if (hdr_size < 0) + throw Error (FailedSys, "dsp::DADAFile::get_header", + "ftell fails at end of header file"); + + ::rewind (hdr_ptr); + + buffer.resize (hdr_size); + header = &(buffer[0]); + + if (fread (header, 1, hdr_size, hdr_ptr) != hdr_size) + throw Error (FailedSys, "dsp::DADAFile::get_header", + "fread (nbyte=%u) from header file", hdr_size); + + // ensure that text is null-terminated before calling ascii_header_get + header[ hdr_size-1 ] = '\0'; + separate_header_file = true; + } + if (!header) return string(); @@ -116,11 +157,15 @@ info = new ASCIIObservation (header.c_str()); - if (ascii_header_get (header.c_str(), "HDR_SIZE", "%u", &header_bytes) < 0) + const char* hdr = header.c_str(); + + if (separate_header_file) + header_bytes = 0; + else if (ascii_header_get (hdr, "HDR_SIZE", "%u", &header_bytes) < 0) throw Error (FailedCall, "dsp::DADAFile::open_file", "ascii_header_get(HDR_SIZE) failed"); - if (ascii_header_get (header.c_str(), "RESOLUTION", "%u", &resolution) < 0) + if (ascii_header_get (hdr, "RESOLUTION", "%u", &resolution) < 0) resolution = 1; // the resolution is the _byte_ resolution; convert to _sample_ resolution diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/debug.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/debug.h --- bl-dspsr-0+git20160405/Kernel/Classes/debug.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/debug.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/debug.h,v $ - $Revision: 1.1 $ - $Date: 2009/11/15 00:47:21 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/debug.h #ifndef __debug_h #define __debug_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ASCIIObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ASCIIObservation.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ASCIIObservation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ASCIIObservation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ASCIIObservation.h,v $ - $Revision: 1.8 $ - $Date: 2011/08/01 10:05:37 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/ASCIIObservation.h #ifndef __ASCIIObservation_h #define __ASCIIObservation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BitUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BitUnpacker.h,v $ - $Revision: 1.6 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/BitUnpacker.h #ifndef __BitUnpacker_h #define __BitUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockFile.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BlockFile.h,v $ - $Revision: 1.8 $ - $Date: 2010/10/22 19:17:56 $ - $Author: demorest $ */ +// dspsr/Kernel/Classes/dsp/BlockFile.h #ifndef __dsp_BlockFile_h @@ -59,7 +56,7 @@ than the sampled data, this method should be overloaded and the additional information should be filtered out. */ virtual int64_t load_bytes (unsigned char* buffer, uint64_t nbytes); - + //! Set the file pointer to the absolute number of sampled data bytes /*! If the header_bytes attribute is set, this number of bytes will be subtracted by File::seek_bytes before seeking. If the diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockIterator.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BlockIterator.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BlockIterator.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BlockIterator.h,v $ - $Revision: 1.3 $ - $Date: 2008/09/09 06:34:14 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/BlockIterator.h #ifndef __BlockIterator_h #define __BlockIterator_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/BufferingPolicy.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BufferingPolicy.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/BufferingPolicy.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/BufferingPolicy.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/BufferingPolicy.h,v $ - $Revision: 1.7 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// 
dspsr/Kernel/Classes/dsp/BufferingPolicy.h #ifndef __baseband_dsp_BufferingPolicy_h #define __baseband_dsp_BufferingPolicy_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/DADAFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DADAFile.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/DADAFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DADAFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/DADAFile.h,v $ - $Revision: 1.2 $ - $Date: 2008/05/28 21:12:42 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/DADAFile.h #ifndef __DADAFile_h #define __DADAFile_h @@ -36,8 +33,10 @@ virtual void open_file (const char* filename); //! Read the DADA ascii header from filename - static std::string get_header (const char* filename); + std::string get_header (const char* filename) const; + //! Flag set true when the header information is in a separate text file + mutable bool separate_header_file; }; } diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Digitizer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Digitizer.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Digitizer.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Digitizer.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Digitizer.h,v $ - $Revision: 1.3 $ - $Date: 2010/04/02 21:27:32 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Digitizer.h #ifndef __Digitizer_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/dsp.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/dsp.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/dsp.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/dsp.h 
2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/dsp.h,v $ - $Revision: 1.6 $ - $Date: 2008/04/14 21:23:59 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/dsp.h #ifndef __baseband_dsp_h #define __baseband_dsp_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/DummyFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DummyFile.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/DummyFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/DummyFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -36,7 +36,7 @@ void close(); //! load bytes - int64_t load_bytes(unsigned char *buffer, uint64_t bytes); + int64_t load_bytes (unsigned char *buffer, uint64_t bytes); //! seek bytes int64_t seek_bytes(uint64_t bytes); diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitOne.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitOne.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitOne.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitOne.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/EightBitOne.h,v $ - $Revision: 1.3 $ - $Date: 2009/10/30 00:15:03 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/EightBitOne.h #ifndef __EightBitOne_h #define __EightBitOne_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/EightBitUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/EightBitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/EightBitUnpacker.h,v $ - $Revision: 1.5 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/EightBitUnpacker.h #ifndef __EightBitUnpacker_h #define __EightBitUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ExcisionUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ExcisionUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ExcisionUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ExcisionUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ExcisionUnpacker.h,v $ - $Revision: 1.6 $ - $Date: 2009/08/27 06:53:58 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/ExcisionUnpacker.h #ifndef __ExcisionUnpacker_h #define __ExcisionUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/excision_unpack.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/excision_unpack.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/excision_unpack.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/excision_unpack.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/excision_unpack.h,v $ - $Revision: 1.5 $ - $Date: 2009/10/30 00:15:08 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/excision_unpack.h #ifndef __ExcisionUnpacker_excision_unpack_h #define __ExcisionUnpacker_excision_unpack_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/File.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/File.h 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/File.h,v $ - $Revision: 1.34 $ - $Date: 2012/02/24 20:47:06 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/File.h #ifndef __File_h @@ -117,11 +114,23 @@ //! The name of the currently opened file, set by open() std::string current_filename; +#if HAVE_CUDA + //! staging buffer for Host to Device transfers + void * host_buffer; + + //! The size of the host_buffer in bytes + uint64_t host_buffer_size; +#endif + //! Load nbyte bytes of sampled data from the device into buffer /*! If the data stored on the device contains information other than the sampled data, this method should be overloaded and the additional information should be filtered out. */ virtual int64_t load_bytes (unsigned char* buffer, uint64_t nbytes); + +#if HAVE_CUDA + virtual int64_t load_bytes_device (unsigned char* buffer, uint64_t bytes, void * device_handle); +#endif //! Set the file pointer to the absolute number of sampled data bytes /*! 
If the header_bytes attribute is set, this number of bytes diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FloatUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FloatUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FloatUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FloatUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FloatUnpacker.h,v $ - $Revision: 1.1 $ - $Date: 2011/08/01 10:07:00 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/FloatUnpacker.h #ifndef __FloatUnpacker_h #define __FloatUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitTwo.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitTwo.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitTwo.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitTwo.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FourBitTwo.h,v $ - $Revision: 1.3 $ - $Date: 2009/10/30 00:15:03 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/FourBitTwo.h #ifndef __FourBitTwo_h #define __FourBitTwo_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/FourBitUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/FourBitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/FourBitUnpacker.h,v $ - $Revision: 1.9 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// 
dspsr/Kernel/Classes/dsp/FourBitUnpacker.h #ifndef __FourBitUnpacker_h #define __FourBitUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericEightBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericEightBitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericEightBitUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericEightBitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/GenericEightBitUnpacker.h,v $ - $Revision: 1.1 $ - $Date: 2012/03/21 09:19:09 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/GenericEightBitUnpacker.h #ifndef __GenericEightBitUnpacker_h #define __GenericEightBitUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericFourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericFourBitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/GenericFourBitUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/GenericFourBitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,30 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2017 by Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __GenericFourBitUnpacker_h +#define __GenericFourBitUnpacker_h + +#include "dsp/FourBitUnpacker.h" + +namespace dsp +{ + //! Converts single-dish GMRT data from 4-bit to floating point values + class GenericFourBitUnpacker: public FourBitUnpacker + { + public: + + //! Constructor initializes bit table + GenericFourBitUnpacker (); + + //! 
Return true if this unpacker can handle the observation + bool matches (const Observation*); + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasInput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasInput.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasInput.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasInput.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HasInput.h,v $ - $Revision: 1.3 $ - $Date: 2011/07/26 12:40:30 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/HasInput.h #ifndef __dsp_HasInput_h #define __dsp_HasInput_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasOutput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasOutput.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HasOutput.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HasOutput.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HasOutput.h,v $ - $Revision: 1.3 $ - $Date: 2010/09/16 17:45:34 $ - $Author: demorest $ */ +// dspsr/Kernel/Classes/dsp/HasOutput.h #ifndef __dsp_HasOutput_h #define __dsp_HasOutput_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/HistUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HistUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/HistUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/HistUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/HistUnpacker.h,v $ - $Revision: 1.17 $ - $Date: 2010/05/28 14:13:15 $ - $Author: straten $ 
*/ +// dspsr/Kernel/Classes/dsp/HistUnpacker.h #ifndef __HistUnpacker_h #define __HistUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Input.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Input.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Input.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Input.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Input.h,v $ - $Revision: 1.47 $ - $Date: 2011/09/20 20:20:26 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Input.h #ifndef __Input_h #define __Input_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/IOManager.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/IOManager.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/IOManager.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/IOManager.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/IOManager.h,v $ - $Revision: 1.33 $ - $Date: 2011/08/04 21:04:38 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/IOManager.h #ifndef __IOManager_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/MemoryCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MemoryCUDA.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/MemoryCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MemoryCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -27,7 +27,8 @@ class DeviceMemory : public dsp::Memory { public: - DeviceMemory (cudaStream_t _stream = 0) { stream = _stream; } + + DeviceMemory (cudaStream_t _stream = 0, int _device = 0); void* do_allocate (size_t nbytes); void do_free (void*); @@ -35,10 +36,17 @@ void do_zero (void*, size_t); bool on_host () const { 
return false; } + void set_stream (cudaStream_t _stream) { stream = _stream; } cudaStream_t get_stream () { return stream; } + cudaStream_t get_stream () const { return stream; } + + int get_device () { return device; }; + int get_device () const { return device; }; + protected: cudaStream_t stream; + int device; }; class SharedPinnedMemory : public dsp::Memory diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Memory.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Memory.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Memory.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Memory.h 2018-03-12 23:02:35.000000000 +0000 @@ -10,7 +10,7 @@ #define __dsp_Memory_h_ #include "Reference.h" -#include +#include "environ.h" namespace dsp { diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/MultiFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MultiFile.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/MultiFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/MultiFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/MultiFile.h,v $ - $Revision: 1.29 $ - $Date: 2011/09/20 20:20:31 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/MultiFile.h #ifndef __MultiFile_h @@ -51,6 +48,9 @@ File* get_loader (); const File* get_loader () const; + //! Access to current file objects + std::vector< Reference::To >& get_files () {return files;} + //! 
Return true if the loader File instance is set bool has_loader (); diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/NLowLookup.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/NLowLookup.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/NLowLookup.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/NLowLookup.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/NLowLookup.h,v $ - $Revision: 1.2 $ - $Date: 2008/07/13 00:38:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/NLowLookup.h #ifndef __NLowLookup_h #define __NLowLookup_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/ObservationInterface.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ObservationInterface.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/ObservationInterface.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/ObservationInterface.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/ObservationInterface.h,v $ - $Revision: 1.1 $ - $Date: 2012/01/19 21:46:11 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/ObservationInterface.h #ifndef __dsp_ObservationTI_h #define __dsp_ObservationTI_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Operation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Operation.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Operation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Operation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Operation.h,v $ - $Revision: 1.49 $ - $Date: 2010/02/04 09:15:10 $ 
- $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Operation.h #ifndef __Operation_h #define __Operation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/OutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/OutputFile.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/OutputFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/OutputFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/OutputFile.h,v $ - $Revision: 1.2 $ - $Date: 2011/09/19 01:56:42 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/OutputFile.h #ifndef __OutputFile_h #define __OutputFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/PrestoObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/PrestoObservation.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/PrestoObservation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/PrestoObservation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/PrestoObservation.h,v $ - $Revision: 1.2 $ - $Date: 2009/03/03 05:29:30 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/PrestoObservation.h #ifndef __PrestoObservation_h #define __PrestoObservation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Scratch.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Scratch.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Scratch.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Scratch.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Scratch.h,v $ - $Revision: 1.5 $ - $Date: 
2010/01/21 23:36:23 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Scratch.h #ifndef __dsp_Scratch_h #define __dsp_Scratch_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Seekable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Seekable.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Seekable.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Seekable.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,15 +6,16 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Seekable.h,v $ - $Revision: 1.15 $ - $Date: 2010/06/04 03:36:31 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Seekable.h +#ifdef HAVE_CONFIG_H +#include +#endif #ifndef __Seekable_h #define __Seekable_h +#include "dsp/Memory.h" #include "dsp/Input.h" namespace dsp { @@ -44,9 +45,15 @@ //! Inquire current time sample virtual uint64_t get_current_sample() { return current_sample; } + //! Set the bits series into which data will be loaded + void set_output (BitSeries* data); + //! Buffer used to store overlap (useful in multi-threaded applications) void set_overlap_buffer (BitSeries*); + //! Set the memory type used in the overlap buffer + void set_overlap_buffer_memory (Memory * memory); + protected: //! set end_of_data @@ -57,6 +64,11 @@ //! Load data from device and return the number of bytes read. virtual int64_t load_bytes (unsigned char* buffer, uint64_t bytes) = 0; + +#ifdef HAVE_CUDA + //! Load data from device to device memory and return the number of bytes read. + virtual int64_t load_bytes_device (unsigned char* buffer, uint64_t bytes, void * dev_handle) = 0; +#endif //! 
Seek to absolute position and return absolute position in bytes virtual int64_t seek_bytes (uint64_t bytes) = 0; diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Sink.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Sink.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Sink.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Sink.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Sink.h,v $ - $Revision: 1.2 $ - $Date: 2009/06/07 01:22:34 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Sink.h #ifndef __dsp_Sink_h #define __dsp_Sink_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/StepIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/StepIterator.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/StepIterator.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/StepIterator.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/StepIterator.h,v $ - $Revision: 1.2 $ - $Date: 2008/07/13 00:38:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/StepIterator.h #ifndef __dsp_StepIterator_h #define __dsp_StepIterator_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/SubByteTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/SubByteTwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/SubByteTwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/SubByteTwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/SubByteTwoBitCorrection.h,v $ - $Revision: 1.9 $ - 
$Date: 2010/05/11 06:21:12 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/SubByteTwoBitCorrection.h #ifndef __SubByteTwoBitCorrection_h #define __SubByteTwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TestInput.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TestInput.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TestInput.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TestInput.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TestInput.h,v $ - $Revision: 1.3 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TestInput.h #ifndef __TestInput_h #define __TestInput_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeriesCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeriesCUDA.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeriesCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeriesCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,100 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __TimeSeriesEngine_h +#define __TimeSeriesEngine_h + +#include "dsp/TimeSeries.h" +#include "dsp/MemoryCUDA.h" + +#include + +namespace CUDA +{ + class TimeSeriesEngine : public dsp::TimeSeries::Engine + { + public: + + //! Default constructor + TimeSeriesEngine (dsp::Memory * _memory); + + //! 
Copy constructor + //TimeSeriesEngine (const TimeSeriesEngine& tse); + + ~TimeSeriesEngine (); + + //TimeSeriesEngine& operator = (const TimeSeriesEngine& copy); + + void prepare (dsp::TimeSeries * parent); + + void prepare_buffer (unsigned nbytes); + + void copy_data_fpt (const dsp::TimeSeries * copy, + uint64_t idat_start = 0, + uint64_t ndat = 0); + + void copy_data_fpt_same_stream (const dsp::TimeSeries * from, + uint64_t idat_start, uint64_t ndat); + + void copy_data_fpt_same_device (const dsp::TimeSeries * from, + uint64_t idat_start, uint64_t ndat); + + void copy_data_fpt_diff_device (const dsp::TimeSeries * from, + uint64_t idat_start, uint64_t ndat); + + void copy_data_fpt_kernel_multidim (float * to, const float * from, + uint64_t to_stride, uint64_t from_stride, + uint64_t idat_start, uint64_t ndat, cudaStream_t stream); + + void * buffer; + + protected: + + dsp::TimeSeries * to; + + CUDA::DeviceMemory * memory; + + CUDA::PinnedMemory * pinned_memory; + + void * host_buffer; + + size_t host_buffer_size; + + size_t buffer_size; + + unsigned nchan; + + unsigned npol; + + unsigned ndim; + + uint64_t ichanpol_stride; + + uint64_t ochanpol_stride; + + uint64_t bchanpol_stride; + + unsigned nthread; + + dim3 blocks; + + int device; + + cudaStream_t to_stream; + + cudaStream_t from_stream; + + int to_device; + + int from_device; + + }; +} + +#endif // !defined(__TimeSeriesEngine_h) diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeries.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeries.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TimeSeries.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TimeSeries.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TimeSeries.h,v $ - $Revision: 1.55 $ - $Date: 2011/08/04 21:05:19 $ - $Author: straten $ */ +// 
dspsr/Kernel/Classes/dsp/TimeSeries.h #ifndef __TimeSeries_h #define __TimeSeries_h @@ -152,6 +149,12 @@ void set_match (TimeSeries*); + class Engine; + + void set_engine (Engine*); + + Engine * get_engine () const { return engine; }; + protected: //! Returns a uchar pointer to the first piece of data @@ -179,6 +182,8 @@ // do the work of the null_clone: copy necessary attributes from the given TimeSeries void null_work (const TimeSeries* from); + Reference::To engine; + private: //! Order of the dimensions @@ -202,7 +207,21 @@ }; - + + class TimeSeries::Engine : public OwnStream + { + public: + + virtual void prepare (dsp::TimeSeries * to) = 0; + + virtual void prepare_buffer (unsigned nbytes) = 0; + + virtual void copy_data_fpt (const dsp::TimeSeries * copy, + uint64_t idat_start = 0, + uint64_t ndat = 0) = 0; + + }; + } #endif diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Transformation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Transformation.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Transformation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Transformation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Transformation.h,v $ - $Revision: 1.54 $ - $Date: 2011/08/26 22:02:53 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Transformation.h #ifndef __dsp_Transformation_h #define __dsp_Transformation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBit1or2.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBit1or2.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBit1or2.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBit1or2.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: 
/cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBit1or2.h,v $ - $Revision: 1.6 $ - $Date: 2010/05/28 14:13:32 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBit1or2.h #ifndef __TwoBit1or2_h #define __TwoBit1or2_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitCorrection.h,v $ - $Revision: 1.44 $ - $Date: 2010/05/11 06:21:17 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBitCorrection.h #ifndef __TwoBitCorrection_h #define __TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitFour.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitFour.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitFour.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitFour.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitFour.h,v $ - $Revision: 1.7 $ - $Date: 2010/05/11 06:22:25 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBitFour.h #ifndef __TwoBitFour_h #define __TwoBitFour_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitLookup.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitLookup.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitLookup.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitLookup.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitLookup.h,v $ - $Revision: 1.1 $ - $Date: 2008/07/17 01:17:33 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBitLookup.h #ifndef __TwoBitLookup_h #define __TwoBitLookup_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitMask.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitMask.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitMask.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitMask.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitMask.h,v $ - $Revision: 1.3 $ - $Date: 2008/07/13 00:38:54 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBitMask.h #ifndef __TwoBitMask_h #define __TwoBitMask_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitTable.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/TwoBitTable.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/TwoBitTable.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/TwoBitTable.h,v $ - $Revision: 1.15 $ - $Date: 2009/07/31 12:23:13 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/TwoBitTable.h #ifndef __TwoBitTable_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Unpacker.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/Unpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/Unpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Unpacker.h,v $ - $Revision: 1.32 $ - $Date: 2012/02/24 20:47:06 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/Unpacker.h #ifndef __Unpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/UnpackerIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/UnpackerIterator.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/UnpackerIterator.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/UnpackerIterator.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/UnpackerIterator.h,v $ - $Revision: 1.1 $ - $Date: 2008/09/09 06:34:07 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/UnpackerIterator.h #ifndef __UnpackerIterator_h #define __UnpackerIterator_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/dsp/WeightedTimeSeries.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/WeightedTimeSeries.h --- bl-dspsr-0+git20160405/Kernel/Classes/dsp/WeightedTimeSeries.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/dsp/WeightedTimeSeries.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/WeightedTimeSeries.h,v $ - $Revision: 1.15 $ - $Date: 2011/08/04 21:05:36 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/WeightedTimeSeries.h #ifndef __WeightedTimeSeries_h #define __WeightedTimeSeries_h diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/DummyFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DummyFile.C --- bl-dspsr-0+git20160405/Kernel/Classes/DummyFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/DummyFile.C 2018-03-12 
23:02:35.000000000 +0000 @@ -56,6 +56,19 @@ // Read obs info from ASCII file info = new ASCIIObservation(header); + + if (ascii_header_get (header, "RESOLUTION", "%u", &resolution) < 0) + resolution = 1; + + // the resolution is the _byte_ resolution; convert to _sample_ resolution + if (verbose) + cerr << "dsp::DummyFile::open_file byte_resolution=" << resolution << endl; + resolution = info->get_nsamples (resolution); + if (verbose) + cerr << "dsp::DummyFile::open_file sample_resolution=" << resolution << endl; + if (resolution == 0) + resolution = 1; + } void dsp::DummyFile::close () diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/environ.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/environ.h --- bl-dspsr-0+git20160405/Kernel/Classes/environ.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/environ.h 2018-03-12 23:02:35.000000000 +0000 @@ -4,10 +4,7 @@ * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/environ.h,v $ - $Revision: 1.1 $ - $Date: 2009/06/18 00:05:05 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/environ.h /* * Use the standard C integer types diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/File.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/File.C --- bl-dspsr-0+git20160405/Kernel/Classes/File.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/File.C 2018-03-12 23:02:35.000000000 +0000 @@ -23,6 +23,10 @@ #include #include +#if HAVE_CUDA +#include +#endif + using namespace std; using std::cerr; @@ -51,6 +55,11 @@ current_filename = ""; +#if HAVE_CUDA + host_buffer = 0; + host_buffer_size = 0; +#endif + get_info()->init(); } @@ -200,6 +209,70 @@ return bytes_read; } +#if HAVE_CUDA +int64_t dsp::File::load_bytes_device (unsigned char* buffer, uint64_t bytes, void * device_handle) +{ + cudaStream_t stream = (cudaStream_t) 
device_handle; + + if (verbose) + cerr << "dsp::File::load_bytes_device (" << (void *) buffer << ", " + << bytes << ", " << (void *) stream << ")" << endl; + + cudaError_t result; + + // ensure the host CPU buffer is large enough + if (bytes > host_buffer_size) + { + if (host_buffer) + { + if (result != cudaSuccess) + throw Error (InvalidState, "dsp::File::load_bytes_device", + "failed to synchronize cuda stream prior to buffer enlargement: %s", + cudaGetErrorString (result)); + + result = cudaFreeHost (host_buffer); + if (result != cudaSuccess) + throw Error (InvalidState, "dsp::File::load_bytes_device", + "cudaFreeHost (host_buffer) failed: %s", + cudaGetErrorString (result)); + } + + if (verbose) + cerr << "dsp::File::load_bytes_device cudaMallocHost() " << bytes + << "bytes for host_buffer" << endl; + result = cudaMallocHost (&host_buffer, bytes); + if (result != cudaSuccess) + throw Error (InvalidState, "dsp::File::load_bytes_device", + "cudaMallocHost (host_buffer, %"PRIu64") failed: %s", + bytes, cudaGetErrorString (result)); + host_buffer_size = bytes; + } + + if (verbose) + cerr << "dsp::File::load_bytes_device load_bytes(" << (void *) host_buffer + << ", " << bytes << ")" << endl; + + // load the data from device to the host buffer + int64_t bytes_read = load_bytes ((unsigned char *) host_buffer, bytes); + + if (bytes_read > 0) + { + if (verbose) + cerr << "dsp::File::load_bytes_device cudaMemcpyAsync (" + << (void *) buffer << ", " << (void *) host_buffer + << ", " << bytes << ", cudaMemcpyHostToDevice, " + << (void *) stream << ")" << endl; + result = cudaMemcpyAsync (buffer, host_buffer, bytes, cudaMemcpyHostToDevice, stream); + if (result != cudaSuccess) + throw Error (InvalidState, "dsp::File::load_bytes_device", + "cudaMemcpyAsync (%p, %p, %"PRIu64") failed: %s", + (void *) buffer, host_buffer, bytes, cudaGetErrorString (result)); + cudaStreamSynchronize(stream); + } + return bytes_read; +} +#endif + //! 
Adjust the file pointer int64_t dsp::File::seek_bytes (uint64_t bytes) { diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -26,7 +26,12 @@ dsp::GenericEightBitUnpacker::GenericEightBitUnpacker () : EightBitUnpacker ("GenericEightBitUnpacker") { +#define ASSUME_TWOS_COMPLEMENT 1 +#if ASSUME_TWOS_COMPLEMENT table = new BitTable (8, BitTable::TwosComplement); +#else + table = new BitTable (8, BitTable::OffsetBinary); +#endif gpu_stream = undefined_stream; } diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpackerCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Classes/GenericEightBitUnpackerCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericEightBitUnpackerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -13,7 +13,7 @@ using namespace std; -void check_error (const char*); +void check_error_stream (const char*, cudaStream_t); /* * Simple CUDA 8-bit unpack kernel @@ -102,6 +102,6 @@ "unknown BitTable::Type"); if (dsp::Operation::record_time || dsp::Operation::verbose) - check_error ("generic_8bit_unpack"); + check_error_stream ("generic_8bit_unpack", stream); } diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/GenericFourBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericFourBitUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Classes/GenericFourBitUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/GenericFourBitUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,35 @@ +/*************************************************************************** + * + * 
Copyright (C) 2017 by Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/GenericFourBitUnpacker.h" +#include "dsp/BitTable.h" + +#include +using namespace std; + +dsp::GenericFourBitUnpacker::GenericFourBitUnpacker () + : FourBitUnpacker ("GenericFourBitUnpacker") +{ +#define ASSUME_TWOS_COMPLEMENT 1 +#if ASSUME_TWOS_COMPLEMENT + BitTable* table = new BitTable (4, BitTable::TwosComplement); +#else + BitTable* table = new BitTable (4, BitTable::OffsetBinary); +#endif + table->set_order( BitTable::LeastToMost ); + set_table( table ); +} + +bool dsp::GenericFourBitUnpacker::matches (const Observation* observation) +{ + if (verbose) + cerr << "dsp::GenericUnpacker::matches" + " machine=" << observation->get_machine() << + " nbit=" << observation->get_nbit() << endl; + + return observation->get_nbit() == 4; +} diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Input.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Input.C --- bl-dspsr-0+git20160405/Kernel/Classes/Input.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Input.C 2018-03-12 23:02:35.000000000 +0000 @@ -66,7 +66,8 @@ maximum_load_size += 2 * resolution; if (verbose) - cerr << "dsp::Input::reserve " << maximum_load_size << endl; + cerr << "dsp::Input::reserve block_size=" << block_size + << " maximum_load_size=" << maximum_load_size << endl; buffer->resize (maximum_load_size); buffer->resize (block_size); diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/IOManager.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/IOManager.C --- bl-dspsr-0+git20160405/Kernel/Classes/IOManager.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/IOManager.C 2018-03-12 23:02:35.000000000 +0000 @@ -375,13 +375,15 @@ if (verbose) cerr << "dsp::IOManager::set_block_size required block_size=" << block_size << endl; + if (verbose) + cerr 
<< "dsp::IOManager::set_block_size minimum_RAM=" << minimum_RAM << " nbyte_dat=" << nbyte_dat << endl; if (minimum_RAM) { uint64_t size = (uint64_t(minimum_RAM/nbyte_dat)/resolution) * resolution; if (verbose) cerr << "dsp::IOManager::set_block_size" - " minimum block_size=" << size << endl; + " minimum block_size=" << size << endl; block_size = std::max (block_size, size); } @@ -405,12 +407,13 @@ cerr << "dsp::IOManager::set_block_size insufficient RAM" << endl; throw Error (InvalidState, "dsp::IOManager::set_block_size", - "insufficient RAM: limit=%g MB -> block="UI64" samples\n\t" - "require="UI64" samples -> \"-U %g\" on command line", - float(maximum_RAM)/megabyte, block_size, - minimum_samples, min_ram/megabyte); + "insufficient RAM: limit=%g MB -> block="UI64" samples\n\t" + "require="UI64" samples -> \"-U %g\" on command line", + float(maximum_RAM)/megabyte, block_size, + minimum_samples, min_ram/megabyte); } + // input overlap incorporates overlapping blocks of input data if (input->get_overlap()) { unsigned overlap = input->get_overlap(); @@ -419,10 +422,11 @@ double parts = (block_size - overlap) / stride; if (verbose) - cerr << "dsp::IOManager::set_block_size input" - " overlap=" << overlap << " parts=" << parts << endl; + cerr << "dsp::IOManager::set_block_size block_size=" << block_size + << " overlap=" << overlap << " parts=" << parts << endl; - uint64_t block_resize = unsigned(parts)*(minimum_samples-overlap) + overlap; + uint64_t block_resize = unsigned(parts)*(minimum_samples - overlap)+ overlap; + cerr << "dsp::IOManager::set_block_size block_resize=" << block_resize << endl; if (filterbank_resolution) { @@ -431,17 +435,17 @@ unsigned best_npart = 0; while (trial_block_size < block_size) { - double trial_parts = (trial_block_size-overlap) / stride; - if (trial_parts == unsigned(trial_parts)) - best_npart = trial_block_size / filterbank_resolution; + double trial_parts = (trial_block_size-overlap) / stride; + if (trial_parts == 
unsigned(trial_parts)) + best_npart = trial_block_size / filterbank_resolution; - trial_block_size += filterbank_resolution; + trial_block_size += filterbank_resolution; } if (best_npart == 0) - throw Error (InvalidState, "dsp::IOManager::set_block_size", - "could not find an overlapping block size " - "for both Filterbank and Convolution"); + throw Error (InvalidState, "dsp::IOManager::set_block_size", + "could not find an overlapping block size " + "for both Filterbank and Convolution"); // WvS to-do: if filterbank also loses samples, then add nlost here block_resize = best_npart * filterbank_resolution; diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Classes/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -29,7 +29,9 @@ dsp/HasOutput.h dsp/Sink.h dsp/Multiplex.h \ dsp/Memory.h debug.h dsp/OperationThread.h dsp/FloatUnpacker.h \ dsp/UniversalInputBuffering.h dsp/OutputFile.h \ - dsp/ObservationInterface.h dsp/GenericEightBitUnpacker.h \ + dsp/ObservationInterface.h \ + dsp/GenericEightBitUnpacker.h \ + dsp/GenericFourBitUnpacker.h \ dsp/CommandLineHeader.h dsp/OutputFileShare.h libClasses_la_SOURCES = ascii_header.c ASCIIObservation.C \ @@ -48,18 +50,21 @@ UnpackerIterator.C ObservationChange.C PrestoObservation.C \ CloneArchive.C SignalPath.C Multiplex.C Memory.C \ OperationThread.C FloatUnpacker.C OutputFile.C \ - ObservationInterface.C GenericEightBitUnpacker.C \ + ObservationInterface.C \ + GenericEightBitUnpacker.C \ + GenericFourBitUnpacker.C \ CommandLineHeader.C OutputFileShare.C if HAVE_MPI libClasses_la_SOURCES += MPIRoot.C MPITrans.C MPIServer.C mpi_Observation.C endif -if HAVE_CUFFT -nobase_include_HEADERS += dsp/MemoryCUDA.h dsp/GenericEightBitUnpackerCUDA.h -libClasses_la_SOURCES += MemoryCUDA.C GenericEightBitUnpackerCUDA.cu \ - check_error.C 
-libClasses_la_LIBADD = @CUFFT_LIBS@ @CUDA_LIBS@ +if HAVE_CUDA +nobase_include_HEADERS += dsp/MemoryCUDA.h dsp/GenericEightBitUnpackerCUDA.h \ + dsp/TimeSeriesCUDA.h +libClasses_la_SOURCES += MemoryCUDA.C check_error.C GenericEightBitUnpackerCUDA.cu \ + TimeSeriesCUDA.cu +libClasses_la_LIBADD = @CUDA_LIBS@ endif check_PROGRAMS = test_BlockIterator test_environ @@ -73,5 +78,5 @@ LDADD = libClasses.la -AM_CPPFLAGS += @CUFFT_CFLAGS@ +AM_CPPFLAGS += @CUDA_CFLAGS@ @PSRDADA_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Memory.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Memory.C --- bl-dspsr-0+git20160405/Kernel/Classes/Memory.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Memory.C 2018-03-12 23:02:35.000000000 +0000 @@ -16,18 +16,19 @@ void* dsp::Memory::do_allocate (size_t nbytes) { - DEBUG("dsp::Memory::allocate (" << nbytes << ")"); + DEBUG("dsp::Memory::do_allocate (" << nbytes << ")"); return malloc16 (nbytes); } void dsp::Memory::do_free (void* ptr) { - DEBUG("dsp::Memory::free (" << ptr << ")"); + DEBUG("dsp::Memory::do_free (" << ptr << ")"); free16 (ptr); } void dsp::Memory::do_zero (void* ptr, size_t nbytes) { + DEBUG("dsp::Memory::do_zero (" << (void*) ptr << "," << nbytes << ")"); memset (ptr, 0, nbytes); } @@ -39,11 +40,13 @@ void* dsp::Memory::allocate (size_t nbytes) { + DEBUG("dsp::Memory::allocate (" << nbytes << ")"); return get_manager()->do_allocate (nbytes); } void dsp::Memory::free (void* ptr) { + DEBUG("dsp::Memory::free (" << (void*) ptr << ")"); get_manager()->do_free (ptr); } diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/MemoryCUDA.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/MemoryCUDA.C --- bl-dspsr-0+git20160405/Kernel/Classes/MemoryCUDA.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/MemoryCUDA.C 2018-03-12 23:02:35.000000000 +0000 @@ -5,8 +5,6 @@ * ***************************************************************************/ -// 
#define _DEBUG 1 - #include "dsp/MemoryCUDA.h" #include "debug.h" @@ -46,6 +44,12 @@ * ***************************************************************************/ +CUDA::DeviceMemory::DeviceMemory (cudaStream_t _stream, int _device) +{ + stream = _stream; + device = _device; +} + void* CUDA::DeviceMemory::do_allocate (size_t nbytes) { DEBUG("CUDA::DeviceMemory::allocate cudaMalloc (" << nbytes << ")"); diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/Seekable.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Seekable.C --- bl-dspsr-0+git20160405/Kernel/Classes/Seekable.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/Seekable.C 2018-03-12 23:02:35.000000000 +0000 @@ -5,6 +5,10 @@ * ***************************************************************************/ +#ifdef HAVE_CONFIG_H +#include +#endif + #include "dsp/Seekable.h" #include "dsp/BitSeries.h" @@ -13,6 +17,11 @@ #include +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include +#endif + using namespace std; //! 
Constructor @@ -75,26 +84,26 @@ { if (verbose) cerr << "dsp::Seekable::load_data total ndat=" << get_info()->get_ndat() - << " read_sample=" << read_sample << endl; + << " read_sample=" << read_sample << endl; if (read_sample > get_info()->get_ndat()) throw Error (InvalidState, "dsp::Seekable::load_data", - "read_sample="UI64" > ndat="UI64 "\n\t" - "recycled="UI64" load_sample="UI64, - read_sample, get_info()->get_ndat(), - recycled, get_load_sample()); + "read_sample="UI64" > ndat="UI64 "\n\t" + "recycled="UI64" load_sample="UI64, + read_sample, get_info()->get_ndat(), + recycled, get_load_sample()); uint64_t samples_left = get_info()->get_ndat() - read_sample; if (verbose) cerr << "dsp::Seekable::load_data " << samples_left - << " samples remaining" << endl; + << " samples remaining" << endl; if (samples_left <= read_size) { if (verbose) - cerr << "dsp::Seekable::load_data end of data read_size=" - << samples_left << endl; + cerr << "dsp::Seekable::load_data end of data read_size=" + << samples_left << endl; read_size = samples_left; end_of_data = true; @@ -115,7 +124,7 @@ if (verbose) cerr << "dsp::Seekable::load_data read_sample=" << read_sample << " != current_sample=" << current_sample - << " seek_bytes=" << toseek_bytes << endl; + << " seek_bytes=" << toseek_bytes << endl; int64_t seeked = seek_bytes (toseek_bytes); if (seeked < 0) @@ -124,8 +133,8 @@ // confirm that we be where we expect we be if (read_sample != (uint64_t) data->get_nsamples (seeked)) throw Error (InvalidState, "dsp::Seekable::load_data", "seek mismatch" - " read_sample="UI64" absolute_sample="UI64, - read_sample, data->get_nsamples (seeked)); + " read_sample="UI64" absolute_sample="UI64, + read_sample, data->get_nsamples (seeked)); current_sample = read_sample; } @@ -135,18 +144,42 @@ if (toread_bytes < 1) throw Error (InvalidState, "dsp::Seekable::load_data", - "invalid BitSeries state"); + "invalid BitSeries state"); + + int64_t bytes_read; + +#if HAVE_CUDA + // check if the bit series 
resides in device memory + CUDA::DeviceMemory * device_mem = dynamic_cast(data->get_memory() ); + if (device_mem) + { + cudaStream_t stream = device_mem->get_stream(); + if (verbose) + cerr << "dsp::Seekable::load_data" + " call load_bytes_device ("<< toread_bytes << ")" < get_load_size()) to_recycle = get_load_size(); @@ -244,13 +277,15 @@ if (verbose) cerr << "dsp::Seekable::recycle_data recycle " << recycle_bytes - << " bytes (offset=" << offset_bytes << " bytes)" << endl; + << " bytes (offset=" << offset_bytes << " bytes)" << endl; unsigned char *into = data->get_rawptr(); unsigned char *rbuf = from->get_rawptr() + offset_bytes; if (overlap_buffer) - memcpy (into, rbuf, size_t(recycle_bytes)); + { + overlap_buffer->get_memory()->do_copy( into, rbuf, size_t(recycle_bytes)); + } else { // perform an "overlap safe" memcpy @@ -262,9 +297,9 @@ while (recycle_bytes) { if (offset_bytes > recycle_bytes) - offset_bytes = recycle_bytes; + offset_bytes = recycle_bytes; - memcpy (into, rbuf, size_t(offset_bytes)); + from->get_memory()->do_copy (into, rbuf, size_t(offset_bytes)); recycle_bytes -= offset_bytes; into += offset_bytes; @@ -278,9 +313,23 @@ return to_recycle; } +void dsp::Seekable::set_output (BitSeries* data) +{ + Input::set_output (data); +} + void dsp::Seekable::set_overlap_buffer (BitSeries* buffer) { overlap_buffer = buffer; } +void dsp::Seekable::set_overlap_buffer_memory (Memory * memory) +{ + if (verbose) + cerr << "dsp::Seekable::set_overlap_buffer_memory()" << endl; + if (!overlap_buffer) + set_overlap_buffer( new BitSeries ); + overlap_buffer->set_memory( memory ); +} + diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/TimeSeries.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeries.C --- bl-dspsr-0+git20160405/Kernel/Classes/TimeSeries.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeries.C 2018-03-12 23:02:35.000000000 +0000 @@ -6,9 +6,15 @@ * 
***************************************************************************/ +#include "config.h" #include "dsp/TimeSeries.h" #include "dsp/Memory.h" +#ifdef HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include "dsp/TimeSeriesCUDA.h" +#endif + #include "fsleep.h" #include "Error.h" @@ -45,6 +51,7 @@ reserve_nfloat = 0; input_sample = -1; zeroed_data = false; + engine = 0; } dsp::TimeSeries* dsp::TimeSeries::clone () const @@ -65,7 +72,16 @@ void dsp::TimeSeries::null_work (const TimeSeries* from) { order = from->order; + +#ifdef HAVE_CUDA memory = from->memory; + if (from->engine) + { + set_engine (new CUDA::TimeSeriesEngine(memory)); + } +#else + memory = from->memory; +#endif } dsp::TimeSeries::~TimeSeries() @@ -139,8 +155,12 @@ << int64_t((data-(float*)buffer)) << endl; } - uint64_t fake_ndat = reserve_nfloat / get_ndim(); - if (reserve_nfloat % get_ndim()) + uint64_t reserve_step = get_ndim(); + if (order == OrderTFP) + reserve_step *= get_nchan() * get_npol(); + + uint64_t fake_ndat = reserve_nfloat / reserve_step; + if (reserve_nfloat % reserve_step) fake_ndat ++; if (verbose) @@ -459,13 +479,20 @@ switch (order) { case OrderFPT: - for (unsigned ichan=0; ichancopy_data_fpt (copy, idat_start, copy_ndat); + } + else { - for (unsigned ipol=0; ipolget_datptr(ichan,ipol) + offset; - memory->do_copy (to, from, size_t(byte_count)); + for (unsigned ipol=0; ipolget_datptr(ichan,ipol) + offset; + memory->do_copy (to, from, size_t(byte_count)); + } } } break; @@ -710,3 +737,10 @@ non_finite, nfloat * nchan * npol); } +void dsp::TimeSeries::set_engine( Engine* _engine ) +{ + engine = _engine; + engine->prepare (this); +} + + diff -Nru bl-dspsr-0+git20160405/Kernel/Classes/TimeSeriesCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeriesCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Classes/TimeSeriesCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Classes/TimeSeriesCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,261 @@ 
+//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/TimeSeriesCUDA.h" +#include "dsp/MemoryCUDA.h" + +#include "Error.h" + +void check_error_stream (const char*, cudaStream_t); + +using namespace std; + +template +__global__ void copy_data_fpt_kernel(T * to, T * from, + uint64_t to_stride, uint64_t from_stride, + uint64_t ndat) +{ + uint64_t dx = blockIdx.x * blockDim.x + threadIdx.x; + if (dx >= ndat) + return; + to[blockIdx.y * to_stride + dx] = from[blockIdx.y * from_stride + dx]; +} + +CUDA::TimeSeriesEngine::TimeSeriesEngine (dsp::Memory * _memory) +{ + memory = dynamic_cast(_memory); + buffer = NULL; + buffer_size = 0; + + pinned_memory = new CUDA::PinnedMemory; + host_buffer = NULL; + host_buffer_size = 0; +} + +CUDA::TimeSeriesEngine::~TimeSeriesEngine () +{ + if (buffer) + memory->do_free (buffer); + buffer = 0; +} + +void CUDA::TimeSeriesEngine::prepare (dsp::TimeSeries * parent) +{ + to = parent; +} + +void CUDA::TimeSeriesEngine::prepare_buffer (unsigned nbytes) +{ + if (nbytes > buffer_size) + { + if (buffer) + memory->do_free (buffer); + buffer_size = nbytes; + buffer = memory->do_allocate (buffer_size); + memory->do_zero(buffer, buffer_size); + } +} + +// copy data from another time series to this time series +void CUDA::TimeSeriesEngine::copy_data_fpt (const dsp::TimeSeries* from, + uint64_t idat_start, uint64_t ndat) +{ + nchan = to->get_nchan(); + npol = to->get_npol(); + ndim = to->get_ndim(); + + // current cuda device that is executing this function + cudaGetDevice (&device); + +#ifdef _DEBUG + cerr << "CUDA::TimeSeriesEngine::copy_data_fpt from=" << (void *) from + << " idat_start=" << idat_start << " ndat=" << ndat << " device=" << device << endl; +#endif + + // stream and device upon which to TSE exists 
+ to_stream = memory->get_stream(); + to_device = memory->get_device(); + + // stream and device upon which from TSE exists + const CUDA::DeviceMemory * from_mem = dynamic_cast( from->get_memory()); + from_stream = from_mem->get_stream(); + from_device = from_mem->get_device(); + + if (!from_mem) + throw Error (FailedSys, "CUDA::TimeSeriesEngine::copy_data_fpt", "From TimeSeries did not use DeviceMemory"); + + ichanpol_stride = 0; + ochanpol_stride = 0; + bchanpol_stride = ndat; + + if (npol > 1) + { + ochanpol_stride = to->get_datptr (0,1) - to->get_datptr (0,0); + ichanpol_stride = from->get_datptr (0,1) - from->get_datptr (0,0); + } + else if (nchan > 1) + { + ochanpol_stride = to->get_datptr (1,0) - to->get_datptr (1,0); + ichanpol_stride = from->get_datptr (1,0) - from->get_datptr (1,0); + } + else + { + ; + } + + ichanpol_stride /= ndim; + ochanpol_stride /= ndim; + +#ifdef _DEBUG + cerr << "CUDA::TimeSeriesEngine::copy_data_fpt streams to=" + << (void*) to_stream << " from=" << (void*) from_stream << endl; + cerr << "CUDA::TimeSeriesEngine::copy_data_fpt device to=" << device + << " from=" << from_device << endl; + cerr << "CUDA::TimeSeriesEngine::copy_data_fpt nchan=" << nchan << " ndim=" << ndim << " npol=" << npol << " ndat=" << ndat << endl; + cerr << "CUDA::TimeSeriesEngine::copy_data_fpt istride=" << ichanpol_stride << " ostride=" << ochanpol_stride << " bstride=" << bchanpol_stride << endl; +#endif + + // configure the kernels + nthread = 1024; + if (nthread > ndat) + nthread = ndat; + blocks = dim3 (ndat / nthread, nchan*npol); + if (ndat % nthread) + blocks.x++; + +#ifdef _DEBUG + cerr << "blocks=(" << blocks.x << "," << blocks.y << ") threads=" << nthread << endl; +#endif + + if (from_device == to_device) + if (from_stream == to_stream) + copy_data_fpt_same_stream (from, idat_start, ndat); + else + copy_data_fpt_same_device (from, idat_start, ndat); + else + copy_data_fpt_diff_device (from, idat_start, ndat); +} + +// if both time series are within 
the same stream +void CUDA::TimeSeriesEngine::copy_data_fpt_same_stream (const dsp::TimeSeries* from, + uint64_t idat_start, uint64_t ndat) +{ + cudaStream_t stream = memory->get_stream(); + copy_data_fpt_kernel_multidim (to->get_datptr (0,0), from->get_datptr (0,0), + ochanpol_stride, ichanpol_stride, + idat_start, ndat, stream); +} + +// if both time series are in different streams, but the same device +void CUDA::TimeSeriesEngine::copy_data_fpt_same_device (const dsp::TimeSeries* from, + uint64_t idat_start, uint64_t ndat) +{ + size_t nbytes = nchan * ndim * npol * ndat * sizeof(float); + + // stream upon which from TSE exists + CUDA::TimeSeriesEngine * from_engine = dynamic_cast(from->get_engine()); + const CUDA::DeviceMemory * from_mem = dynamic_cast( from->get_memory()); + + // ensure the buffers in each time series are allocated + from_engine->prepare_buffer (nbytes); + prepare_buffer (nbytes); + + // copy from -> from_buffer + copy_data_fpt_kernel_multidim ((float *) from_engine->buffer, from->get_datptr (0,0), + bchanpol_stride, ichanpol_stride, + idat_start, ndat, from_stream); + + // copy from_buffer -> to_buffer + cudaMemcpyAsync (buffer, from_engine->buffer, nbytes, cudaMemcpyDeviceToDevice, from_stream); + cudaStreamSynchronize(from_stream); + + // copy buffer -> to + copy_data_fpt_kernel_multidim (to->get_datptr (0,0), (float *) buffer, + ochanpol_stride, bchanpol_stride, + 0, ndat, to_stream); +} + +// if both time series are in different streams, but the same device +void CUDA::TimeSeriesEngine::copy_data_fpt_diff_device (const dsp::TimeSeries* from, + uint64_t idat_start, uint64_t ndat) +{ + size_t nbytes = nchan * ndim * npol * ndat * sizeof(float); + + // if the current device is not the to device, switch and allocate + if (device != to_device) + cudaSetDevice (to_device); + prepare_buffer (nbytes); + + // switch to the from_device + cudaSetDevice (from_device); + + // ensure buffer is allocated + CUDA::TimeSeriesEngine * from_engine = 
dynamic_cast(from->get_engine()); + from_engine->prepare_buffer (nbytes); + + // copy from -> from_buffer + copy_data_fpt_kernel_multidim ((float *) from_engine->buffer, from->get_datptr (0,0), + bchanpol_stride, ichanpol_stride, + idat_start, ndat, from_stream); + + // if the host buffer is too small, allocate some pinned memory + if (host_buffer_size < nbytes) + { + if (host_buffer) + pinned_memory->do_free (host_buffer); + host_buffer = pinned_memory->do_allocate (nbytes); + host_buffer_size = nbytes; + } + + // copy from_buffer -> host_buffer + cudaMemcpyAsync (host_buffer, from_engine->buffer, nbytes, cudaMemcpyDeviceToHost, from_stream); + + // wait for the D2H transfer to complete before continuing + cudaStreamSynchronize (from_stream); + + // switch to the to_device + cudaSetDevice (to_device); + + // copy host_buffer -> to_buffer + cudaMemcpyAsync (buffer, host_buffer, nbytes, cudaMemcpyHostToDevice, to_stream); + + // copy to_buffer -> to + copy_data_fpt_kernel_multidim (to->get_datptr (0,0), (float *) buffer, + ochanpol_stride, bchanpol_stride, + 0, ndat, to_stream); + + if (to_device != device) + cudaSetDevice (device); +} + + +void CUDA::TimeSeriesEngine::copy_data_fpt_kernel_multidim (float * to, const float * from, + uint64_t to_stride, uint64_t from_stride, + uint64_t idat_start, uint64_t ndat, + cudaStream_t stream) +{ + if (ndim == 2) + { + float2 * to_ptr = (float2 *) to; + float2 * from_ptr = (float2 *) from; + copy_data_fpt_kernel<<>> ( + to_ptr, from_ptr + idat_start, to_stride, from_stride, ndat); + } + else + { + float * from_ptr = (float *) from; + copy_data_fpt_kernel<<>> ( + to, from_ptr + idat_start, to_stride, from_stride, ndat); + } +} + + + + + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRIterator.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRIterator.h --- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRIterator.h 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRIterator.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRIterator.h,v $ - $Revision: 1.2 $ - $Date: 2008/07/13 00:38:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/apsr/dsp/APSRIterator.h #ifndef __APSRIterator_h #define __APSRIterator_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h,v $ - $Revision: 1.4 $ - $Date: 2008/07/13 00:38:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/apsr/dsp/APSRTwoBitCorrection.h #ifndef __APSRTwoBitCorrection_h #define __APSRTwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/apsr/dsp/APSRUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/apsr/dsp/APSRUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/apsr/dsp/APSRUnpacker.h,v $ - $Revision: 1.5 $ - $Date: 2009/06/17 10:16:53 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/apsr/dsp/APSRUnpacker.h #ifndef __APSRUnpacker_h #define __APSRUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/asp/dsp/ASPUnpacker.h 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/asp/dsp/ASPUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/asp/dsp/ASPUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/asp/dsp/ASPUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/asp/dsp/ASPUnpacker.h,v $ - $Revision: 1.3 $ - $Date: 2006/07/09 13:27:03 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/asp/dsp/ASPUnpacker.h #ifndef __ASPUnpacker_h #define __ASPUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/BPSRCrossUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/BPSRCrossUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/BPSRCrossUnpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/BPSRCrossUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -1,12 +1,11 @@ /*************************************************************************** * - * Copyright (C) 2008-2014 by Andrew Jameson & Willem van Straten + * Copyright (C) 2008 - 2016 by Andrew Jameson & Willem van Straten * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ #include "dsp/BPSRCrossUnpacker.h" -//#include "dsp/DADABuffer.h" #include "dsp/ASCIIObservation.h" #include "Error.h" @@ -19,6 +18,16 @@ { gain_polx = -1; unpack_ppqq_only = false; + + /* + This constant is an observed approximate mean value of + GAIN_POL1 and GAIN_POL2 and it is applied simply to keep + rescale factors close to unity + */ + reference_gain = 100000.0 / 256.0; + ppqq_scale[0] = 1.0; + ppqq_scale[1] = 1.0; + pq_scale = 1.0; } //! 
Return true if the unpacker support the specified output order @@ -93,44 +102,90 @@ const Input * in = input->get_loader(); const Observation * obs = in->get_info(); const ASCIIObservation * info = dynamic_cast(obs); - if (info) + if (!info) + throw Error (InvalidState, "dsp::BPSRCrossUnpacker::unpack", + "ASCIIObservation required and not available"); + + + // attempt to get the FACTOR_POLX from the header. + // This describes the factor necessary to correct the AB* values + // relative to AA and BB. + + info->custom_header_get ("GAIN_POL1", "%f", &gain_pol1); + info->custom_header_get ("GAIN_POL2", "%f", &gain_pol2); + + // attempt to get the FACTOR_POLX from the header. This completely describes + // the factor necessary to correct the AB* values + try + { + if (info->custom_header_get ("FACTOR_POLX", "%f", &gain_polx) == 1) + { + if (verbose) + cerr << "dsp::BPSRCrossUnpacker::unpack FACTOR_POLX=" + << gain_polx << endl; + } + } + catch (Error& error) { - if (info) + // older method that makes the assumption that the AA and BB are in + // bit window 1. AB* is in bit window 3. The correct calculation is + // gain_polx = polx * 2^11 / (2^8 * (bwx - bw)) + unsigned polx; + if (info->custom_header_get ("GAIN_POLX", "%u", &polx) == 1) { - // attempt to get the FACTOR_POLX from the header. This completely describes - // the factor necessary to correct the AB* values - try + if (polx == 0) { - if (info->custom_header_get ("FACTOR_POLX", "%f", &gain_polx) == 1) - { - if (verbose) - cerr << "dsp::BPSRCrossUnpacker::unpack FACTOR_POLX=" << gain_polx << endl; - } + gain_polx = 1; } - catch (Error& error) + else { - // older method that makes the assumption that the AA and BB are in - // bit window 1. AB* is in bit window 3. 
The correct calculation is - // gain_polx = polx * 2^11 / (2^8 * (bwx - bw)) - unsigned polx; - if (info->custom_header_get ("GAIN_POLX", "%u", &polx) == 1) - { - if (polx == 0) - { - gain_polx = 1; - } - else - { - gain_polx = ((float) polx) / 32; - } - } - if (verbose) - cerr << "dsp::BPSRCrossUnpacker::unpack GAIN_POLX=" << polx << " FACTOR_POLX=" << gain_polx << endl; + gain_polx = ((float) polx) / 32; } } + if (verbose) + cerr << "dsp::BPSRCrossUnpacker::unpack GAIN_POLX=" + << polx << " FACTOR_POLX=" << gain_polx << endl; } - } + // try to read the Bit Window of the PPQQ data + try + { + if (info->custom_header_get ("PPQQ_BW", "%u", &ppqq_bw) == 1) + { + if (verbose) + cerr << "dsp::BPSRCrossUnpacker::unpack PPQQ_BW=" + << ppqq_bw << endl; + } + } + catch (Error& error) + { + ppqq_bw = 1; + if (verbose) + cerr << "dsp::BPSRCrossUnpacker::unpack assuming PPQQ_BW=" + << ppqq_bw << endl; + + } + + // each bit window suppresses by 256 (2^8) + float ppqq_bw_scale = powf (2, 8*ppqq_bw); + if (verbose) + { + cerr << "dsp::BPSRCrossUnpacker::unpack raw GAIN_POL1=" << gain_pol1 + << " GAIN_POL1=" << gain_pol1/ppqq_bw_scale << endl; + cerr << "dsp::BPSRCrossUnpacker::unpack raw GAIN_POL2=" << gain_pol2 + << " GAIN_POL2=" << gain_pol2/ppqq_bw_scale << endl; + } + gain_pol1 /= ppqq_bw_scale; + gain_pol2 /= ppqq_bw_scale; + + float p_scale = reference_gain/gain_pol1; + float q_scale = reference_gain/gain_pol2; + + ppqq_scale[0] = p_scale * p_scale; + ppqq_scale[1] = q_scale * q_scale; + pq_scale = p_scale * q_scale / gain_polx; + } + switch ( output->get_order() ) { case TimeSeries::OrderFPT: @@ -165,7 +220,7 @@ for (unsigned bt = 0; bt < ndat; bt++) { // hist[ *from ] ++; - into[bt] = float( *from ); + into[bt] = float( *from ) * ppqq_scale[ipol]; from += step; } } @@ -174,7 +229,7 @@ for (unsigned bt = 0; bt < ndat; bt++) { if (!unpack_ppqq_only) - into[bt] = float( ((char) *from) ) / gain_polx; + into[bt] = float( ((char) *from) ) * pq_scale; from += step; } } @@ 
-185,7 +240,7 @@ case TimeSeries::OrderTFP: { if (verbose) - cerr << "dsp::BPSRCrossUnpacker::unpack Output order OrderTFP\n" << endl; + cerr << "dsp::BPSRCrossUnpacker::unpack Output order OrderTFP" << endl; const unsigned char* from = input->get_rawptr(); float* into = output->get_dattfp(); @@ -201,6 +256,11 @@ into[2] = float( from[1] ) + 0.5; into[3] = float( from[3] ) + 0.5; + into[0] *= ppqq_scale[0]; + into[1] *= ppqq_scale[1]; + into[2] *= ppqq_scale[0]; + into[3] *= ppqq_scale[1]; + into += 4; from += 8; } @@ -218,10 +278,14 @@ into[6] = float( ((char) from[6]) ) + 0.5; into[7] = float( ((char) from[7]) ) + 0.5; - into[2] /= gain_polx; - into[3] /= gain_polx; - into[6] /= gain_polx; - into[7] /= gain_polx; + into[0] *= ppqq_scale[0]; + into[1] *= ppqq_scale[1]; + into[2] *= pq_scale; + into[3] *= pq_scale; + into[4] *= ppqq_scale[0]; + into[5] *= ppqq_scale[1]; + into[6] *= pq_scale; + into[7] *= pq_scale; into += 8; from += 8; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRCrossUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -44,9 +44,20 @@ unsigned get_output_ipol (unsigned idig) const; float gain_polx; + float gain_pol1; + float gain_pol2; + unsigned ppqq_bw; private: + float reference_gain; + + float ppqq_scale[2]; + + float pq_scale; + + private: + bool unpack_ppqq_only; }; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h,v $ - $Revision: 1.4 $ - $Date: 2008/10/08 23:19:06 $ - $Author: sixbynine $ */ +// dspsr/Kernel/Formats/bpsr/dsp/BPSRUnpacker.h #ifndef __BPSRUnpacker_h #define __BPSRUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRSingleUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,273 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2009 + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "dsp/CASPSRSingleUnpacker.h" +#include "dsp/BitTable.h" + +#include "Error.h" + +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include "dsp/CASPSRUnpackerCUDA.h" +#include +#endif + +#include + +using namespace std; + +static void* const undefined_stream = (void *) -1; + +dsp::CASPSRSingleUnpacker::CASPSRSingleUnpacker (const char* _name) : HistUnpacker (_name) +{ + if (verbose) + cerr << "dsp::CASPSRSingleUnpacker ctor" << endl; + + set_nstate (256); + gpu_stream = undefined_stream; + + table = new BitTable (8, BitTable::TwosComplement); + +#if HAVE_CUDA + int device; + struct cudaDeviceProp gpu; + cudaGetDevice(&device); + cudaGetDeviceProperties (&gpu, device); + threadsPerBlock = gpu.maxThreadsPerBlock; +#endif + + device_prepared = false; +} + +dsp::CASPSRSingleUnpacker::~CASPSRSingleUnpacker () +{ +} + +dsp::CASPSRSingleUnpacker * dsp::CASPSRSingleUnpacker::clone () const +{ + return new CASPSRSingleUnpacker (*this); +} + +//! 
Return true if the unpacker can operate on the specified device +bool dsp::CASPSRSingleUnpacker::get_device_supported (Memory* memory) const +{ +#if HAVE_CUDA + if (verbose) + cerr << "dsp::CASPSRSingleUnpacker::get_device_supported HAVE_CUDA" << endl; + return dynamic_cast< CUDA::DeviceMemory*> ( memory ); +#else + return false; +#endif +} + +//! Set the device on which the unpacker will operate +void dsp::CASPSRSingleUnpacker::set_device (Memory* memory) +{ +#if HAVE_CUDA + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + if (gpu_mem) + { + gpu_stream = (void *) gpu_mem->get_stream(); + if (verbose) + cerr << "dsp::CASPSRSingleUnpacker::set_device using gpu memory" << endl; + } + else + { + if (verbose) + cerr << "dsp::CASPSRSingleUnpacker::set_device using cpu memory" << endl; + gpu_stream = undefined_stream; + Unpacker::set_device (memory); + } +#else + Unpacker::set_device (memory); +#endif + device_prepared = true; +} + + +bool dsp::CASPSRSingleUnpacker::matches (const Observation* observation) +{ + return observation->get_machine()== "CASPSR" + && observation->get_nbit() == 8; +} + +// default CPU unpacker for CASPSR format +void dsp::CASPSRSingleUnpacker::unpack_default () +{ + uint64_t ndat = input->get_ndat(); + const float* lookup = table->get_values (); + + const uint64_t * from64 = (uint64_t *) input->get_rawptr(); + + unsigned long* hist_p0 = get_histogram (0); + unsigned long* hist_p1 = get_histogram (1); + + float * into_p0 = output->get_datptr (0, 0); + float * into_p1 = output->get_datptr (0, 1); + + uint64_t val64; + unsigned char * val8 = (unsigned char *) &val64; + char * val8h = (char *) &val64; + + // process 4 samples, from 2 pols per loop + for (uint64_t idat=0; idatget_values (); + const float scale = table->get_scale(); + + const unsigned into_stride = fskip * 4; + const unsigned from_stride = 2; + + // read 4 samples at a time + uint32_t * from32 = (uint32_t *) from; + uint32_t val32; + unsigned char * val8 
= (unsigned char *) &val32; + + //std::cout << ndat << std::endl; + for (uint64_t idat=0; idat < ndat; idat+=4) + { + // read 4 uint8_t (actually int8_t) + val32 = *from32; + + into[0] = lookup[ val8[0] ]; + into[1] = lookup[ val8[1] ]; + into[2] = lookup[ val8[2] ]; + into[3] = lookup[ val8[3] ]; + + hist[val8[0]]++; + hist[val8[1]]++; + hist[val8[2]]++; + hist[val8[3]]++; + + from32 += from_stride; + into += into_stride; + } +} + +void dsp::CASPSRSingleUnpacker::unpack () +{ + +#if HAVE_CUDA + if (gpu_stream != undefined_stream) + { + unpack_on_gpu (); + return; + } +#endif + + // some programs (digifil) do not call set_device + if (! device_prepared) + set_device ( Memory::get_manager ()); + + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + const unsigned ndim = input->get_ndim(); + + if (ndim == 1 && npol == 2 && nchan == 1) + { + unpack_default(); + return; + } + + const unsigned fskip = ndim; + unsigned offset = 0; + + for (unsigned ichan=0; ichanget_rawptr() + offset; + float* into = output->get_datptr (ichan, ipol) + idim; + unsigned long* hist = get_histogram (ipol); + + unpack (ndat, from, into, fskip, hist); + offset ++; + } + } + } +} + +unsigned dsp::CASPSRSingleUnpacker::get_resolution () const { return 1024; } + +#if HAVE_CUDA + +void dsp::CASPSRSingleUnpacker::unpack_on_gpu () +{ + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); + + const unsigned char* from = input->get_rawptr(); + float * into_pola, * into_polb; + unsigned ichan; + + cudaStream_t stream = (cudaStream_t) gpu_stream; + cudaError error; + + for (ichan=0; ichanget_datptr(ichan, 0); + into_polb = output->get_datptr(ichan, 1); + + caspsr_unpack (stream, ndat*ndim, table->get_scale(), + from, into_pola, into_polb, + threadsPerBlock); + + from += ndat*ndim*npol; + } +} + +#endif + diff 
-Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -42,6 +42,14 @@ state = Idle; thread_count = 0; +#if HAVE_CUDA + int device; + struct cudaDeviceProp gpu; + cudaGetDevice(&device); + cudaGetDeviceProperties (&gpu, device); + threadsPerBlock = gpu.maxThreadsPerBlock; +#endif + device_prepared = false; single_thread = true; } @@ -82,18 +90,8 @@ if (gpu_mem) { gpu_stream = (void *) gpu_mem->get_stream(); -#ifdef USE_TEXTURE_MEMORY - if (verbose) - cerr << "dsp::CASPSRUnpacker::set_device using texture memory" << endl; - CUDA::TextureMemory * texture_mem = new CUDA::TextureMemory (gpu_mem->get_stream()); - texture_mem->set_format_signed(8, 0, 0, 0); - texture_mem->set_symbol("caspsr_unpack_tex"); - staging.set_memory( texture_mem ); -#else if (verbose) cerr << "dsp::CASPSRUnpacker::set_device using gpu memory" << endl; - staging.set_memory( memory ); -#endif } else { @@ -346,43 +344,28 @@ void dsp::CASPSRUnpacker::unpack_on_gpu () { const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); - staging.Observation::operator=( *input ); - staging.resize(ndat); - - // staging buffer on the GPU for packed data - unsigned char* d_staging = staging.get_rawptr(); -#ifdef USE_TEXTURE_MEMORY - if (verbose) - cerr << "dsp::CASPSRUnpacker::unpack_on_gpu: creating TextureMemory" << endl; - - CUDA::TextureMemory * gpu_mem = dynamic_cast< CUDA::TextureMemory*>( staging.get_memory() ); - if (ndat > 0) - gpu_mem->activate ( d_staging ); -#endif - - const unsigned char* from= input->get_rawptr(); - - float* into_pola = output->get_datptr(0,0); - float* into_polb = 
output->get_datptr(0,1); + const unsigned char* from = input->get_rawptr(); + float * into_pola, * into_polb; + unsigned ichan; cudaStream_t stream = (cudaStream_t) gpu_stream; - cudaError error; - if (stream) - error = cudaMemcpyAsync (d_staging, from, ndat*2, - cudaMemcpyHostToDevice, stream); - else - error = cudaMemcpy (d_staging, from, ndat*2, cudaMemcpyHostToDevice); + for (ichan=0; ichanget_datptr(ichan, 0); + into_polb = output->get_datptr(ichan, 1); - if (error != cudaSuccess) - throw Error (FailedCall, "CASPSRUnpacker::unpack_on_gpu", - "cudaMemcpy%s %s", stream?"Async":"", - cudaGetErrorString (error)); + caspsr_unpack (stream, ndat*ndim, table->get_scale(), + from, into_pola, into_polb, + threadsPerBlock); - caspsr_unpack (stream, ndat, table->get_scale(), - d_staging, into_pola, into_polb); + from += ndat*ndim*npol; + } } #endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/CASPSRUnpackerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -11,14 +11,10 @@ #include "Error.h" -// threads per block - C1060=256 [TODO CHECK below if changing] -#define __CASPSR_UNPACK_TPB 256 - -// global static texture declaration for CASPSR gpu unpacker -texture caspsr_unpack_tex; - using namespace std; +void check_error_stream (const char*, cudaStream_t); + /* Unpack the two real-valued input polarizations into an interleaved array suited to the twofft algorithm described in Section 12.3 @@ -45,100 +41,56 @@ output[7] = convert(scale,input[index].val[7]); } -void check_error (const char*); - -#ifdef USE_TEXTURE_MEMORY -// ndim 1 unpacker uses texture memory for reads -__global__ void unpack_real_ndim1 (float* into_pola, float* into_polb, float scale) +__global__ void unpack_real_ndim1 (uint64_t ndat, float 
scale, + int8_t * from, float* into_pola, float* into_polb) { - const int idx = blockIdx.x*blockDim.x + threadIdx.x; - const int sample_idx = idx * 8; - unsigned int shared_idx = threadIdx.x * 4; - const uint64_t output_idx = blockIdx.x * blockDim.x * 4; - const unsigned int half_block = blockDim.x / 2; - - // n.b. this is blockDim.x * 4 [hardcoded by default] - __shared__ float pola[4 * __CASPSR_UNPACK_TPB]; - __shared__ float polb[4 * __CASPSR_UNPACK_TPB]; + extern __shared__ int8_t sdata[]; - // loads 8 samples per thread (4 per poln) - unsigned i = 0; + unsigned idx_shm = threadIdx.x; + unsigned idx = (8 * blockIdx.x * blockDim.x) + threadIdx.x; + unsigned i; - // write 4 samples from each poln into shared memory - for (i=0; i<4; i++) + // each thread will load 8 values (coalesced) from GMEM to SHM + for (i=0; i<8; i++) { - - pola[shared_idx + i] = (((float) tex1Dfetch(caspsr_unpack_tex, sample_idx + i)) + 0.5) * scale; - polb[shared_idx + i] = (((float) tex1Dfetch(caspsr_unpack_tex, sample_idx + i + 4)) + 0.5) * scale; + if (idx < 2*ndat) + { + sdata[idx_shm] = from[idx]; + + idx += blockDim.x; + idx_shm += blockDim.x; + } } __syncthreads(); - // first half threads write poln A - if (threadIdx.x < half_block) - { - unsigned int tid = 2 * threadIdx.x + (48 * ((int) (threadIdx.x/8))); - float * to = into_pola + output_idx; + idx = (4 * blockIdx.x * blockDim.x) + threadIdx.x; + idx_shm = threadIdx.x + ((threadIdx.x / 4) * 4); - to[tid + 0] = pola[tid + 0]; - to[tid + 1] = pola[tid + 1]; - to[tid + 16] = pola[tid + 16]; - to[tid + 17] = pola[tid + 17]; - to[tid + 32] = pola[tid + 32]; - to[tid + 33] = pola[tid + 33]; - to[tid + 48] = pola[tid + 48]; - to[tid + 49] = pola[tid + 49]; - } - // second half threads write poln B - else + // each thread will write 4 values (coalesced) from SHM to GMEM + for (i=0; i<4; i++) { - unsigned int tid = 2 * (threadIdx.x - half_block) + (48 * ((int) ((threadIdx.x-half_block)/8))); - float * to = into_polb + output_idx; - - to[tid 
+ 0] = polb[tid + 0]; - to[tid + 1] = polb[tid + 1]; - to[tid + 16] = polb[tid + 16]; - to[tid + 17] = polb[tid + 17]; - to[tid + 32] = polb[tid + 32]; - to[tid + 33] = polb[tid + 33]; - to[tid + 48] = polb[tid + 48]; - to[tid + 49] = polb[tid + 49]; + if (idx < ndat) + { + into_pola[idx] = ((float) sdata[idx_shm] + 0.5) * scale; + into_polb[idx] = ((float) sdata[idx_shm+4] + 0.5) * scale; + + idx += blockDim.x; + idx_shm += blockDim.x * 2; + } } } -#else -__global__ void unpack_real_ndim1 (uint64_t ndat, float scale, - const unsigned char* stagingBufGPU, - float* into_pola, float* into_polb) -{ - uint64_t sampleTmp = blockIdx.x*blockDim.x + threadIdx.x; - - uint64_t outputIndex = sampleTmp * 4; - sampleTmp = sampleTmp * 8; - - float* to_A = into_pola + outputIndex; - float* to_B = into_polb + outputIndex; - - const int8_t* from = reinterpret_cast( stagingBufGPU ) + sampleTmp; - - to_A[0] = ((float) from[0] + 0.5) * scale; - to_A[1] = ((float) from[1] + 0.5) * scale; - to_A[2] = ((float) from[2] + 0.5) * scale; - to_A[3] = ((float) from[3] + 0.5) * scale; - - to_B[0] = ((float) from[4] + 0.5) * scale; - to_B[1] = ((float) from[5] + 0.5) * scale; - to_B[2] = ((float) from[6] + 0.5) * scale; - to_B[3] = ((float) from[7] + 0.5) * scale; -} -#endif void caspsr_unpack (cudaStream_t stream, const uint64_t ndat, float scale, - unsigned char const* input, float* pol0, float* pol1) + unsigned char const* input, float* pol0, float* pol1, + int nthread) { - int nthread = __CASPSR_UNPACK_TPB; // each thread will unpack 4 time samples from each polarization - int nblock = ndat / (4*nthread); + int nsamp_per_block = 4 * nthread; + int nblock = ndat / nsamp_per_block; + if (ndat % nsamp_per_block) + nblock++; #ifdef _DEBUG cerr << "caspsr_unpack ndat=" << ndat << " scale=" << scale @@ -146,22 +98,10 @@ << " nthread=" << nthread << endl; #endif -#ifdef USE_TEXTURE_MEMORY - unpack_real_ndim1<<>> (pol0, pol1, scale); -#else - unpack_real_ndim1<<>> (ndat, scale, input, pol0, pol1); 
-#endif - - // AJ's theory... - // If there are no stream synchronises on the input then the CPU pinned memory load from the - // input class might be able to get ahead of a whole sequence of GPU operations, and even exceed - // one I/O loop. Therefore this should be a reuqirement to have a stream synchronize some time - // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory - - // put it here for now - cudaStreamSynchronize(stream); - + int8_t * from = (int8_t *) input; + size_t shm_bytes = 8 * nthread; + unpack_real_ndim1<<>> (ndat, scale, from, pol0, pol1); if (dsp::Operation::record_time || dsp::Operation::verbose) - check_error ("caspsr_unpack"); + check_error_stream ("caspsr_unpack", stream); } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRSingleUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,60 @@ +/* + + */ + +#ifndef __dsp_CASPSRSingleUnpacker_h +#define __dsp_CASPSRSingleUnpacker_h + +#include "dsp/EightBitUnpacker.h" + +namespace dsp { + + class CASPSRSingleUnpacker : public HistUnpacker + { + public: + + //! Constructor + CASPSRSingleUnpacker (const char* name = "CASPSRSingleUnpacker"); + ~CASPSRSingleUnpacker (); + + //! Cloner (calls new) + virtual CASPSRSingleUnpacker * clone () const; + + //! Return true if the unpacker can operate on the specified device + bool get_device_supported (Memory*) const; + + //! Set the device on which the unpacker will operate + void set_device (Memory*); + + protected: + + Reference::To table; + + //! 
Return true if we can convert the Observation + bool matches (const Observation* observation); + + void unpack (); + + void unpack_default (); + + void unpack (uint64_t ndat, const unsigned char* from, + float* into, const unsigned fskip, + unsigned long* hist); + + void * gpu_stream; + + void unpack_on_gpu (); + + unsigned get_resolution ()const ; + + private: + + bool device_prepared; + + //! maximum number of GPU threads per block + int threadsPerBlock; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpackerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -5,8 +5,6 @@ #ifndef __dsp_CASPSRUnpackerCUDA_h #define __dsp_CASPSRUnpackerCUDA_h -// #define USE_TEXTURE_MEMORY 1 - #include #include @@ -14,7 +12,8 @@ void caspsr_unpack (cudaStream_t stream, const uint64_t ndat, float scale, - const unsigned char* stagingBufGPU, - float* pol0, float* pol1); + const unsigned char* from, + float* pol0, float* pol1, + int nthread); #endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/caspsr/dsp/CASPSRUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -40,8 +40,8 @@ float* into, const unsigned fskip, unsigned long* hist); - BitSeries staging; void * gpu_stream; + void unpack_on_gpu (); unsigned get_resolution ()const ; @@ -89,6 +89,9 @@ //! sk_thread states std::vector states; + //! 
maximum number of GPU threads per block + int threadsPerBlock; + }; } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/CPSRFile.h,v $ - $Revision: 1.15 $ - $Date: 2008/05/28 21:12:42 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr/dsp/CPSRFile.h #ifndef __CPSRFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h,v $ - $Revision: 1.15 $ - $Date: 2006/07/09 13:27:03 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/cpsr/dsp/CPSRTwoBitCorrection.h #ifndef __CPSRTwoBitCorrection_h #define __CPSRTwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/pspmXfer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/pspmXfer.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/pspmXfer.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/pspmXfer.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: 
/cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/pspmXfer.h,v $ - $Revision: 1.6 $ - $Date: 2006/10/15 23:26:47 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr/dsp/pspmXfer.h #ifndef __pspmXfer_h #define __pspmXfer_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/rdisk.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/rdisk.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/rdisk.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/rdisk.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/rdisk.h,v $ - $Revision: 1.6 $ - $Date: 2006/10/15 23:26:47 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr/dsp/rdisk.h #ifndef __RDISK_H #define __RDISK_H diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/xfer_tape.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/xfer_tape.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/dsp/xfer_tape.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/dsp/xfer_tape.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/dsp/xfer_tape.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:06 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/cpsr/dsp/xfer_tape.h #ifndef __XFER_TAPE_H #define __XFER_TAPE_H diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspmDbase.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspmDbase.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspmDbase.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspmDbase.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: 
/cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/pspmDbase.h,v $ - $Revision: 1.8 $ - $Date: 2009/06/17 10:32:32 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr/pspmDbase.h #ifndef __pspmDbase_h #define __pspmDbase_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm++.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm++.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm++.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm++.h 2018-03-12 23:02:35.000000000 +0000 @@ -4,10 +4,7 @@ * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr/pspm++.h,v $ - $Revision: 1.9 $ - $Date: 2006/10/15 23:26:47 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr/pspm++.h #ifndef __PSPM_H #define __PSPM_H diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm_search_header.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm_search_header.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr/pspm_search_header.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr/pspm_search_header.h 2018-03-12 23:02:35.000000000 +0000 @@ -24,7 +24,7 @@ * some error message; * * - * $Log: pspm_search_header.h,v $ + * $Log: pspm_search_header.h * Revision 1.5 2009/06/17 10:16:54 straten * use ISO C99 integer types directly * diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2File.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2File.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2File.h,v $ - $Revision: 1.20 $ 
- $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2File.h #ifndef __CPSR2File_h #define __CPSR2File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h,v $ - $Revision: 1.9 $ - $Date: 2008/11/11 06:14:09 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2_Observation.h #ifndef __CPSR2_Observation_h #define __CPSR2_Observation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h,v $ - $Revision: 1.9 $ - $Date: 2006/07/09 13:27:07 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/cpsr2/dsp/CPSR2TwoBitCorrection.h #ifndef __CPSR2TwoBitCorrection_h #define __CPSR2TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/DADABuffer.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/DADABuffer.C --- bl-dspsr-0+git20160405/Kernel/Formats/dada/DADABuffer.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/DADABuffer.C 2018-03-12 
23:02:35.000000000 +0000 @@ -12,6 +12,11 @@ #include "ascii_header.h" #include "FilePtr.h" +#if HAVE_CUDA +#include "dada_cuda.h" +#include "ipcio_cuda.h" +#endif + #include #include @@ -60,6 +65,12 @@ } } +#if HAVE_CUDA + if (!passive && dada_cuda_dbunregister (hdu) < 0) + throw Error (InvalidState, "dsp::DADABuffer::close", + "cannot unregister ring buffer blocks as Pinned memory"); +#endif + if (!passive && dada_hdu_unlock_read (hdu) < 0) cerr << "dsp::DADABuffer::close error during dada_hdu_unlock_read" << endl; @@ -203,6 +214,14 @@ throw Error (InvalidState, "dsp::DADABuffer::open_file", "cannot lock DADA ring buffer read client status"); +#if HAVE_CUDA + if (verbose) + cerr << "dsp::DADABuffer::open_file registering dada buffers with CUDA for pinned transfers" << endl; + if (!passive && dada_cuda_dbregister (hdu) < 0) + throw Error (InvalidState, "dsp::DADABuffer::open_file", + "cannot register DADA ring buffer blocks as Pinned memory"); +#endif + if (passive && dada_hdu_open_view (hdu) < 0) throw Error (InvalidState, "dsp::DADABuffer::open_file", "cannot open DADA ring buffer for viewing"); @@ -220,6 +239,7 @@ if (ascii_header_get (hdu->header, "RESOLUTION", "%u", &byte_resolution) < 0) byte_resolution = 1; + // the resolution is the _byte_ resolution; convert to _sample_ resolution resolution = get_info()->get_nsamples (byte_resolution); if (resolution == 0) @@ -237,32 +257,55 @@ int64_t dsp::DADABuffer::load_bytes (unsigned char* buffer, uint64_t bytes) { if (verbose) - cerr << "DADABuffer::load_bytes ipcio_read " + cerr << "dsp::DADABuffer::load_bytes ipcio_read " << bytes << " bytes" << endl; int64_t bytes_read = ipcio_read (hdu->data_block, (char*)buffer, bytes); if (bytes_read < 0) - cerr << "DADABuffer::load_bytes error ipcio_read" << endl; + cerr << "dsp::DADABuffer::load_bytes error ipcio_read" << endl; if (verbose) - cerr << "DADABuffer::load_bytes read " << bytes_read << " bytes" << endl; + cerr << "dsp::DADABuffer::load_bytes read " << bytes_read 
<< " bytes" << endl; return bytes_read; } +#if HAVE_CUDA +//! Load bytes from shared memory directory to GPU memory +int64_t dsp::DADABuffer::load_bytes_device (unsigned char* device_memory, uint64_t bytes, void * device_handle) +{ + cudaStream_t stream = (cudaStream_t) device_handle; + + if (verbose) + cerr << "dsp::DADABuffer::load_bytes_device ipcio_read_cuda " + << bytes << " bytes" << endl; + + int64_t bytes_read = ipcio_read_cuda (hdu->data_block, (char*) device_memory, bytes, stream); + //int64_t bytes_read = (int64_t) bytes; + cudaStreamSynchronize(stream); + if (bytes_read < 0) + cerr << "dsp::DADABuffer::load_bytes_device error ipcio_read_cuda" << endl; + + if (verbose) + cerr << "dsp::DADABuffer::load_bytes_device read " << bytes_read << " bytes" << endl; + + return bytes_read; +} +#endif + //! Adjust the shared memory pointer int64_t dsp::DADABuffer::seek_bytes (uint64_t bytes) { if (verbose) - cerr << "DADABuffer::seek_bytes ipcio_seek " + cerr << "dsp::DADABuffer::seek_bytes ipcio_seek " << bytes << " bytes" << endl; int64_t absolute_bytes = ipcio_seek (hdu->data_block, bytes, SEEK_SET); if (absolute_bytes < 0) - cerr << "DADABuffer::seek_bytes error ipcio_seek" << endl; + cerr << "dsp::DADABuffer::seek_bytes error ipcio_seek" << endl; if (verbose) - cerr << "DADABuffer::seek_bytes absolute_bytes=" << absolute_bytes << endl; + cerr << "dsp::DADABuffer::seek_bytes absolute_bytes=" << absolute_bytes << endl; return absolute_bytes; } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/dsp/DADABuffer.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/dsp/DADABuffer.h --- bl-dspsr-0+git20160405/Kernel/Formats/dada/dsp/DADABuffer.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/dsp/DADABuffer.h 2018-03-12 23:02:35.000000000 +0000 @@ -56,7 +56,12 @@ //! Load bytes from shared memory virtual int64_t load_bytes (unsigned char* buffer, uint64_t bytes); - + +#if HAVE_CUDA + //! 
Load bytes from shared memory directory to GPU memory + int64_t load_bytes_device (unsigned char* device_memory, uint64_t bytes, void * device_handle); +#endif + //! Set the offset in shared memory virtual int64_t seek_bytes (uint64_t bytes); diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/dada/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/dada/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/dada/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -12,5 +12,5 @@ include $(top_srcdir)/config/Makefile.include -AM_CPPFLAGS += @PSRDADA_CFLAGS@ +AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CUDA_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinFile.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,40 @@ + +#ifndef __EmerlinFile_h +#define __EmerlinFile_h + + +#include +#include "dsp/File.h" +#include "dsp/BlockFile.h" + + +namespace dsp { + + + class EmerlinFile : public File { + + public: + EmerlinFile(const char* filename=0, const char* headername=0); + + ~EmerlinFile(); + + bool is_valid(const char* filename) const ; + + protected: + virtual void open_file(const char* filename); + + virtual int64_t seek_bytes(uint64_t bytes); + virtual int64_t load_bytes(unsigned char* buffer, uint64_t nbytes); + + private: + char datafile[1024]; + uint64_t cur_frame; + uint64_t first_second; + uint64_t dropped; + + }; +} + + +#endif + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h 1970-01-01 00:00:00.000000000 
+0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinTwoBitTable.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,41 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2011 by Paul Demorest + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __EmerlinTwoBitTable_h +#define __EmerlinTwoBitTable_h + +#include "dsp/TwoBitTable.h" + +namespace dsp { + + //! Look-up tables for conversion from Emerlin two-bit to floating point numbers + /*! Emerlin defines bits to run in time order from LSB to MSB, this is + * the opposite of the standard dspsr TwoBitTable convention so + * we need to override the 'extract' function here. + */ + class EmerlinTwoBitTable : public TwoBitTable { + + public: + + //! Constructor + EmerlinTwoBitTable () : TwoBitTable (TwoBitTable::OffsetBinary) { + destroy(); + build(); + } + + //! Destructor + ~EmerlinTwoBitTable () { } + + //! 
Return the 2-bit number from byte corresponding to sample + virtual unsigned extract (unsigned byte, unsigned sample) const; + + }; + +} + +#endif // !defined(__EmerlinTwoBitTable_h) diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/dsp/EmerlinUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,43 @@ + + +#ifndef __EmerlinUnpacker_h +#define __EmerlinUnpacker_h + + +#include "dsp/Unpacker.h" +#include "dsp/TimeSeries.h" +#include "dsp/WeightedTimeSeries.h" +#include "dsp/EmerlinTwoBitTable.h" +#include "dsp/TwoBitTable.h" + + +namespace dsp { + class EmerlinUnpacker : public Unpacker { + + public: + EmerlinUnpacker (const char* name="EmerlinUnpacker"); + unsigned get_ndig() const; + + + protected: + void unpack(); + bool matches(const Observation* observation); + + void reserve(); + void set_output(TimeSeries* _output); + int get_ndat_per_weight(); + + + private: + dsp::EmerlinTwoBitTable bittable; + WeightedTimeSeries* weighted_output; + + }; + +} + +#endif + + + + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinFile.C --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinFile.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,278 @@ + +#include +#include +#include +#include + +#include "dsp/EmerlinFile.h" +#include "dsp/ASCIIObservation.h" +#include "vdifio.h" +#include "ascii_header.h" +#include + + + +using namespace std; + +dsp::EmerlinFile::EmerlinFile(const char* filename, const char* headername) : File("emerlin"),dropped(0) { +} + +dsp::EmerlinFile::~EmerlinFile() { +} + + +bool 
dsp::EmerlinFile::is_valid(const char* filename) const { + FILE *fptr = fopen(filename, "r"); + if (!fptr) + { + if (verbose) + cerr << "dsp::EmerlinFile::is_valid Error opening file." << endl; + return false; + } + + char header[4096]; + fread(header, sizeof(char), 4096, fptr); + fclose(fptr); + + char inst[64]; + if ( ascii_header_get(header, "INSTRUMENT", "%s", inst) < 0 ) + { + if (verbose) + cerr << "dsp::EmerlinFile::is_valid no INSTRUMENT line" << endl; + return false; + } + if ( std::string(inst) != "EMERLIN" ) + { + if (verbose) + cerr << "dsp::EmerlinFile::is_valid INSTRUMENT != 'EMERLIN'" << endl; + return false; + } + + return true; +} + + +void dsp::EmerlinFile::open_file(const char* filename) { + // This is the header file + FILE *fptr = fopen (filename, "r"); + if (!fptr) + throw Error (FailedSys, "dsp::EmerlinFile::open_file", + "fopen(%s) failed", filename); + + // Read the header + char header[4096]; + fread(header, sizeof(char), 4096, fptr); + fclose(fptr); + + // Get the data file + if (ascii_header_get (header, "DATAFILE", "%s", datafile) < 0) + throw Error (InvalidParam, "dsp::EmerlinFile::open_file", + "Missing DATAFILE keyword"); + + // Parse the standard ASCII info. Timestamps are in VDIF packets + // so not required. Also we'll assume VDIF's "nchan" really gives + // the number of polns for now, and NCHAN is 1. NBIT is in VDIF packets. + // We'll compute TSAMP from the bandwidth. NDIM (real vs complex sampling) + // is in VDIF packets via the iscomplex param. 
+ ASCIIObservation* info_tmp = new ASCIIObservation; + info = info_tmp; + + info_tmp->set_required("UTC_START", false); + info_tmp->set_required("OBS_OFFSET", false); + info_tmp->set_required("NPOL",true); + info_tmp->set_required("NBIT", false); + info_tmp->set_required("NDIM", false); + info_tmp->set_required("NCHAN", false); + info_tmp->set_required("TSAMP", false); + info_tmp->set_required("CALFREQ", false); + info_tmp->load(header); + + + + + // open the file + fd = ::open (datafile, O_RDONLY); + if (fd < 0) + throw Error (FailedSys, "dsp::EmerlinFile::open_file()", + "open(%s) failed", filename); + + + // Read until we get a valid frame + bool got_valid_frame = false; + char rawhdr_bytes[VDIF_HEADER_BYTES]; + vdif_header *rawhdr = (vdif_header *)rawhdr_bytes; + int nbyte; + while (!got_valid_frame) + { + size_t rv = read(fd, rawhdr_bytes, VDIF_HEADER_BYTES); + if (rv != VDIF_HEADER_BYTES) + throw Error (FailedSys, "EmerlinFile::open_file", + "Error reading first header"); + + // Get frame size + nbyte = getVDIFFrameBytes(rawhdr); + if (verbose) cerr << "EmerlinFile::open_file FrameBytes = " << nbyte << endl; + //header_bytes = 0; + //block_bytes = nbyte; + //block_header_bytes = VDIF_HEADER_BYTES; // XXX what about "legacy" mode + + resolution=(nbyte-VDIF_HEADER_BYTES)*2*4*1; // in samples + + // If this first frame is invalid, go to the next one + if (getVDIFFrameInvalid(rawhdr)==0) + got_valid_frame = true; + else + { + rv = lseek(fd, nbyte-VDIF_HEADER_BYTES, SEEK_CUR); + if (rv<0) + throw Error (FailedSys, "EmerlinFile::lseek", + "Error seeking to next VDIF frame"); + } + } + + // Rewind file + lseek(fd, 0, SEEK_SET); +// Get basic params + + int nbit = getVDIFBitsPerSample(rawhdr); + if (verbose) cerr << "EmerlinFile::open_file NBIT = " << nbit << endl; + get_info()->set_nbit (nbit); + + bool iscomplex = rawhdr->iscomplex; + if (iscomplex) + { + get_info()->set_ndim(2); + get_info()->set_state(Signal::Analytic); + } + else + { + get_info()->set_ndim(1); + 
get_info()->set_state(Signal::Nyquist); + } + if (verbose) cerr << "EmerlinFile::open_file iscomplex = " << iscomplex << endl; + + get_info()->set_npol( 2 ); + get_info()->set_nchan( 1 ); + get_info()->set_rate( (double) get_info()->get_bandwidth() * 1e6 + / (double) get_info()->get_nchan() + * (get_info()->get_state() == Signal::Nyquist ? 2.0 : 1.0)); + if (verbose) cerr << "EmerlinFile::open_file rate = " << get_info()->get_rate() << endl; + + // Figure frames per sec from bw, pkt size, etc + //double frames_per_sec = 64000.0; + int frame_data_size = nbyte - VDIF_HEADER_BYTES; + double frames_per_sec = get_info()->get_nbit() * get_info()->get_nchan() * get_info()->get_npol() + * get_info()->get_rate() / 8.0 / (double) frame_data_size; + if (verbose) cerr << "EmerlinFile::open_file frame_data_size = " + << frame_data_size << endl; + if (verbose) cerr << "EmerlinFile::open_file frames_per_sec = " + << frames_per_sec << endl; + + // Set load resolution equal to one frame? XXX + // This broke file unloading somehow ... wtf.. + //resolution = info.get_nsamples(frame_data_size); + + + int mjd = getVDIFFrameMJD(rawhdr); + int sec = getVDIFFrameSecond(rawhdr); + int fn = getVDIFFrameNumber(rawhdr); + first_second = getVDIFFullSecond(rawhdr); + cur_frame=fn; + if (verbose) cerr << "EmerlinFile::open_file MJD = " << mjd << endl; + if (verbose) cerr << "EmerlinFile::open_file sec = " << sec << endl; + if (verbose) cerr << "EmerlinFile::open_file fn = " << fn << endl; + get_info()->set_start_time( MJD(mjd,sec,(double)fn/frames_per_sec) ); + + // Figures out how much data is in file based on header sizes, etc. 
+ set_total_samples(); + + if (verbose) + cerr << "EmerlinFile::open exit" << endl; +} + + + + + +int64_t dsp::EmerlinFile::load_bytes(unsigned char* buffer, uint64_t nbytes) { + + if (nbytes % 16000){ + // trim to an integer number of frames + std::cerr << "dsp::EmerlinFile::load_bytes ERROR: Need to read integer number of frames" << std::endl; + nbytes = 16000*(nbytes/16000); + } + + + unsigned nframe = nbytes / 16000; + unsigned npacket = nframe/2; + + std::memset(buffer, 0, nbytes); // zero the memory + + unsigned char* write_to = buffer; + + uint64_t to_load=nbytes; + + int ipol=0; // should always start at pol zero please. + + while (to_load > 0){ + + char rawhdr_bytes[VDIF_HEADER_BYTES]; + vdif_header *rawhdr = (vdif_header *)rawhdr_bytes; + + size_t rv = read(fd, rawhdr_bytes, VDIF_HEADER_BYTES); + if (rv != VDIF_HEADER_BYTES) + throw Error (FailedSys, "EmerlinFile::load_bytes", + "Error reading header"); + + int64_t sec = getVDIFFullSecond(rawhdr); + int64_t fn = getVDIFFrameNumber(rawhdr); + int64_t sn = getVDIFThreadID(rawhdr); + + fn += 4000*(sec-first_second); + + int byte_offset = ((fn-cur_frame)*2 + sn)*8000; + //fprintf(stderr,"read %d/%d, pkt=%d to_load=%d %ld\n",fn,sn,byte_offset/8000,to_load,sec-first_second); + if ((byte_offset+8000) > nbytes) { + // we are past the requested data. + // there is surely a better way than this! 
+ dropped += to_load/8000; + std::cerr << "Some packets missing (left toload=" << to_load<<", total dropped so far = " << dropped << ")" << std::endl; + fprintf(stderr,"read %d/%d, pkt=%d to_load=%d %ld\n",fn,sn,byte_offset/8000,to_load,sec-first_second); + rv = lseek(fd, -VDIF_HEADER_BYTES, SEEK_CUR); + if (rv<0) + throw Error (FailedSys, "EmerlinFile::lseek", + "Error seeking to next VDIF frame"); + + break; + } + + write_to = buffer+byte_offset; + + + rv = read(fd,write_to, 8000); + + if (rv!=8000){ + std::cerr << "dsp::EmerlinFile::load_bytes couldn't load data" << std::endl; + } + + +/* if(fn%2==sn){ + for(int i=0; i < 8000; ++i){ + write_to[i]=85; + } + } else { + }*/ + + to_load -= rv; + } + cur_frame += nframe; + + return nbytes; +} + + +int64_t dsp::EmerlinFile::seek_bytes(uint64_t bytes) { + std::cerr << "dsp::EmerlinFile::seek_bytes NOT IMPLEMENTED "< + +unsigned dsp::EmerlinTwoBitTable::extract (unsigned byte, unsigned sample) const +{ + unsigned char shifts[4] = { 0, 2, 4, 6 }; // LSB is first sample. 
VDIF standard +// unsigned char shifts[4] = { 6, 4, 2, 0 }; + //std::cout << "dsp::EmerlinTwoBitTable::extract()" << std::endl; + return byte >> shifts[sample] & 0x03; +} diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/EmerlinUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/EmerlinUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,148 @@ + +#include "dsp/EmerlinUnpacker.h" +#include "dsp/WeightedTimeSeries.h" + + +dsp::EmerlinUnpacker::EmerlinUnpacker(const char* name) : Unpacker(name) { +} + +bool dsp::EmerlinUnpacker::matches (const Observation* observation) { + + return observation->get_machine() == "EMERLIN" + && observation->get_nbit() == 2 + && observation->get_nbit() == 2; + +} + +int dsp::EmerlinUnpacker::get_ndat_per_weight() { + return 1000; +} + + +void dsp::EmerlinUnpacker::reserve() { + if (weighted_output) + { + weighted_output -> set_ndat_per_weight (get_ndat_per_weight()); + weighted_output -> set_nchan_weight (1); + weighted_output -> set_npol_weight (input->get_npol()); + } + + output->resize ( input->get_ndat() ); + + if (weighted_output) + weighted_output -> neutral_weights (); +} + +void dsp::EmerlinUnpacker::set_output (TimeSeries* _output) +{ + if (verbose) + std::cerr << "dsp::EmerlinUnpacker::set_output (" << _output << ")" << std::endl; + + Unpacker::set_output (_output); + weighted_output = dynamic_cast (_output); +} + +void dsp::EmerlinUnpacker::unpack() { + if(verbose) { + std::cerr << "dsp::EmerlinUnpacker::unpack()" << std::endl; + std::cerr << "dsp::EmerlinUnpacker input->ndat = "<< input->get_ndat() << std::endl; + std::cerr << "dsp::EmerlinUnpacker input->nbit = "<< input->get_nbit() << std::endl; + std::cerr << "dsp::EmerlinUnpacker input->ndim = "<< input->get_ndim() << std::endl; + std::cerr << "dsp::EmerlinUnpacker 
input->npol = "<< input->get_npol() << std::endl; + std::cerr << "dsp::EmerlinUnpacker input->nchan = "<< input->get_nchan() << std::endl; + std::cerr << "dsp::EmerlinUnpacker output->ndat = "<< output->get_ndat() << std::endl; + } + + + const unsigned samples_per_byte=4; + + const unsigned total_bytes = 2*input->get_ndat()/samples_per_byte; + + const unsigned nframe = total_bytes/16000; + const unsigned nword = 2000; + const unsigned byte_per_word=4; + unsigned offset=0; + + const unsigned dat_per_frame = nword*byte_per_word*samples_per_byte; + unsigned weights_per_frame = 0; + if(weighted_output){ + weights_per_frame = dat_per_frame / weighted_output->get_ndat_per_weight(); + if(verbose) + std::cerr << "dsp::EmerlinUnpacker weighted output. weights per frame = " << weights_per_frame << std::endl; + } + + const unsigned char *iarray = input->get_rawptr(); + const unsigned char *iarray_orig = iarray; + unsigned char word[byte_per_word]; + + int count[4]; + + unsigned* weights = NULL; + for (unsigned iframe=0; iframe < nframe; ++iframe) { + for (unsigned ipol=0; ipol < 2; ++ipol) { + if(weighted_output){ + weights = weighted_output->get_weights(0,ipol)+weights_per_frame*iframe; + } + count[0]=0; + count[1]=0; + count[2]=0; + count[3]=0; + float ss=0; + if(offset > output->get_ndat()){ + std::cerr << "dsp::EmerlinUnpacker::unpack error" << std::endl; + } + + float* oarray = output->get_datptr (0, ipol) + offset; + for (unsigned wd=0; wd < nword; ++wd) { + for (unsigned bt = 0; bt < byte_per_word; bt++){ + // word[bt] = iarray[byte_per_word-1-bt]; // first samples are in last byte of word. + word[bt] = iarray[bt]; // first sample is byte zero on disk. 
+ } + + iarray += 4; + + + for (unsigned bt = 0; bt < byte_per_word; bt++){ + const float* four = bittable.get_values(word[bt]); + // std::cerr << (int)(word[bt]) << std::endl; + // std::cerr << four[0] << " " << four[1] << + // " " << four[2] << " " << four[3] << std::endl; + + for (unsigned pt=0; pt < samples_per_byte; ++pt) { + if (four[pt] < -0.5)count[0]++; + else if(four[pt] < 0)count[1]++; + else if(four[pt] < 0.5) count[2]++; + else count[3]++; + *oarray = four[pt]; + ss+=four[pt]*four[pt]; + ++oarray; + } + } + } + if(count[3]==0 && count[2]==0 && count[1]==0){ + std::cerr << "Zero weight Dropped Frame (weights_per_frame="< +#include +#include "vdifio.h" + + +#define VDIF_VERSION 0 + +#define UNIXZERO_MJD 40587 + +void mjd2ymd(int mjd, int *year, int *month, int *day) { + int jd, temp1, temp2; + + jd = mjd + 2400001; + + // Do some rather cryptic calculations + + temp1 = 4*(jd+((6*(((4*jd-17918)/146097)))/4+1)/2-37); + temp2 = 10*(((temp1-237)%1461)/4)+5; + + *year = temp1/1461-4712; + *month =((temp2/306+2)%12)+1; + *day = (temp2%306)/10+1; +} + +int ymd2doy(int yr, int mo, int day) +{ + int monstart1[] = {0,31,59,90,120,151,181,212,243,273,304,334}; + int monstart2[] = {0,31,60,91,121,152,182,213,244,274,305,335}; + int L2; + + L2 = yr/4-(yr+7)/4-yr/100+(yr+99)/100+yr/400-(yr+399)/400; + if(L2 == -1) + { + return day + monstart2[mo-1]; + } + else + { + return day + monstart1[mo-1]; + } +} + +int ymd2mjd(int yr, int mo, int day) +{ + int doy; + int yr1 = yr - 1; + + doy = ymd2doy(yr, mo, day); + + return doy-678576+365*yr1+yr1/4-yr1/100+yr1/400; +} + +//int epoch2mjd(int epoch) { +// return ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1); // Year and Jan/July +//} + +int createVDIFHeader(vdif_header *header, int framelength, int threadid, int bits, int nchan, + int iscomplex, char stationid[3]) { + int lognchan; + + header->epoch = 0; + + if (VDIF_VERSION>7) return(VDIF_ERROR); + if (bits>32 || bits<1) return(VDIF_ERROR); + if (framelength%8!=0 || 
framelength<0) return(VDIF_ERROR); + if (threadid>1023 || threadid<0) return(VDIF_ERROR); + + // Number of channels encoded as power of 2 + if (nchan<1) return(VDIF_ERROR); + lognchan = 0; + while (nchan>1) { + if (nchan%2==1) return(VDIF_ERROR); + lognchan++; + nchan /=2; + } + if (lognchan>31) return(VDIF_ERROR); + + memset(header, 0, VDIF_HEADER_BYTES); + + header->version = VDIF_VERSION; + header->nchan = lognchan; + header->framelength8 = framelength/8; + if (iscomplex) + header->iscomplex = 1; + else + header->iscomplex = 0; + header->nbits = bits-1; + header->threadid = threadid; + header->stationid = stationid[0]<<8 | stationid[1]; + + header->frame=0; + //header->framepersec=framepersec; + + return(VDIF_NOERROR); +} + + +void setVDIFThreadID(vdif_header *header, int threadid) +{ + // Should check bounds + header->threadid = threadid; +} + +void setVDIFFrameBytes(vdif_header *header, int bytes) +{ + // Should check modulo8 and not too big + header->framelength8 = bytes/8; +} + +int getVDIFEpochMJD(const vdif_header *header) +{ + int epoch = (int)header->epoch; + return ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1); +} + +void setVDIFNumChannels(vdif_header *header, int numchannels) +{ + unsigned int logchans = 0; + while(numchannels > 1) + { + numchannels /= 2; + logchans++; + } + header->nchan = logchans; +} + +int getVDIFNumChannels(const vdif_header *header) +{ + int logchans = header->nchan; + int numchannels = 1; + while(logchans > 0) + { + numchannels *= 2; + logchans--; + } + return numchannels; +} + +int getVDIFFrameMJD(const vdif_header *header) +{ + int mjd = getVDIFEpochMJD(header); + + return mjd + header->seconds/86400; // Seconds may be greater than one day +} + +double getVDIFDMJD(const vdif_header *header, int framepersec) +{ + int mjd = getVDIFFrameMJD(header); + int sec = getVDIFFrameSecond(header); + return (double)mjd+(sec+(double)header->frame/(double)framepersec)/(24*60*60); +} + +// Note assumes the Epoch is already set +void 
setVDIFFrameMJD(vdif_header *header, int framemjd) +{ + int emjd = getVDIFEpochMJD(header); + int seconds = (int)header->seconds; + int mjd = emjd + seconds/86400; // BUG? I think this step is wrong CJP + if(emjd == framemjd) return; //its already right + header->seconds = (framemjd-mjd)*86400; +} + +void setVDIFMJDSec(vdif_header *header, uint64_t mjdsec) +{ + int epoch = (int)header->epoch; + int emjd = ymd2mjd(2000 + epoch/2, (epoch%2)*6+1, 1); + header->seconds = (int)(mjdsec - ((uint64_t)emjd)*86400); +} + +void setVDIFEpoch(vdif_header *header, int mjd) { + int year, month, day; + mjd2ymd(mjd, &year, &month, &day); + header->epoch = (year-2000)*2; + if (month>6) header->epoch++; +} + +int nextVDIFHeader(vdif_header *header, int framepersec) { + header->frame++; + if (header->frame>framepersec) { + return(VDIF_ERROR); + } else if (header->frame==framepersec) { + header->seconds++; + header->frame = 0; + } + return(VDIF_NOERROR); +} + +uint64_t time2mjdsec(time_t time) { + return ((uint64_t)UNIXZERO_MJD*24*60*60 + (uint64_t)time); + +} +int setVDIFTime(vdif_header *header, time_t time) { + int epoch; + struct tm t; + + gmtime_r(&time, &t); + epoch = (t.tm_year-100)*2; + if (epoch<0) // Year is year since 2000 + return(VDIF_ERROR); + if (t.tm_mon>=6) { + epoch++; + } + epoch %= 32; + header->epoch = epoch; + + uint64_t mjdsec = time2mjdsec(time); + setVDIFMJDSec(header, mjdsec); + + return(VDIF_NOERROR); +} diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.h --- bl-dspsr-0+git20160405/Kernel/Formats/emerlin/vdifio.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/emerlin/vdifio.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,145 @@ +/*************************************************************************** + * Copyright (C) 2009-2013 by Adam Deller / Walter Brisken * + * * + * This program is free software; you can redistribute it and/or 
modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 3 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ +//=========================================================================== +// SVN properties (DO NOT CHANGE) +// +// $Id: vdifio.h 5240 2013-04-09 16:33:24Z WalterBrisken $ +// $HeadURL: https://svn.atnf.csiro.au/difx/libraries/vdifio/trunk/src/vdifio.h $ +// $LastChangedRevision: 5240 $ +// $Author: WalterBrisken $ +// $LastChangedDate: 2013-04-09 10:33:24 -0600 (Tue, 09 Apr 2013) $ +// +//============================================================================ + +#ifndef __VDIFIO_H__ +#define __VDIFIO_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define VDIF_HEADER_BYTES 32 +#define VDIF_LEGACY_HEADER_BYTES 16 +#define MAX_VDIF_FRAME_BYTES 9032 +#define MAX_VDIF_THREADS 1024 + +#define VDIF_NOERROR 0 +#define VDIF_ERROR 1 + +typedef struct vdif_header { + uint32_t seconds : 30; + uint32_t legacymode : 1; + uint32_t invalid : 1; + uint32_t frame : 24; + uint32_t epoch : 6; + uint32_t unassigned : 2; + uint32_t framelength8 : 24; + uint32_t nchan : 5; + uint32_t version : 3; + uint32_t stationid : 16; + uint32_t threadid : 10; + uint32_t nbits : 5; + uint32_t iscomplex : 1; + uint32_t eversion : 8; + uint32_t extended1 : 24; + uint32_t extended2; + uint32_t extended3; + 
uint32_t extdended4; + } vdif_header; + +/* Date manipulation functions */ +int ymd2doy(int yr, int mo, int day); +int ymd2mjd(int yr, int mo, int day); + +/* Function to completely fill header struct, returns non-zero on error */ +int createVDIFHeader(vdif_header *header, int framelength, int threadid, int bits, int nchan, + int iscomplex, char stationid[3]); + +/* Functions to grab just one value from the raw header */ +static inline int getVDIFThreadID(const vdif_header *header) { return (int)header->threadid; } +static inline int getVDIFFrameBytes(const vdif_header *header) { return (int)(header->framelength8)*8; } +int getVDIFFrameMJD(const vdif_header *header); +double getVDIFDMJD(const vdif_header *header, int framepersec); +static inline int getVDIFFrameSecond(const vdif_header *header) { return ((int)header->seconds)%86400; } +static inline int getVDIFFrameNumber(const vdif_header *header) { return (int)header->frame; } +static inline int getVDIFStationID(const vdif_header *header) { return (int)header->stationid; } +static inline int getVDIFBitsPerSample(const vdif_header *header) { return ((int)header->nbits+1); } +int getVDIFNumChannels(const vdif_header *header); +static inline int getVDIFFrameInvalid(const vdif_header *header) { return (int)header->invalid; } +static inline int getVDIFFullSecond(const vdif_header *header) { return (int)header->seconds; } +static inline int getVDIFEpoch(const vdif_header *header) { return (int)header->epoch; } + +/* Functions to set just one value from a raw header */ +void setVDIFFrameMJD(vdif_header *header, int framemjd); +void setVDIFMJDSec(vdif_header *header, uint64_t mjdsec); +static inline void setVDIFFrameSecond(vdif_header *header, int framesecond) { header->seconds = framesecond; } +static inline void setVDIFFrameNumber(vdif_header *header, int framenumber) { header->frame = framenumber; } +static inline void setVDIFFrameInvalid(vdif_header *header, unsigned int invalid) { header->invalid = invalid; } +void 
setVDIFFrameBytes(vdif_header *header, int bytes); +void setVDIFNumChannels(vdif_header *header, int numchannels); +void setVDIFThreadID(vdif_header *header, int threadid); +int setVDIFTime(vdif_header *header, time_t time); +void setVDIFEpoch(vdif_header *header, int mjd); +int nextVDIFHeader(vdif_header *header, int framepersec); + + +struct vdif_mux_statistics { + /* The first 8 accumulate over multiple calls to vdifmux */ + long long nValidFrame; /* number of valid VDIF input frames encountered */ + long long nInvalidFrame; /* number of real VDIF frames discarded because of invalid bit being set */ + long long nDiscardedFrame; /* number of valid input frames discarded because of out-of-order issues */ + long long nWrongThread; /* number of otherwise good frames with incorrect thread */ + long long nSkippedByte; /* number of bytes skipped (interloper frames) */ + long long nFillByte; /* counts number of bytes skipped that were identified as fill pattern */ + long long nDuplicateFrame; /* number of frames found with the same time & thread */ + long long bytesProcessed; /* total bytes consumed from */ + long long nGoodFrame; /* number of fully usable output frames */ + int nCall; /* how many calls to vdifmux since last reset */ + + /* These remaining fields are set each time */ + int srcSize; /* length of input array (bytes) */ + int srcUsed; /* amount of input array consumed (bytes) */ + int destSize; /* length of output array (bytes) */ + int destUsed; /* amount of output array populated */ + int inputFrameSize; /* length in bytes of one input data frame (provided to call) */ + int outputFrameSize; /* length in bytes of one output data frame (calculated) */ + int outputFrameGranularity; /* number of output frames required to make an integer number of nanoseconds */ + int outputFramesPerSecond; /* from call */ + int nOutputFrame; /* length of usable output data measured in frames */ + int epoch; /* from first header */ + long long startFrameNumber; + + /* start 
time of output data */ + /* duration of output data */ +}; + +int vdifmux(unsigned char *dest, int nFrame, const unsigned char *src, int length, int inputFrameSize, int inputFramesPerSecond, int nBit, int nThread, const int *threadIds, int nSort, int nGap, long long startOutputFrameNumber, struct vdif_mux_statistics *stats); + +void printvdifmuxstatistics(const struct vdif_mux_statistics *stats); + +void resetvdifmuxstatistics(struct vdif_mux_statistics *stats); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/File_registry.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/File_registry.C --- bl-dspsr-0+git20160405/Kernel/Formats/File_registry.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/File_registry.C 2018-03-12 23:02:35.000000000 +0000 @@ -125,6 +125,11 @@ static dsp::File::Register::Enter register_mark5; #endif +#if HAVE_mark5b +#include "dsp/Mark5bFile.h" +static dsp::File::Register::Enter register_mark5b; +#endif + #if HAVE_maxim #include "dsp/MaximFile.h" static dsp::File::Register::Enter register_maxim; @@ -180,6 +185,12 @@ static dsp::File::Register::Enter register_spigot; #endif + +#if HAVE_emerlin +#include "dsp/EmerlinFile.h" +static dsp::File::Register::Enter register_emerlin; +#endif + #if HAVE_vdif #include "dsp/VDIFFile.h" static dsp::File::Register::Enter register_vdif; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/dsp/FITSOutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/dsp/FITSOutputFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/fits/dsp/FITSOutputFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/dsp/FITSOutputFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -54,6 +54,12 @@ //! Set the output filename convention void set_atnf ( bool ); + //! Set output mangling + void set_mangle_output ( bool ); + + //! Set length of output file (seconds) + void set_max_length( double ); + protected: //! 
Need a custom implementation of operation to handle FITS I/O @@ -71,6 +77,9 @@ //! Write nbyte bytes with cfitsio virtual int64_t unload_bytes (const void* buffer, uint64_t bytes); + //! Interface to CFITSIO with error checking and bookkeeping + unsigned char* write_bytes (int colnum, int isub, int offset, unsigned bytes_to_write, unsigned char** buffer); + //! samples per block (FITS row) unsigned nsblk; @@ -89,6 +98,9 @@ //! convenience store channel nuumber unsigned nchan; + //! maximum length of output file + double max_length; + //! buffer for channels weights std::vector dat_wts; @@ -113,7 +125,10 @@ unsigned offset; //! keep track of bytes written so far - uint64_t written; + int64_t written; + + //! optional maximum bytes per file + int64_t max_bytes; //! set up buffers, etc. void initialize (); @@ -124,6 +139,10 @@ //! Use ATNF datestr convention bool use_atnf; + //! Use a mangled file name for output; rename on file close + bool mangle_output; + std::string mangled_output_filename; + }; } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSDigitizer.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSDigitizer.C --- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSDigitizer.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSDigitizer.C 2018-03-12 23:02:35.000000000 +0000 @@ -10,6 +10,7 @@ #include "dsp/FITSDigitizer.h" #include "dsp/InputBuffering.h" #include +#include void dsp::FITSDigitizer::set_digi_scales() { @@ -375,7 +376,6 @@ // with F in inner loop case TimeSeries::OrderTFP: { -#pragma omp parallel for for (uint64_t idat=0; idat < ndat; idat++) { unsigned char* outptr = output->get_rawptr() + (idat*nchan*npol)/samp_per_byte; @@ -607,6 +607,7 @@ int bit_counter=0; unsigned inner_stride = nchan * npol; unsigned idx = 0, bit_shift = 0; // make gcc happy +#pragma omp parallel for for (unsigned ichan=0; ichan < nchan; ichan++) { unsigned mapped_chan = channel (ichan); diff -Nru 
bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSFile.C --- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -36,6 +36,7 @@ : File("FITSFile") { current_byte = 0; + zero_off = 0; } bool dsp::FITSFile::is_valid (const char* filename) const diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSOutputFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSOutputFile.C --- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSOutputFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSOutputFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -24,6 +24,7 @@ #include "psrfitsio.h" #include +#include using namespace std; @@ -104,7 +105,9 @@ nbblk = 0; nbit = 2; - use_atnf = true; + use_atnf = false; + mangle_output = false; + max_length = 0; max_bytes = 0; } dsp::FITSOutputFile::~FITSOutputFile () @@ -112,11 +115,30 @@ finalize_fits (); } +unsigned char* dsp::FITSOutputFile::write_bytes (int colnum, int isub, int offset, unsigned bytes_to_write, unsigned char** buffer) { + int status = 0; + fits_write_col_byt (fptr, colnum, isub, offset, bytes_to_write, *buffer, &status); + if (status) + throw FITSError(status,"dsp::FITSOutputFile::write_bytes"); + written += bytes_to_write; + return (*buffer += bytes_to_write); // advance the caller's pointer and return it; a non-void function must not fall off its end +} + void dsp::FITSOutputFile::set_atnf (bool _use_atnf) { use_atnf = _use_atnf; } +void dsp::FITSOutputFile::set_mangle_output (bool _mangle_output) +{ + mangle_output = _mangle_output; +} + +void dsp::FITSOutputFile::set_max_length ( double _max_length ) +{ + max_length = _max_length; +} + void dsp::FITSOutputFile::set_nsblk (unsigned nblk) { if ( fptr && (nblk != nsblk) ) @@ -280,8 +302,8 @@ // set_model must be called after the Integration::MJD has been set - //archive-> set_filename (get_filename (phase)); - if
(output_filename.empty()) + // if using a maximum file size, re-generate file name + if (output_filename.empty() || max_bytes) { MJD epoch = get_input()->get_start_time(); vector buffer (FILENAME_MAX); @@ -303,8 +325,17 @@ "error MJD::datestr("+datestr_pattern+")"); } output_filename = filename + get_extension(); + if (mangle_output) + { + char buff [L_tmpnam]; + tmpnam(buff); + mangled_output_filename = output_filename + (buff+strlen(buff)-6); + } } - archive -> unload (output_filename); + if (mangle_output) + archive -> unload (mangled_output_filename); + else + archive -> unload (output_filename); } void dsp::FITSOutputFile::write_row () @@ -347,8 +378,16 @@ } } + // reset bytes written and current row, etc. + written = 0; + isub = 0; + offset = 0; + int status = 0; - fits_open_file (&fptr,output_filename.c_str(), READWRITE, &status); + if (mangle_output) + fits_open_file (&fptr,mangled_output_filename.c_str(), READWRITE, &status); + else + fits_open_file (&fptr,output_filename.c_str(), READWRITE, &status); if (status) throw FITSError (status, "dsp::FITSOutputFile::initialize", "unable to open FITS file for writing"); @@ -392,6 +431,13 @@ // TODO -- will need to fix this later on psrfits_update_key (fptr, "NSUBOFFS", 0); + + max_bytes = max_length*get_input()->get_rate() / (8/nbit) * nchan * npol; + if ( max_bytes && (max_bytes < nbblk) ) + throw Error (InvalidState, "must set maximum file size > data block size (1 FITS row)" ); + if ( max_bytes && (max_bytes % nbblk != 0)) + cerr << "WARNING: maximum file size is not an integer number of data blocks; output files will not be contiguous under PSRFITS conventions." 
<< endl; + } void dsp::FITSOutputFile::operation () @@ -404,6 +450,27 @@ if (verbose) cerr << "dsp::FITSOutputFile::operation" << endl; + + // should handle both case where data block is larger than maximum file + // size and more typical case where file ends within a data block + if (max_bytes) + { + int64_t nbytes = get_input()->get_nbytes(); + if (nbytes == 0) return; + nbytes -= unload_bytes (get_input()->get_rawptr(), + std::min(max_bytes - written, nbytes)); + while (nbytes) + { + finalize_fits (); + write_header (); + initialize (); + // NB written will == 0 here + nbytes -= unload_bytes (get_input()->get_rawptr(), + std::min(max_bytes - written, nbytes)); + } + return; + } + unload_bytes (get_input()->get_rawptr(), get_input()->get_nbytes()); } @@ -424,7 +491,6 @@ << " buffer=" << void_buffer << endl; unsigned to_write = bytes; - int status = 0; int colnum = dsp::get_colnum (fptr, "DATA"); // write to incomplete block first @@ -437,9 +503,7 @@ // finish remainder of subint if (bytes >= remainder) { - fits_write_col_byt (fptr, colnum, isub, offset, remainder, - buffer, &status); - buffer += remainder; + write_bytes (colnum, isub, offset, remainder, &buffer); to_write -= remainder; offset = 0; } @@ -447,9 +511,7 @@ // write all available bytes without advancing subint else { - fits_write_col_byt (fptr, colnum, isub, offset, bytes, - buffer, &status); - written += bytes; + write_bytes (colnum, isub, offset, bytes, &buffer); offset += bytes; return bytes; } @@ -467,9 +529,8 @@ write_row (); // Now write that data into a subintegration in the PSRFITS file - fits_write_col_byt (fptr, colnum, isub, 1, nbblk, buffer, &status); + write_bytes (colnum, isub, 1, nbblk, &buffer); to_write -= nbblk; - buffer += nbblk; } // write out remaining bytes to partial subbint @@ -477,7 +538,7 @@ { isub += 1; write_row(); - fits_write_col_byt (fptr, colnum, isub, 1, to_write, buffer, &status); + write_bytes (colnum, isub, 1, to_write, &buffer); offset += to_write; } @@ -491,12 
+552,20 @@ cerr << "dsp::FITSOutputFile::finalize_fits" << endl; if (fptr) { psrfits_update_key (fptr, "NAXIS2", isub); - psrfits_update_key (fptr, "NSTOT", written * (8/nbit) ); + int nstot = (written*8)/(npol * nchan * nbit); + psrfits_update_key (fptr, "NSTOT", nstot ); + int nsuboffs = get_input()->get_input_sample()/nsblk - written/nbblk; + psrfits_update_key (fptr, "NSUBOFFS", nsuboffs); int status = 0; fits_close_file(fptr, &status); - if (status) - throw FITSError(status, "dsp::FITSOutputFile"); fptr = NULL; + if (status) + throw FITSError(status, "dsp::FITSOutputFile::finalize_fits"); + if (mangle_output) + { + if (rename( mangled_output_filename.c_str(), output_filename.c_str())) + throw Error(FailedSys, "dsp::FITSOutputFile::finalize_fits"); + } } } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/fits/FITSUnpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/FITSUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -53,14 +53,14 @@ void dsp::FITSUnpacker::unpack() { - if (verbose) { - cerr << "dsp::FITSUnpacker::unpack" << endl; - } // Allocate mapping method to use depending on how many bits per value. BitNumberFn p; const unsigned nbit = input->get_nbit(); + if (verbose) + cerr << "dsp::FITSUnpacker::unpack with nbit=" << nbit << endl; + switch (nbit) { case 1: p = &dsp::FITSUnpacker::oneBitNumber; @@ -83,6 +83,13 @@ const unsigned nchan = input->get_nchan(); const unsigned ndat = input->get_ndat(); + // Make sure scales and offsets exist + if (dat_scl.size() == 0) + { + dat_scl.assign(nchan,1); + dat_offs.assign(nchan,0); + } + // Number of samples in one byte. 
const int samples_per_byte = BYTE_SIZE / nbit; const int mod_offset = samples_per_byte - 1; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/fits/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/fits/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/fits/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -11,9 +11,9 @@ ############################################################################# # -libfits_la_LIBADD = @CFITSIO_LIBS@ +libfits_la_LIBADD = @CFITSIO_LIBS@ -lgomp include $(top_srcdir)/config/Makefile.include -AM_CPPFLAGS += @CFITSIO_CFLAGS@ +AM_CPPFLAGS += @CFITSIO_CFLAGS@ -fopenmp diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h,v $ - $Revision: 1.1 $ - $Date: 2011/05/08 07:02:00 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/gmrt/dsp/GMRTBinaryFile.h #ifndef __GMRTBinaryFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h --- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h,v $ - $Revision: 1.2 $ - $Date: 
2009/03/10 06:26:05 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbank16.h #ifndef __GMRTFilterbank16_h #define __GMRTFilterbank16_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h,v $ - $Revision: 1.1 $ - $Date: 2009/03/02 17:27:35 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/gmrt/dsp/GMRTFilterbankFile.h #ifndef __GMRTFilterbankFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h,v $ - $Revision: 1.2 $ - $Date: 2011/07/15 04:55:14 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/gmrt/dsp/GMRTUnpacker.h #ifndef __GMRTUnpacker_h #define __GMRTUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/guppi/fitshead.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/fitshead.h --- bl-dspsr-0+git20160405/Kernel/Formats/guppi/fitshead.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/fitshead.h 2018-03-12 23:02:35.000000000 +0000 @@ -109,7 +109,8 @@ char* hgetc( /* Return pointer to value for FITS keyword */ const 
char* hstring, /* FITS header string */ - const char* keyword); /* FITS keyword */ + const char* keyword, /* FITS keyword */ + char * value_buffer); /* caller provided buffer to make re-entrant */ char* ksearch( /* Return pointer to keyword in FITS header */ const char* hstring, /* FITS header string */ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/guppi/hget.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/hget.c --- bl-dspsr-0+git20160405/Kernel/Formats/guppi/hget.c 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/guppi/hget.c 2018-03-12 23:02:35.000000000 +0000 @@ -42,7 +42,7 @@ * Subroutine: hgetm (hstring,keyword, lstr, str) returns multi-keyword string * Subroutine: hgetdate (hstring,keyword,date) returns date as fractional year * Subroutine: hgetndec (hstring, keyword, ndec) returns number of dec. places - * Subroutine: hgetc (hstring,keyword) returns character string + * Subroutine: hgetc (hstring,keyword,value_buffer) returns character string * Subroutine: blsearch (hstring,keyword) returns pointer to blank lines before keyword * Subroutine: ksearch (hstring,keyword) returns pointer to header string entry @@ -179,9 +179,10 @@ int lval; char *dchar; char val[VLENGTH+1]; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII to binary */ if (value != NULL) { @@ -239,9 +240,10 @@ int lval; char *dchar; char val[VLENGTH+1]; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII to binary */ if (value != NULL) { @@ -296,9 +298,10 @@ int lval; char *dchar; char val[VLENGTH+1]; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); 
/* translate value from ASCII to binary */ if (value != NULL) { @@ -342,9 +345,10 @@ double *dval; /* Right ascension in degrees (returned) */ { char *value; + char value_buffer[VLENGTH + 1]; /* Get value from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII colon-delimited string to binary */ if (value != NULL) { @@ -371,9 +375,10 @@ double *dval; /* Right ascension in degrees (returned) */ { char *value; + char value_buffer[VLENGTH + 1]; /* Get value from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII colon-delimited string to binary */ if (value != NULL) { @@ -434,9 +439,10 @@ int lval; char *dchar; char val[VLENGTH+1]; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII to binary */ if (value != NULL) { @@ -483,9 +489,10 @@ char newval; int lval; char val[VLENGTH+1]; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII to binary */ if (value != NULL) { @@ -528,9 +535,10 @@ int year, month, day, yday, i, hours, minutes; //static int mday[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; int mday[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Translate value from ASCII to binary */ if (value != NULL) { @@ -721,10 +729,11 @@ /* Loop through sequentially-named keywords */ multiline = 1; for (ikey = 1; ikey < 500; ikey++) { + char value_buffer[VLENGTH + 1]; sprintf (keywordi, keyform, keyword, ikey); /* Get value for this keyword */ - value = hgetc (hstring, 
keywordi); + value = hgetc (hstring, keywordi, value_buffer); if (value != NULL) { lval = strlen (value); if (lval < lstri) @@ -803,9 +812,10 @@ { char *value; int lval; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); if (value != NULL) { lval = strlen (value); @@ -838,9 +848,10 @@ { char *value; int i, nchar; + char value_buffer[VLENGTH + 1]; /* Get value and comment from header string */ - value = hgetc (hstring,keyword); + value = hgetc(hstring, keyword, value_buffer); /* Find end of string and count backward to decimal point */ *ndec = 0; @@ -861,7 +872,7 @@ /* Extract character value for variable from FITS header string */ char * -hgetc (hstring,keyword0) +hgetc (hstring,keyword0,value_buffer) const char *hstring; /* character string containing FITS header information in the format = {/ } */ @@ -870,9 +881,10 @@ line beginning with this string. if "[n]" is present, the n'th token in the value is returned. (the first 8 characters must be unique) */ +char * value_buffer; { //static char cval[80]; - char cval[80]; + char *cval; char *value; char cwhite[2]; char squot[2], dquot[2], lbracket[2], rbracket[2], slash[2], comma[2]; @@ -890,6 +902,11 @@ if( !use_saolib ){ #endif + if (value_buffer == NULL) + { + return NULL; + } + cval = value_buffer; squot[0] = (char) 39; squot[1] = (char) 0; @@ -1326,7 +1343,7 @@ { double dec; /* Declination in degrees (returned) */ - double deg, min, sec, sign; + double deg, min=0.0, sec, sign; char *value, *c1, *c2; int lval; char *dchar; @@ -1921,4 +1938,5 @@ * Feb 28 2007 If header length is not set in hlength, set it to 0 * May 31 2007 Add return value of 3 to isnum() if string has colon(s) * Aug 22 2007 If closing quote not found, make one up + * Sep 6 2016 Added third arg to hgetc() to correct a 'return ptr to stack' issue. 
*/ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h --- bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpackerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,45 @@ +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __dsp_MeerKATUnpackerCUDA_h +#define __dsp_MeerKATUnpackerCUDA_h + +#include "dsp/MeerKATUnpacker.h" + +#include + +namespace CUDA +{ + + class MeerKATUnpackerEngine : public dsp::MeerKATUnpacker::Engine + { + public: + + //! Default Constructor + MeerKATUnpackerEngine (cudaStream_t stream); + + void setup (); + + bool get_device_supported (dsp::Memory* memory) const; + + void set_device (dsp::Memory* memory); + + void unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output); + + protected: + + cudaStream_t stream; + + struct cudaDeviceProp gpu; + + dsp::BitSeries staging; + + }; +} + + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/kat/dsp/MeerKATUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/dsp/MeerKATUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,80 @@ +/* + + */ + +#ifndef __dsp_MeerKATUnpacker_h +#define __dsp_MeerKATUnpacker_h + +#include "dsp/EightBitUnpacker.h" + +namespace dsp { + + class MeerKATUnpacker : public HistUnpacker + { + public: + + //! 
Constructor + MeerKATUnpacker (const char* name = "MeerKATUnpacker"); + ~MeerKATUnpacker (); + + bool get_order_supported (TimeSeries::Order order) const; + void set_output_order (TimeSeries::Order order); + + + unsigned get_output_offset (unsigned idig) const; + unsigned get_output_ipol (unsigned idig) const; + unsigned get_output_ichan (unsigned idig) const; + + //! Cloner (calls new) + virtual MeerKATUnpacker * clone () const; + + //! Return true if the unpacker can operate on the specified device + bool get_device_supported (Memory*) const; + + //! Set the device on which the unpacker will operate + void set_device (Memory*); + + //! Engine used to unpack the data + class Engine; + + void set_engine (Engine*); + + protected: + + //! Interface to alternate processing engine (e.g. GPU) + Reference::To engine; + + Reference::To table; + + //! Return true if we can convert the Observation + bool matches (const Observation* observation); + + void unpack (); + + private: + + bool device_prepared; + + int8_t * tfp_buffer; + + size_t tfp_buffer_size; + + }; + + class MeerKATUnpacker::Engine : public Reference::Able + { + public: + + virtual void setup() = 0; + + virtual void unpack(float scale, const BitSeries * input, TimeSeries * output) = 0; + + virtual bool get_device_supported (Memory* memory) const = 0; + + virtual void set_device (Memory* memory) = 0; + + }; + +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7Unpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7Unpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7Unpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7Unpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -127,7 +127,7 @@ for (unsigned ipol=0; ipolget_datptr (ichan, ipol) + (iblock*256); - for (unsigned isamp=0; isamp<256; isamp++) + for (unsigned isamp=0; isamp<256; isamp++) { into[isamp] = (float) from[isamp]; } diff -Nru 
bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7UnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7UnpackerCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Formats/kat/KAT7UnpackerCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/KAT7UnpackerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -13,11 +13,9 @@ #include -//#define _DEBUG - using namespace std; -void check_error (const char*); +void check_error_stream (const char*, cudaStream_t); // each thread unpacks samples so that 1 warp does 128 contiguous samples __global__ void kat7_unpack_fpt_kernel (const uint64_t ndat, float scale, const int16_t * input, cuFloatComplex * output) @@ -83,7 +81,7 @@ // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory if (dsp::Operation::record_time || dsp::Operation::verbose) - check_error ("kat7_unpack_fpt_kernel"); + check_error_stream ("kat7_unpack_fpt_kernel", stream); // put it here for now cudaStreamSynchronize(stream); diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/kat/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -1,14 +1,14 @@ noinst_LTLIBRARIES = libkat.la -nobase_include_HEADERS = dsp/KAT7Unpacker.h +nobase_include_HEADERS = dsp/KAT7Unpacker.h dsp/MeerKATUnpacker.h -libkat_la_SOURCES = KAT7Unpacker.C +libkat_la_SOURCES = KAT7Unpacker.C MeerKATUnpacker.C if HAVE_CUDA -nobase_include_HEADERS += dsp/KAT7UnpackerCUDA.h -libkat_la_SOURCES += KAT7UnpackerCUDA.cu +nobase_include_HEADERS += dsp/KAT7UnpackerCUDA.h dsp/MeerKATUnpackerCUDA.h +libkat_la_SOURCES += KAT7UnpackerCUDA.cu MeerKATUnpackerCUDA.cu endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpacker.C 
--- bl-dspsr-0+git20160405/Kernel/Formats/kat/MeerKATUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/kat/MeerKATUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,262 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "dsp/MeerKATUnpacker.h" +#include "dsp/BitTable.h" + +#include "Error.h" + +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include "dsp/MeerKATUnpackerCUDA.h" +#include +#endif + +#include +#include + +using namespace std; + +static void* const undefined_stream = (void *) -1; + +dsp::MeerKATUnpacker::MeerKATUnpacker (const char* _name) : HistUnpacker (_name) +{ + if (verbose) + cerr << "dsp::MeerKATUnpacker ctor" << endl; + + set_nstate (256); + + table = new BitTable (8, BitTable::TwosComplement); + + device_prepared = false; + + engine = 0; + + tfp_buffer = 0; + tfp_buffer_size = 0; + +} + +dsp::MeerKATUnpacker::~MeerKATUnpacker () +{ +} + +//! Return true if the unpacker support the specified output order +bool dsp::MeerKATUnpacker::get_order_supported (TimeSeries::Order order) const +{ + //return ((order == TimeSeries::OrderFPT) || (order == TimeSeries::OrderTFP)); + return (order == TimeSeries::OrderFPT); +} + +//! Set the order of the dimensions in the output TimeSeries +void dsp::MeerKATUnpacker::set_output_order (TimeSeries::Order order) +{ + output_order = order; +} + + +/*! The quadrature components are offset by one */ +unsigned dsp::MeerKATUnpacker::get_output_offset (unsigned idig) const +{ + return idig % 2; +} + +/*! The first two digitizer channels are poln0, the last two are poln1 */ +unsigned dsp::MeerKATUnpacker::get_output_ipol (unsigned idig) const +{ + return (idig % 4) / 2; +} + +/*! 
Each chan has 4 values (quadrature, dual pol) */ +unsigned dsp::MeerKATUnpacker::get_output_ichan (unsigned idig) const +{ + return idig / 4; +} + +dsp::MeerKATUnpacker * dsp::MeerKATUnpacker::clone () const +{ + return new MeerKATUnpacker (*this); +} + +void dsp::MeerKATUnpacker::set_engine (Engine* _engine) +{ + engine = _engine; +} + +//! Return true if the unpacker can operate on the specified device +bool dsp::MeerKATUnpacker::get_device_supported (Memory* memory) const +{ + // create a temporary engine in the default stream +#if HAVE_CUDA + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + if (gpu_mem) + { + CUDA::MeerKATUnpackerEngine * tmp = new CUDA::MeerKATUnpackerEngine(0); + return tmp->get_device_supported (memory); + } + else +#endif + { + return false; + } +} + +//! Set the device on which the unpacker will operate +void dsp::MeerKATUnpacker::set_device (Memory* memory) +{ +#if HAVE_CUDA + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + if (gpu_mem) + { + cudaStream_t stream = gpu_mem->get_stream(); + set_engine (new CUDA::MeerKATUnpackerEngine(stream)); + } +#endif + + if (verbose) + cerr << "dsp::MeerKATUnpacker::set_device" << endl; + if (engine) + { + engine->set_device(memory); + engine->setup(); + } + else + Unpacker::set_device (memory); + device_prepared = true; +} + +bool dsp::MeerKATUnpacker::matches (const Observation* observation) +{ + return observation->get_machine() == "MeerKAT" + && observation->get_ndim() == 2 + && (observation->get_npol() == 2 || observation->get_npol() == 1) + && observation->get_nbit() == 8; +} + +void dsp::MeerKATUnpacker::unpack () +{ + const uint64_t ndat = input->get_ndat(); + if (ndat == 0) + return; + + if (engine) + { + if (verbose) + cerr << "dsp::MeerKATUnpacker::unpack using Engine" << endl; + engine->unpack(table->get_scale(), input, output); + return; + } + + // some programs (digifil) do not call set_device + if ( ! 
device_prepared ) + set_device ( Memory::get_manager ()); + + int16_t * from = (int16_t *) input->get_rawptr(); + int16_t from16; + int8_t * from8 = (int8_t * ) &from16; + float * into; + const float scale = table->get_scale(); + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + const unsigned ndim = 2; + const unsigned nsamp_per_heap = 256; + const unsigned nheap = ndat / nsamp_per_heap; + const float* lookup = table->get_values (); + + // data is stored as sample blocks of FPT ordered data + const uint64_t nval = nsamp_per_heap * ndim; + + if (verbose) + cerr << "dsp::MeerKATUnpacker::unpack nheap=" << nheap << " ndat=" << ndat << " nchan=" << nchan + << " npol=" << npol << " nval=" << nval << endl; + + unsigned long * digs[2]; + + switch ( output->get_order() ) + { + case TimeSeries::OrderFPT: + { +#ifdef _DEBUG + cerr << "dsp::MeerKATUnpacker::unpack TimeSeries::OrderFPT" << endl; +#endif + for (unsigned iheap=0; iheapget_datptr (ichan, ipol) + iheap*nsamp_per_heap * ndim; + + for (unsigned isamp=0; isampget_dattfp(); + const unsigned heap_stride = nchan * npol * ndim * nsamp_per_heap; + const unsigned into_stride = nchan * npol * ndim; + for (unsigned iheap=0; iheap + +using namespace std; + +void check_error_stream (const char*, cudaStream_t); + +// each thread unpacks 1 complex sample +__global__ void meerkat_unpack_fpt_kernel (const uint64_t ndat, float scale, const char2 * input, cuFloatComplex * output, uint64_t ostride) +{ + // blockIdx.x is the heap number, threadIdx.x is the sample number in the heap + const uint64_t idat = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idat >= ndat) + return; + + const unsigned ichanpol = blockIdx.y * gridDim.z + blockIdx.z; // ichan * npol + ipol + const unsigned pol_stride = gridDim.y * blockDim.x; // nchan * heap_size + const unsigned heap_stride = gridDim.z * pol_stride; // npol * pol_stride + + // iheap ipol ichan * heap_size + const uint64_t idx = (blockIdx.x * heap_stride) 
+ (blockIdx.z * pol_stride) + (blockIdx.y * blockDim.x) + threadIdx.x; + const uint64_t odx = (ichanpol * ostride) + idat; + + char2 in16 = input[idx]; + + cuFloatComplex out64; + out64.x = ((float) in16.x + 0.5) * scale; + out64.y = ((float) in16.y + 0.5) * scale; + + output[odx] = out64; +} + +CUDA::MeerKATUnpackerEngine::MeerKATUnpackerEngine (cudaStream_t _stream) +{ + stream = _stream; +} + +void CUDA::MeerKATUnpackerEngine::setup () +{ + // determine cuda device properties for block & grid size + int device; + cudaGetDevice(&device); + cudaGetDeviceProperties (&gpu, device); +} + +bool CUDA::MeerKATUnpackerEngine::get_device_supported (dsp::Memory* memory) const +{ + return dynamic_cast< CUDA::DeviceMemory*> ( memory ); +} + +void CUDA::MeerKATUnpackerEngine::set_device (dsp::Memory* memory) +{ + //CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + //staging.set_memory (gpu_mem); +} + + +void CUDA::MeerKATUnpackerEngine::unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output) +{ + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); + +#ifdef _DEBUG + cerr << "CUDA::MeerKATUnpackerEngine::unpack scale=" << scale + << " ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim + << " npol=" << npol << endl; +#endif + + // copy from CPU Bitseries to GPU staging Bitseries + char2 * from = (char2 *) input->get_rawptr(); + + cuFloatComplex * into = (cuFloatComplex *) output->get_datptr(0, 0); + size_t pol_span = (output->get_datptr(0, 1) - output->get_datptr(0,0)) / ndim; + + if (dsp::Operation::verbose) + cerr << "CUDA::MeerKATUnpackerEngine::unpack from=" << (void *) from + << " to=" << (void *) into << " pol_span=" << pol_span << endl; + + // since 256 samples per heap + int nthread = 256; + + // each thread will unpack 4 time samples + dim3 blocks = dim3 (ndat / nthread, nchan, npol); + + if 
(ndat % nthread != 0) + blocks.x++; + +#ifdef _DEBUG + cerr << "CUDA::MeerKATUnpackerEngine::unpack meerkat_unpack ndat=" << ndat + << " scale=" << scale << " input=" << (void*) input << " nblock=(" + << blocks.x << "," << blocks.y << "," << blocks.z << ")" << " nthread=" << nthread << endl; +#endif + + meerkat_unpack_fpt_kernel<<>> (ndat, scale, from, into, pol_span); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("CUDA::MeerKATUnpackerEngine::unpack", stream); +} + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr/dsp/SMROFile.h,v $ - $Revision: 1.6 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/lbadr/dsp/SMROFile.h #ifndef __SMROFile_h #define __SMROFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h,v $ - $Revision: 1.6 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/lbadr/dsp/SMROTwoBitCorrection.h #ifndef __SMROTwoBitCorrection_h #define __SMROTwoBitCorrection_h diff -Nru 
bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_File.h --- bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_File.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_File.h,v $ - $Revision: 1.3 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_File.h #ifndef __LBADR64_File_h #define __LBADR64_File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h,v $ - $Revision: 1.1 $ - $Date: 2007/02/23 04:29:36 $ - $Author: ahotan $ */ +// dspsr/Kernel/Formats/lbadr64/dsp/LBADR64_TwoBitCorrection.h #ifndef __LBADR64_TwoBitCorrection_h #define __LBADR64_TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/dsp/LOFAR_DALUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/LOFAR_DALUnpacker.h,v $ - $Revision: 1.1 $ - $Date: 2011/08/01 10:07:00 $ - $Author: straten $ */ +// dspsr/Kernel/Classes/dsp/LOFAR_DALUnpacker.h #ifndef __LOFAR_DALUnpacker_h #define __LOFAR_DALUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/lofar_dal/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/lofar_dal/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -5,11 +5,12 @@ liblofar_dal_la_SOURCES = LOFAR_DALFile.C LOFAR_DALUnpacker.C -liblofar_dal_la_LIBADD = -L$(LOFAR_DAL_INSTALL_PREFIX)/lib -llofardal -lhdf5 +liblofar_dal_la_LIBADD = -L$(LOFAR_DAL_INSTALL_PREFIX)/lib -llofardal @HDF5_LDFLAGS@ @HDF5_LIBS@ ############################################################################# include $(top_srcdir)/config/Makefile.include -AM_CPPFLAGS += -I$(LOFAR_DAL_INSTALL_PREFIX)/include/dal +AM_CPPFLAGS += -I$(LOFAR_DAL_INSTALL_PREFIX)/include/dal @HDF5_CPPFLAGS@ + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -10,5 +10,5 @@ include $(top_srcdir)/config/Makefile.include -AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CFITSIO_CFLAGS@ @GUPPI_DAQ_CFLAGS@ +AM_CPPFLAGS += @PSRDADA_CFLAGS@ @CFITSIO_CFLAGS@ @GUPPI_DAQ_CFLAGS@ @CUDA_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4File.h --- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4File.h 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4File.h,v $ - $Revision: 1.5 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/mark4/dsp/Mark4File.h #ifndef __Mark4File_h #define __Mark4File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:08 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitCorrection.h #ifndef __Mark4TwoBitCorrection_h #define __Mark4TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h --- bl-dspsr-0+git20160405/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:08 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/mark4/dsp/Mark4TwoBitTable.h #ifndef __Mark4TwoBitTable_h diff -Nru 
bl-dspsr-0+git20160405/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h,v $ - $Revision: 1.5 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/mark5/dsp/Mark5TwoBitCorrection.h #ifndef __Mark5TwoBitCorrection_h #define __Mark5TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/dsp/Mark5bFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/dsp/Mark5bFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/dsp/Mark5bFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/dsp/Mark5bFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -1,7 +1,7 @@ //-*-C++-*- /*************************************************************************** * - * Copyright (C) 2015 by Stuart Weston and Willem van Straten + * Copyright (C) 2016 by Willem van Straten * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ @@ -13,7 +13,7 @@ namespace dsp { - //! Loads BitSeries data from a MkV file + //! 
Loads BitSeries data from a MkV file using the mark5access library class Mark5bFile : public BlockFile { public: diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -3,10 +3,13 @@ nobase_include_HEADERS = dsp/Mark5bFile.h dsp/Mark5bUnpacker.h -libmark5_la_SOURCES = Mark5bFile.C Mark5bUnpacker.C +libmark5b_la_SOURCES = Mark5bFile.C Mark5bUnpacker.C + +libmark5b_la_LIBADD = @MARK5ACCESS_LIBS@ ############################################################################# # include $(top_srcdir)/config/Makefile.include +AM_CPPFLAGS += @MARK5ACCESS_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bFile.C --- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -1,30 +1,23 @@ /*************************************************************************** * - * Copyright (C) 2015 by Stuart Weston and Willem van Straten + * Copyright (C) 2016 by Willem van Straten * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ -using namespace std; - #include "dsp/Mark5bFile.h" -#include "vlba_stream.h" #include "Error.h" #include "coord.h" #include "strutil.h" #include "ascii_header.h" -#include +#include -#include -#include +#include #include -#include -#include -#include -#include -#include + +using namespace std; dsp::Mark5bFile::Mark5bFile (const char* filename,const char* headername) : BlockFile ("Mark5b") @@ -43,8 +36,10 @@ headername += ".hdr"; FILE* fptr = fopen 
(headername.c_str(), "r"); - if( !fptr ) { - if (verbose) cerr << "Mark5bFile: no hdr file (" << headername << ")" << endl; + if( !fptr ) + { + if (verbose) + cerr << "Mark5bFile: no hdr file (" << headername << ")" << endl; return false; } @@ -52,8 +47,8 @@ fread (header.get(), sizeof(char),1024, fptr); fclose (fptr); - int dummy_fanout = 0; - if (ascii_header_get (header.get(), "FANOUT", "%d", &dummy_fanout) < 0) + char dummy_format[64]; + if (ascii_header_get (header.get(), "FORMAT", "%d", &dummy_format) < 0) return false; return true; @@ -76,48 +71,51 @@ fclose (ftext); // /////////////////////////////////////////////////////////////// - // NBIT + // FORMAT // - int nbit = 0; - if (ascii_header_get (header,"NBIT","%d",&nbit) < 0) + char format[64]; + if (ascii_header_get (header,"FORMAT","%s",&format) < 0) throw Error (InvalidParam, "Mark5bFile::open_file", - "failed read NBIT"); + "failed read FORMAT"); - cerr << "NBIT = " << nbit << endl; - get_info()->set_nbit (nbit); - + cerr << "FORMAT = " << format << endl; - // /////////////////////////////////////////////////////////////// - // FANOUT - // - int fanout = 0; - if (ascii_header_get (header,"FANOUT","%d",&fanout) < 0) - throw Error (InvalidParam, "Mark5bFile::open_file", - "failed read FANOUT"); - - cerr << "FANOUT = " << fanout << endl; + /* From the mark5access library documentation: - struct VLBA_stream* vlba_stream = 0; + 3.3.1 struct mark5_format_generic* + new_mark5_format_from_string(const char *formatname) - stream = vlba_stream = VLBA_stream_open (filename, nbit, fanout, 0); - - if (!stream) - throw Error (InvalidParam, "Mark5bFile::open_file", - "failed VLBA_stream_open"); + A function to create a (struct mark5_format_generic) representing + one of the built-in formats. The string pointed to by + "formatname" should be of the form: FORMAT-Mbps-nChannels-nBits. + Examples for the three formats currently built into mark5acces + include: "VLBA1_4-256-4-2", "MKIV1_2-128-8-2", + "Mark5B-1024-16-2". 
Note that the string is case insensitive. + Also note here that in the case of VLBA and Mark4 (MKIV) the + fanout is built into the FORMAT portion of "formatname". + */ + + struct mark5_format_generic* m5format = 0; + m5format = new_mark5_format_generic_from_string (format); + if (!m5format) + throw Error (FailedCall, "Mark5bFile::open_file", + "failed new_mark5_format_generic_from_string (%s)", format); fd = 0; - // instruct the loader to only take gulps in 32/16 lots of nbits - // necessary since Mk5 files are written in 64-/32-bit words - cerr << "TRACKS = " << vlba_stream->tracks << endl; - Input::resolution = vlba_stream->tracks / nbit; - - // The factor of 2 should only apply for dual-pol data. - cerr << "NCHAN = " << vlba_stream->nchan / 2 << endl; - get_info()->set_nchan( vlba_stream->nchan / 2 ); + struct mark5_stream_generic* m5file = 0; + m5file = new_mark5_stream_file (filename, 0); + if (!m5file) + throw Error (FailedCall, "Mark5bFile::open_file", + "failed new_mark5_stream_file (%s)", filename); + - cerr << "SAMPRATE = " << vlba_stream->samprate << endl; - get_info()->set_rate ( vlba_stream->samprate ); + struct mark5_stream* m5stream = new_mark5_stream (m5file,m5format); + + stream = m5stream; + + // instruct the loader to only take gulps of samplegranularity samples + Input::resolution = m5stream->samplegranularity; int refmjd = 0; if (ascii_header_get (header,"REFMJD","%d",&refmjd) < 0) @@ -125,12 +123,12 @@ "failed read REFMJD"); cerr << "REFMJD " << refmjd << endl; - vlba_stream->mjd += refmjd; + m5stream->mjd += refmjd; - cerr << "MJD = " << vlba_stream->mjd << endl; - cerr << "SEC = " << vlba_stream->sec << endl; + cerr << "MJD = " << m5stream->mjd << endl; + cerr << "SEC = " << m5stream->sec << endl; - get_info()->set_start_time( MJD(vlba_stream->mjd, vlba_stream->sec, 0) ); + get_info()->set_start_time( MJD(m5stream->mjd, m5stream->sec, 0) ); // /////////////////////////////////////////////////////////////// // TELESCOPE @@ -213,18 +211,27 @@ 
"failed read BW"); get_info()->set_bandwidth (bw); - - // /////////////////////////////////////////////////////////////// - // NPOL - // - // -- generalise this later - - get_info()->set_npol(2); // read in both polns at once + + double Mega_samples_per_second = m5stream->Mbps / m5stream->nbit; + + double npol = round( (bw * 2) / Mega_samples_per_second ); + cerr << "NPOL=" << npol << endl; + + cerr << "NCHAN = " << m5stream->nchan / npol << endl; + get_info()->set_nchan( m5stream->nchan / npol ); + + cerr << "NBIT = " << m5stream->nbit << endl; + get_info()->set_nbit ( m5stream->nbit ); + + cerr << "SAMPRATE = " << m5stream->samprate << endl; + get_info()->set_rate ( m5stream->samprate ); + + get_info()->set_npol(npol); // /////////////////////////////////////////////////////////////// // NDIM --- whether the data are Nyquist or Quadrature sampled // - // VLBA data are Nyquist sampled + // MARK5 data are Nyquist sampled get_info()->set_state (Signal::Nyquist); @@ -250,17 +257,12 @@ get_info()->set_machine("Mark5b"); } -extern "C" int next_frame (struct VLBA_stream *vs); - -/*! Uses Walter's next_frame to take care of the modbits business, then - copies the result from the VLBA_stream::frame buffer into the buffer - argument. 
*/ int64_t dsp::Mark5bFile::load_bytes (unsigned char* buffer, uint64_t bytes) { if (verbose) cerr << "Mark5bFile::load_bytes nbytes =" << bytes << endl; if (verbose) - cerr << "Mark5bFile::load_bytes leave it to VLBA_stream_get_data" << endl; + cerr << "Mark5bFile::load_bytes leave it to MARK5_stream_get_data" << endl; return bytes; } diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/mark5b/Mark5bUnpacker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mark5b/Mark5bUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -1,13 +1,14 @@ /*************************************************************************** * - * Copyright (C) 2015 by Stuart Weston and Willem van Straten + * Copyright (C) 2016 by Willem van Straten * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ #include "dsp/Mark5bUnpacker.h" #include "dsp/Mark5bFile.h" -#include "vlba_stream.h" + +#include using namespace std; @@ -43,17 +44,18 @@ throw Error (InvalidState, "dsp::Mark5bUnpacker::unpack", "Input is not a Mark5bFile"); - struct VLBA_stream* vlba_stream = (struct VLBA_stream*) file->stream; + struct mark5_stream* m5stream = (struct mark5_stream*) file->stream; float* data [npol * nchan]; + /* Stuart: this is the place in the code where we rearrange channels */ for (unsigned ipol = 0 ; ipol < npol ; ipol++) for (unsigned ichan=0; ichan < nchan; ichan++) - data[ipol + 2*ichan] = output->get_datptr(ichan,ipol); + data[ipol + npol*ichan] = output->get_datptr(ichan,ipol); - if (VLBA_stream_get_data (vlba_stream, ndat, data) < 0) + if (mark5_stream_decode(m5stream, ndat, data) < 0) throw Error (InvalidState, "dsp::Mark5bUnpacker::unpack", - "error VLBA_stream_get_data (most likely EOD)"); + "error mark5_stream_decode (most likely EOD)"); } diff -Nru 
bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/maxim/dsp/MaximFile.h,v $ - $Revision: 1.3 $ - $Date: 2008/05/28 21:12:43 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/maxim/dsp/MaximFile.h #ifndef __MaximFile_h #define __MaximFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/maxim/dsp/MaximUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/maxim/dsp/MaximUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/maxim/dsp/MaximUnpacker.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:08 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/maxim/dsp/MaximUnpacker.h #ifndef __MaximUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h --- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpackerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,34 @@ +/*************************************************************************** + * + * Copyright (C) 2013 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + 
***************************************************************************/ + +#ifndef __dsp_MOPSRUnpackerCUDA_h +#define __dsp_MOPSRUnpackerCUDA_h + +//#define USE_TEXTURE_MEMORY + +#include +#include + +#include + +void mopsr_texture_alloc (void * d_staging, size_t size); + +void mopsr_unpack_prepare (cudaStream_t stream, const float scale); + +#ifdef USE_TEXTURE_MEMORY +void mopsr_unpack (cudaStream_t stream, const uint64_t ndat, + const unsigned char* stagingBufGPU, + float* into, cudaTextureObject_t * tex); +#else +void mopsr_unpack_fpt (cudaStream_t stream, const uint64_t ndat, const unsigned nchan, + float scale, int8_t const * input, float * output); +#endif +void mopsr_unpack_tfp (cudaStream_t stream, const uint64_t ndat, const unsigned nchan, + float scale, int8_t const * input, float * output); + +#endif + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/dsp/MOPSRUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,92 @@ +/*************************************************************************** + * + * Copyright (C) 2013 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __dsp_MOPSRUnpacker_h +#define __dsp_MOPSRUnpacker_h + +#include "dsp/EightBitUnpacker.h" + +namespace dsp { + + class MOPSRUnpacker : public HistUnpacker + { + public: + + //! Constructor + MOPSRUnpacker (const char* name = "MOPSRUnpacker"); + ~MOPSRUnpacker (); + + unsigned get_output_offset (unsigned idig) const; + + unsigned get_output_ipol (unsigned idig) const; + + unsigned get_output_ichan (unsigned idig) const; + + unsigned get_ndim_per_digitizer () const; + + //! 
Cloner (calls new) + virtual MOPSRUnpacker * clone () const; + + //! Return true if the unpacker can operate on the specified device + bool get_device_supported (Memory*) const; + + //! synch with the Input resolution + void match_resolution (const Input*); + + //! Set the device on which the unpacker will operate + void set_device (Memory*); + + protected: + + Reference::To table; + + //! Return true if we can convert the Observation + bool matches (const Observation* observation); + + void unpack (); + + //! Return true if support the output order + bool get_order_supported (TimeSeries::Order order) const; + + //! Set the order of the dimensions in the output TimeSeries + virtual void set_output_order (TimeSeries::Order); + + BitSeries staging; + + void * gpu_stream; + + unsigned get_resolution ()const; + + void unpack_on_gpu (); + + private: + + void validate_transformation(); + + enum DataOrder { + //! unknown input order + NONE, + //! PFB single antenna input + TF, + //! PFB multi antenna input + FT, + //! 
Beam Formed single antenna input + T + }; + + DataOrder input_order; + + bool device_prepared; + + unsigned input_resolution; + + int debugd; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,19 @@ + +noinst_LTLIBRARIES = libmopsr.la + +nobase_include_HEADERS = dsp/MOPSRUnpacker.h + +libmopsr_la_SOURCES = MOPSRUnpacker.C + +include $(top_srcdir)/config/Makefile.include + +if HAVE_CUDA + +nobase_include_HEADERS += dsp/MOPSRUnpackerCUDA.h +libmopsr_la_SOURCES += MOPSRUnpackerCUDA.cu + +include $(top_srcdir)/config/Makefile.cuda +endif + + +AM_CPPFLAGS += @CUDA_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,620 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2013 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "dsp/MOPSRUnpacker.h" +#include "dsp/ASCIIObservation.h" +#include "dsp/BitTable.h" + +#include "Error.h" + +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include "dsp/MOPSRUnpackerCUDA.h" +#include +#endif + +#include +#include + +using namespace std; + +static void* const undefined_stream = (void *) -1; + +#ifdef _DEBUG +#define CHECK_ERROR(x) check_error(x) +#define CHECK_ERROR_STREAM(x,y) 
check_error_stream(x,y) +#else +#define CHECK_ERROR(x) +#define CHECK_ERROR_STREAM(x,y) +#endif + +#if HAVE_CUDA +void check_error (const char *); +void check_error_stream (const char *, cudaStream_t); +#endif + +dsp::MOPSRUnpacker::MOPSRUnpacker (const char* _name) : HistUnpacker (_name) +{ + if (verbose) + cerr << "dsp::MOPSRUnpacker ctor" << endl; + + gpu_stream = undefined_stream; + + table = new BitTable (8, BitTable::TwosComplement); + device_prepared = false; + + // complex input data, 1 polarisation + set_ndig (2); + set_nstate (256); + + input_order = NONE; + input_resolution = 1; + + debugd = 0; +} + +dsp::MOPSRUnpacker::~MOPSRUnpacker () +{ +} + +/*! The quadrature components must be offset by one */ +unsigned dsp::MOPSRUnpacker::get_output_offset (unsigned idig) const +{ + return idig % 2; +} + +/*! */ +unsigned dsp::MOPSRUnpacker::get_output_ipol (unsigned idig) const +{ + return 0; +} + +/*! */ +unsigned dsp::MOPSRUnpacker::get_output_ichan (unsigned idig) const +{ + return idig / 2; +} + +unsigned dsp::MOPSRUnpacker::get_ndim_per_digitizer () const +{ + return 1; +} + +//! Return true if the unpacker support the specified output order +bool dsp::MOPSRUnpacker::get_order_supported (TimeSeries::Order order) const +{ + return ((order == TimeSeries::OrderFPT) || (order == TimeSeries::OrderTFP)); +} + +//! Set the order of the dimensions in the output TimeSeries +void dsp::MOPSRUnpacker::set_output_order (TimeSeries::Order order) +{ + if (verbose) + { + if (order == TimeSeries::OrderFPT) + cerr << "dsp::MOPSRUnpacker::set_output_order (TimeSeries::OrderFPT)" << endl; + if (order == TimeSeries::OrderTFP) + cerr << "dsp::MOPSRUnpacker::set_output_order (TimeSeries::OrderTFP)" << endl; + } + output_order = order; + output->set_order (order); +} + +dsp::MOPSRUnpacker * dsp::MOPSRUnpacker::clone () const +{ + return new MOPSRUnpacker (*this); +} + +//! 
Return true if the unpacker can operate on the specified device +bool dsp::MOPSRUnpacker::get_device_supported (Memory* memory) const +{ +#if HAVE_CUDA + if (verbose) + cerr << "dsp::MOPSRUnpacker::get_device_supported HAVE_CUDA" << endl; + return dynamic_cast< CUDA::DeviceMemory*> ( memory ); +#else + return false; +#endif +} + +//! Set the device on which the unpacker will operate +void dsp::MOPSRUnpacker::set_device (Memory* memory) +{ +#if HAVE_CUDA + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + if (gpu_mem) + { + gpu_stream = (void *) gpu_mem->get_stream(); + mopsr_unpack_prepare (gpu_mem->get_stream(), (float) table->get_scale()); + if (verbose) + cerr << "dsp::MOPSRUnpacker::set_device gpu_stream=" << gpu_stream << endl; +#ifdef USE_TEXTURE_MEMORY + CUDA::TextureMemory * texture_mem = new CUDA::TextureMemory (gpu_mem->get_stream()); + if (verbose) + cerr << "dsp::MOPSRUnpacker::set_device using texture memory ptr=" << texture_mem << endl; + texture_mem->set_format_signed(8, 8, 0, 0); + + cerr << "dsp::MOPSRUnpacker::set_device staging.set_memory (texture_mem)" << endl; + staging.set_memory( texture_mem ); +#else + if (verbose) + cerr << "dsp::MOPSRUnpacker::set_device: using gpu memory" << endl; + staging.set_memory ( gpu_mem ); +#endif + } + else + { + if (verbose) + cerr << "dsp::MOPSRUnpacker::set_device: using cpu memory" << endl; + gpu_stream = undefined_stream; + } +#else + Unpacker::set_device (memory); +#endif + device_prepared = true; +} + + +bool dsp::MOPSRUnpacker::matches (const Observation* observation) +{ + if (verbose) + { + if (observation->get_state() == Signal::Analytic) + cerr << "dsp::MOPSRUnpacker::matches state=Analytic" << endl; + else if (observation->get_state() == Signal::Intensity) + cerr << "dsp::MOPSRUnpacker::matches state=Intensity" << endl; + else + cerr << "dsp::MOPSRUnpacker::matches states=" << observation->get_state() << endl; + } + + return observation->get_machine()== "MOPSR" + && 
(observation->get_state() == Signal::Analytic || observation->get_state() == Signal::Intensity) + && (observation->get_nbit() == 8 || observation->get_nbit() == 32) + && (observation->get_ndim() == 2 || observation->get_ndim() == 1) + && (observation->get_npol() == 1 || observation->get_npol() == 2); +} + + +void dsp::MOPSRUnpacker::match_resolution (const Input* input) +{ + input_resolution = input->get_resolution(); + if (verbose) + cerr << "dsp::MOPSRUnpacker::match_resolution input_resolution=" << input_resolution << endl; +} + +/*! Validate whether the unpacker can handle the combination of input order + and output order */ +void dsp::MOPSRUnpacker::validate_transformation () +{ + // see if this unpacker already knows in order of the input data + if (input_order == NONE) + { + const Input * in = input->get_loader(); + const Observation * obs = in->get_info(); + const ASCIIObservation * info = dynamic_cast(obs); + if (info) + { + char buffer[8]; + if (info->custom_header_get ("ORDER", "%s", buffer) == 1) + { + if (strcmp(buffer, "TF") == 0) + { + input_order = TF; + } + else if (strcmp(buffer, "FT") == 0) + { + input_order = FT; + } + else if (strcmp(buffer, "T") == 0) + { + cerr << "input order=T" << endl; + input_order = T; + } + else + { + throw Error (InvalidState, "dsp::MOPSRUnpacker::valid_transformation", "unrecognized input order [%s]", buffer); + } + } + } + // have an assumed order when it cannot be determined + else + { + cerr << "dsp::MOPSRUnpacker::valid_transformation could not get ASCIIObservation reference" << endl; + input_order = TF; + } + } + + const unsigned nchan = input->get_nchan(); + if ((nchan == 1) && (input_order == TF)) + throw Error (InvalidState, "dsp::MOPSRUnpacker::valid_transformation", "input order not compatible with nchan=%u", nchan); + if ((nchan != 1) && (input_order == T)) + throw Error (InvalidState, "dsp::MOPSRUnpacker::valid_transformation", "input order not compatible with nchan=%u", nchan); +} + +void 
dsp::MOPSRUnpacker::unpack () +{ + const unsigned int nbit = input->get_nbit(); + + // 32-bit data does not have a digitizer + if ((nbit == 32) && (get_ndig() != 0)) + set_ndig (0); + + // 8-bit data has a digitizer for each channel + if ((nbit == 8) && (get_ndig() != input->get_nchan() * input->get_ndim())) + set_ndig(input->get_nchan() * input->get_ndim()); + +#if HAVE_CUDA + if (gpu_stream != undefined_stream) + { + unpack_on_gpu (); + return; + } +#endif + + if (input_order == NONE) + validate_transformation(); + + const unsigned int nchan = input->get_nchan(); + const unsigned int ndim = input->get_ndim(); + const unsigned int npol = input->get_npol(); + const unsigned int ipol = 0; + const uint64_t ndat = input->get_ndat(); + + // input channel stride - distance between successive (temporal) samples from same channel + unsigned int in_chan_stride = nchan * ndim; + unsigned int out_chan_stride = ndim; + const float* lookup = table->get_values (); + + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack in_chan_stride="<< in_chan_stride << " input_resolution=" << input_resolution << endl; + + if (debugd) + cerr << "ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim << " nbit=" << nbit << endl; + + // TF order is produced by the beam-former, TFS produced by the AQ engines + if (input_order == TF) + { + if (output->get_order() == TimeSeries::OrderFPT) + { + // 32-bit floats are produced by the beam former + if (nbit == 32) + { + for (unsigned ichan=0; ichanget_rawptr()) + in_chan_off; + float* into = output->get_datptr (ichan, ipol); + for (uint64_t idat=0; idat < ndat; idat++) + { + for (unsigned idim=0; idim < ndim; idim++) + into[idim] = from[idim]; + from += in_chan_stride; + into += out_chan_stride; + } + } + } + // 8-bit signed integers products by the PFBs + else if (nbit == 8) + { + unsigned long * hists[2]; + + // transpose from TF order to FT order + for (unsigned ichan=0; ichanget_rawptr() + in_chan_off); + float* into = output->get_datptr (ichan, 
ipol); + const unsigned int nfloat = ndim; + + for (unsigned idim=0; idim < ndim; idim++) + hists[idim] = get_histogram (ndim*ichan+idim); + + for (uint64_t idat=0; idat < ndat; idat++) + { + for (unsigned idim=0; idim < ndim; idim++) + { + into[idim] = float ( from[idim] ); + hists[idim][from[idim]+128]++; + } + from += in_chan_stride; + into += out_chan_stride; + } + } + } + else + { + throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "unsupported unpacking bit width input=TF, output=FPT"); + } + } + else if (output->get_order() == TimeSeries::OrderTFP) + { + // 32-bit floats are produced by the beam former + if (nbit == 32) + { + // direct unpack from TF to TF + const float * from = (float *) input->get_rawptr(); + float * into = output->get_dattfp(); + const uint64_t nfloat = npol * nchan * ndat * ndim; + for (uint64_t ifloat=0; ifloat < nfloat; ifloat++) + { + into[ifloat] = from[ifloat]; + } + } + // 8-bit input are produced by the PFBs, Ndim == 2 + else if (nbit == 8 && ndim == 2) + { + // direct unpack from TF to TF + const unsigned char* from = input->get_rawptr(); + float* into = output->get_dattfp(); + const uint64_t nfloat = npol * nchan * ndat; + unsigned long* hist_re; + unsigned long* hist_im; + + for (uint64_t ifloat=0; ifloat < nfloat; ifloat++) + { + into[0] = lookup[ from[0] ]; + into[1] = lookup[ from[1] ]; + + unsigned ichan = ifloat % nchan; + + hist_re = get_histogram (2*ichan); + hist_im = get_histogram (2*ichan+1); + + int bin_re = int8_t(from[0]) + 128; + int bin_im = int8_t(from[1]) + 128; + + hist_re[bin_re]++; + hist_im[bin_im]++; + + into += 2; + from += 2; + } + } + else if (nbit == 8 && ndim == 1) + { + // direct unpack from TF to TF + const unsigned char* from = input->get_rawptr(); + float* into = output->get_dattfp(); + const uint64_t nfloat = npol * nchan * ndat; + unsigned long* hist = get_histogram (0); + + for (uint64_t ifloat=0; ifloat < nfloat; ifloat++) + { + into[ifloat] = lookup[ from[ifloat] ]; + } + } + else + { 
+ throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "unsupported unpacking bit width input=TF, output=TFP"); + } + } + else + { + throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "output order not suitable for input == TS"); + } + } + else if (input_order == FT) + { + if (output->get_order() == TimeSeries::OrderFPT) + { + if (nbit == 32) + { + const unsigned nfloat = input_resolution * ndim; + cerr << "dsp::MOPSRUnpacker::unpack ndat="<get_datptr (ichan, 0) + iblock * input_resolution; + for (uint64_t ifloat=0; ifloat < nfloat; ifloat++) + into[ifloat] = from[ifloat]; + from += nfloat; + } + } + } + else + { + const unsigned nval = ndat * ndim; + int8_t * from = (int8_t *) input->get_rawptr(); + for (unsigned ichan=0; ichanget_datptr (ichan, 0); + for (uint64_t ival=0; ival < nval; ival++) + into[ival] = float (from[ival]); + from += nval; + } + } + } + else if (output->get_order() == TimeSeries::OrderTFP) + { + if (nbit == 32) + { + // transpose from FT to TF + const unsigned nchandim = nchan * ndim; + float * from = (float *) input->get_rawptr(); + float * into = output->get_dattfp (); + + cerr << "dsp::MOPSRUnpacker::unpack ndat=" << ndat << " nchan=" << nchan << " ndim=" << ndim << " output TFP" << endl; + + for (unsigned ichan=0; ichanget_rawptr(); + const float * from32 = (float *) input->get_rawptr(); + float * into; + + if (output->get_order() == TimeSeries::OrderFPT) + into = output->get_datptr (0, 0); + else if (output->get_order() == TimeSeries::OrderTFP) + into = output->get_dattfp(); + else + throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack", "output order not suitable for input == T"); + + + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack ndim=" << ndim << endl; + unsigned long* hist_re = get_histogram (0); + if (ndim > 1) + unsigned long* hist_im = get_histogram (1); + const uint64_t nfloat = ndat * ndim; + + if (nbit == 32) + for (uint64_t ifloat=0; ifloat < nfloat; ifloat++) + into[ifloat] = from32[ifloat]; + else + for 
(uint64_t ifloat=0; ifloat < nfloat; ifloat++) + into[ifloat] = lookup[ from8[ifloat] ]; + } + debugd = 0; +} + +unsigned dsp::MOPSRUnpacker::get_resolution () const { return 1024; } + +#if HAVE_CUDA + +void dsp::MOPSRUnpacker::unpack_on_gpu () +{ + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); + + const uint64_t to_copy = ndat * nchan * ndim * npol; + + staging.Observation::operator=( *input ); + staging.resize(ndat); + + // staging buffer on the GPU for packed data + int8_t * d_staging = (int8_t *) staging.get_rawptr(); + const unsigned char * from = input->get_rawptr(); + float * into; + + if (ndat == 0) + { + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu ndat == 0" << endl; + return; + } + + switch ( output->get_order() ) + { + case TimeSeries::OrderFPT: + { + into = output->get_datptr(0,0); + break; + } + + case TimeSeries::OrderTFP: + { + into = output->get_dattfp(); + break; + } + + default: + { + throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack_on_gpu", "unrecognized order"); + } + break; + } + + cudaStream_t stream = (cudaStream_t) gpu_stream; + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu stream=" << stream << endl; + + cudaError error; + if (stream) + { + error = cudaMemcpyAsync (d_staging, from, to_copy, cudaMemcpyHostToDevice, stream); + CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu cudaMemcpyAsync", stream); + } + else + { + error = cudaMemcpy (d_staging, from, to_copy, cudaMemcpyHostToDevice); + CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu cudaMemcpy"); + } + + +#ifdef USE_TEXTURE_MEMORY + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu binding TextureMemory" << endl; + CUDA::TextureMemory * gpu_mem = dynamic_cast< CUDA::TextureMemory*>( staging.get_memory() ); + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu textureMemory stream=" << stream << " gpu_mem->get_tex()= " << 
gpu_mem->get_tex() << endl; +#endif + + if (error != cudaSuccess) + throw Error (FailedCall, "MOPSRUnpacker::unpack_on_gpu", + "cudaMemcpy%s %s", stream?"Async":"", + cudaGetErrorString (error)); + +#ifdef USE_TEXTURE_MEMORY + mopsr_unpack (stream, ndat, d_staging, into, gpu_mem->get_tex()); +#else + if (output->get_order() == TimeSeries::OrderFPT) + { + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt ndat=" << ndat + << " d_staging=" << (void *) d_staging << " into=" << (void *) into << endl; + mopsr_unpack_fpt (stream, ndat, nchan, table->get_scale(), d_staging, into); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt", stream); + else + CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_fpt"); + } + else if (output->get_order() == TimeSeries::OrderTFP) + { + if (verbose) + cerr << "dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp ndat=" << ndat << endl; + mopsr_unpack_tfp (stream, ndat, nchan, table->get_scale(), d_staging, into); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + CHECK_ERROR_STREAM ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp", stream); + else + CHECK_ERROR ("dsp::MOPSRUnpacker::unpack_on_gpu mopsr_unpack_tfp"); + } + else + throw Error (InvalidState, "dsp::MOPSRUnpacker::unpack_on_gpu", "unrecognized order"); +#endif +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mopsr/MOPSRUnpackerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,219 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2013 by Andrew Jameson + * Licensed under the 
Academic Free License version 2.1 + * + ***************************************************************************/ + +#include + +#include "dsp/MOPSRUnpackerCUDA.h" +#include "dsp/Operation.h" +#include "debug.h" +#include "Error.h" + +// threads per block - C1060=256 [TODO CHECK below if changing] +#define __MOPSR_UNPACK_TPB 1024 + +#define __MOPSR_SAMPLES_PER_THREAD 4 + +#define WARP_SIZE 32 + +// global static texture declaration for MOPSR gpu unpacker +//texture mopsr_tex1dfloat2; + +// real textutre version +//texture mopsr_tex1dfloat; + +using namespace std; + +//void check_error (const char*); +void check_error_stream (const char*, cudaStream_t); + +__device__ __constant__ float mopsr_unpacker_scale; + +#ifdef USE_TEXTURE_MEMORY +__global__ void mopsr_unpack_complex_1 (float2 * output, cudaTextureObject_t tex) +{ + const int idx = blockIdx.x*blockDim.x + threadIdx.x; + output[idx] = tex1Dfetch(tex, idx); +} +#else +__global__ void mopsr_unpack_fpt_complex_1 (const int8_t * stagingBufGPU, + float * output, + const unsigned nval, + const unsigned nchan, + const unsigned nsamp_per_block, + const unsigned chan_stride) +{ + extern __shared__ int8_t sdata[]; + const unsigned ndim = 2; + + // input index + const unsigned idx = (blockIdx.x * blockDim.x + threadIdx.x); + const unsigned in_idx = idx * ndim; + + // shared memory index + const unsigned sin_idx = threadIdx.x * ndim; + + + if (idx >= nval) + { + sdata[sin_idx] = 0; + sdata[sin_idx+1] = 0; + } + else + { + sdata[sin_idx] = stagingBufGPU[in_idx]; + sdata[sin_idx+1] = stagingBufGPU[in_idx+1]; + } + + // synchronize all threads in the block + __syncthreads(); + + // now we have 1000 consective (complex) TF samples in the sdata array stored as int8_t + + // determine the output index for this thread + const unsigned ichan = threadIdx.x / nsamp_per_block; + const unsigned isamp = threadIdx.x % nsamp_per_block; + + // determine which shared memory index for this output ichan and isamp + const unsigned sout_idx 
= ((isamp * nchan) + ichan) * ndim; + + // convert to float + float re = (float) sdata[sout_idx]; + float im = (float) sdata[sout_idx+1]; + + // + 0.5 since scale is -128 to 127 + re += 0.5; + im += 0.5; + + // optimal scaling from bit table + re *= mopsr_unpacker_scale; + im *= mopsr_unpacker_scale; + + // finally determine the output index for this thread + const unsigned ou_idx = (ichan * chan_stride) + (blockIdx.x * nsamp_per_block * ndim) + (isamp * ndim); + + if (blockIdx.x * nsamp_per_block * nchan < nval) + { +#if _KDEBUG + if (blockIdx.x == 0) + printf ("threadIdx.x=%d sin_idx=%d, ichan=%d, isamp=%d, sout_idx=%d, ou_idx=%d\n", threadIdx.x, sin_idx, ichan, isamp, sout_idx, ou_idx); +#endif + + output[ou_idx] = re; + output[ou_idx+1] = im; + } + else + { + printf ("blockIdx.x=%d, threadIdx.x=%d val=%d >= nval=%d\n", blockIdx.x, threadIdx.x, blockIdx.x * nsamp_per_block * nchan, nval); + } + +#if _KDEBUG + if (blockIdx.x ==0 && threadIdx.x == 0) + printf ("=========================\n"); +#endif +} +#endif + +__global__ void mopsr_unpack_tfp_complex_1 (const int8_t * stagingBufGPU, + float2* output, + const unsigned nchan) +{ + const unsigned isamp = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned ichan = blockIdx.y; + const unsigned ndim = 2; + + // input and output will be in TFP order + const int8_t* from = reinterpret_cast( stagingBufGPU ) + (isamp * nchan * ndim) + (ichan * ndim); + + __shared__ float2 out; + + out.x = (float) from[0]; + out.y = (float) from[1]; + + // + 0.5 since scale is -128 to 127 + out.x += 0.5; + out.y += 0.5; + + // optimal scaling from bit table + out.x *= mopsr_unpacker_scale; + out.y *= mopsr_unpacker_scale; + + output[(isamp * nchan) + ichan] = out; +} + +void mopsr_unpack_prepare (cudaStream_t stream, const float scale) +{ + cudaError_t error = cudaMemcpyToSymbolAsync ( mopsr_unpacker_scale, &scale, sizeof(scale), 0, cudaMemcpyHostToDevice, stream); + // TODO check return value +} + +void mopsr_unpack_tfp (cudaStream_t 
stream, const uint64_t ndat, const unsigned nchan, + float scale, int8_t const * input, float * output) +{ + int nthread = __MOPSR_UNPACK_TPB; + int nblocks = ndat / nthread; + + // each thread will unpack 1 complex time sample from 1 channel + dim3 blocks (nblocks, nchan); + + float2 * complex_output = (float2 *) output; + + mopsr_unpack_tfp_complex_1<<>>(input, complex_output, nchan); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("mopsr_unpack_tfp", stream); +} + +#ifdef USE_TEXTURE_MEMORY +void mopsr_unpack (cudaStream_t stream, const uint64_t ndat, + unsigned char const* input, float * output, + cudaTextureObject_t * tex) +#else +void mopsr_unpack_fpt (cudaStream_t stream, const uint64_t ndat, const unsigned nchan, + float scale, int8_t const * input, float * output) +#endif +{ + const unsigned npol = 1; + const unsigned ndim = 2; + const unsigned nval = ndat * nchan; + + // we want the number of threads to be module nchan + int nthread = (__MOPSR_UNPACK_TPB / nchan) * nchan; + int nblocks = nval / nthread; + if (nval % nthread) + nblocks++; + + // each thread will unpack 1 complex time sample from 1 channel + size_t sdata_bytes = nthread * ndim; + const unsigned nsamp_per_block = nthread/nchan; + const unsigned chan_stride = ndat * npol * ndim; + + if (dsp::Operation::verbose) + cerr << "mopsr_unpack_fpt nval=" << nval << " ndat=" << ndat << " nchan=" << nchan + << " input=" << (void*) input << " output=" << (void *) output + << " nblocks=" << nblocks << " nthread=" << nthread + << " sdata_bytes=" << sdata_bytes << " nsamp_per_block=" << nsamp_per_block + << " chan_stride=" << chan_stride << endl; + +#ifdef USE_TEXTURE_MEMORY + //mopsr_unpack_complex_1<<>>(complex_output, *tex); +#else + mopsr_unpack_fpt_complex_1<<>>(input, output, nval, nchan, nsamp_per_block, chan_stride); +#endif + + // AJ's theory... 
+ // If there are no stream synchronises on the input then the CPU pinned memory load from the + // input class might be able to get ahead of a whole sequence of GPU operations, and even exceed + // one I/O loop. Therefore this should be a reuqirement to have a stream synchronize some time + // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory + + // put it here for now + cudaStreamSynchronize(stream); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("mopsr_unpack_fpt", stream); +} diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/dsp/EDAFourBit.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/dsp/EDAFourBit.h --- bl-dspsr-0+git20160405/Kernel/Formats/mwa/dsp/EDAFourBit.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/dsp/EDAFourBit.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,36 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2011 by Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __EDAFourBit_h +#define __EDAFourBit_h + +#include "dsp/FourBitUnpacker.h" + +namespace dsp +{ + //! Converts single-dish EDA data from 4-bit to floating point values + class EDAFourBit: public FourBitUnpacker + { + public: + + //! Constructor initializes bit table + EDAFourBit (); + + //! Return true if this unpacker can handle the observation + bool matches (const Observation*); + + //! Over-ride the default BitUnpacker::unpack method + void unpack (); + + //! 
Over-ride the default FourBitUnpacker::get_histogram method + void get_histogram (std::vector&, unsigned idig) const; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/EDAFourBit.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/EDAFourBit.C --- bl-dspsr-0+git20160405/Kernel/Formats/mwa/EDAFourBit.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/EDAFourBit.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,94 @@ +/*************************************************************************** + * + * Copyright (C) 2017 by Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/EDAFourBit.h" +#include "dsp/BitTable.h" + +#include +#include +using namespace std; + +dsp::EDAFourBit::EDAFourBit () + : FourBitUnpacker ("EDAFourBit") +{ + BitTable* table = new BitTable (4, BitTable::OffsetBinary); + table->set_order( BitTable::LeastToMost ); + set_table( table ); +} + +bool dsp::EDAFourBit::matches (const Observation* observation) +{ + if (verbose) + cerr << "dsp::EDAUnpacker::matches" + " machine=" << observation->get_machine() << + " nbit=" << observation->get_nbit() << endl; + + return observation->get_machine() == "EDA" + && observation->get_nbit() == 4 + && observation->get_npol() == 2 + && observation->get_ndim() == 1; +} + +void dsp::EDAFourBit::unpack () +{ + cerr << "dsp::EDAFourBit::unpack" << endl; + + const uint64_t ndat = input->get_ndat(); + + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + const unsigned ndim = input->get_ndim(); + + assert (nchan == 1); + assert (npol == 2); + assert (ndim == 1); + + const unsigned char* from = input->get_rawptr(); + + float* pol0 = output->get_datptr (0,0); + float* pol1 = output->get_datptr (0,1); + + unsigned long* hist = BitUnpacker::get_histogram (0); + + const float* lookup = table->get_values 
(); + + for (uint64_t idat = 0; idat < ndat; idat++) + { + pol0[idat] = lookup[ from[idat] * 2 ]; + pol1[idat] = lookup[ from[idat] * 2 + 1 ]; + + hist[ from[idat] ] ++; + } +} + +void dsp::EDAFourBit::get_histogram (std::vector& hist, + unsigned idig) const +{ + assert( get_nstate() == 16 ); + assert( get_nstate_internal() == 256 ); + assert( get_ndig() == 2 ); + assert( idig < 2 ); + + hist.resize( get_nstate() ); + for (unsigned i=0; i> 4) & mask; + + hist[s0] += hist_internal[i]; + } +} diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/mwa/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/mwa/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/mwa/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -1,9 +1,9 @@ noinst_LTLIBRARIES = libmwa.la -nobase_include_HEADERS = dsp/MWAFile.h +nobase_include_HEADERS = dsp/MWAFile.h dsp/EDAFourBit.h -libmwa_la_SOURCES = MWAFile.C +libmwa_la_SOURCES = MWAFile.C EDAFourBit.C ############################################################################# # diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma/dsp/PuMaFile.h,v $ - $Revision: 1.7 $ - $Date: 2008/05/28 21:12:43 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/puma/dsp/PuMaFile.h #ifndef __dsp_PuMaFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h --- 
bl-dspsr-0+git20160405/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:08 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/puma/dsp/PuMaTwoBitCorrection.h #ifndef __PuMaTwoBitCorrection_h #define __PuMaTwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2File.h --- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2File.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2File.h,v $ - $Revision: 1.4 $ - $Date: 2008/05/28 21:12:43 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/puma2/dsp/PuMa2File.h #ifndef __PuMa2File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2_Observation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2_Observation.h --- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2_Observation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2_Observation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2_Observation.h,v $ - $Revision: 1.3 $ - $Date: 2007/11/14 03:11:02 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/puma2/dsp/PuMa2_Observation.h #ifndef __PuMa2_Observation_h #define 
__PuMa2_Observation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h,v $ - $Revision: 1.4 $ - $Date: 2008/03/12 14:07:48 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/puma2/dsp/PuMa2Unpacker.h #ifndef __PuMa2Unpacker_h #define __PuMa2Unpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2File.h --- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2File.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2File.h,v $ - $Revision: 1.16 $ - $Date: 2008/05/28 21:12:43 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/s2/dsp/S2File.h #ifndef __S2File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h --- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h,v $ - $Revision: 1.11 $ - $Date: 2008/05/29 07:34:58 $ - $Author: straten $ */ +// 
dspsr/Kernel/Formats/s2/dsp/S2TwoBitCorrection.h #ifndef __S2TwoBitCorrection_h #define __S2TwoBitCorrection_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitTable.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitTable.h --- bl-dspsr-0+git20160405/Kernel/Formats/s2/dsp/S2TwoBitTable.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/dsp/S2TwoBitTable.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/s2/dsp/S2TwoBitTable.h,v $ - $Revision: 1.4 $ - $Date: 2006/07/09 13:27:09 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/s2/dsp/S2TwoBitTable.h #ifndef __S2TwoBitTable_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.c --- bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.c 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.c 2018-03-12 23:02:35.000000000 +0000 @@ -356,10 +356,9 @@ { int i; + memset(header, ' ', sizeof(*header)); header->hdr_size = 0; header->hdr_drate = 0; - sprintf (header->hdr_time, "%-*.*s", (TCI_HEADER_BASE_SIZE-6), - (TCI_HEADER_BASE_SIZE-6), " "); header->hdr_time[0] = '\0'; header->hdr_s2mode[0] = '\0'; header->hdr_tapeid[0] = '\0'; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.h --- bl-dspsr-0+git20160405/Kernel/Formats/s2/tci_file.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/s2/tci_file.h 2018-03-12 23:02:35.000000000 +0000 @@ -7,7 +7,7 @@ /* $Id: tci_file.h,v 1.3 2009/05/04 23:17:13 straten Exp $ -$Log: tci_file.h,v $ +$Log: tci_file.h Revision 1.3 2009/05/04 23:17:13 straten verbosity mods diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h,v $ - $Revision: 1.3 $ - $Date: 2011/07/19 14:59:41 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/sigproc/dsp/SigProcDigitizer.h #ifndef __SigProcDigitizer_h #define __SigProcDigitizer_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcFile.h,v $ - $Revision: 1.1 $ - $Date: 2008/10/31 05:59:55 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/sigproc/dsp/SigProcFile.h #ifndef __SigProcFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcObservation.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcObservation.h --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcObservation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcObservation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcObservation.h,v $ - $Revision: 1.2 $ - $Date: 2008/10/31 06:00:50 $ - 
$Author: straten $ */ +// dspsr/Kernel/Formats/sigproc/dsp/SigProcObservation.h #ifndef __SigProcObservation_h #define __SigProcObservation_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h,v $ - $Revision: 1.2 $ - $Date: 2011/09/19 01:56:42 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/sigproc/dsp/SigProcOutputFile.h #ifndef __SigProcOutputFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h,v $ - $Revision: 1.1 $ - $Date: 2010/05/04 15:30:40 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/sigproc/dsp/SigProcUnpacker.h #ifndef __SigProcUnpacker_h #define __SigProcUnpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/send_stuff.c bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/send_stuff.c --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/send_stuff.c 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/send_stuff.c 2018-03-12 23:02:35.000000000 +0000 @@ -2,8 +2,8 @@ #include #include 
"sigproc.h" -FILE *input, *output; -int swapout; +extern FILE *input, *output; +extern int swapout; void send_string(char *string) /* includefile */ { int len; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/sigproc/SigProcObservation.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/SigProcObservation.C --- bl-dspsr-0+git20160405/Kernel/Formats/sigproc/SigProcObservation.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/sigproc/SigProcObservation.C 2018-03-12 23:02:35.000000000 +0000 @@ -90,6 +90,8 @@ return "Effelsberg"; case 11: return "LOFAR"; + case 12: + return "VLA"; default: return "unknown"; break; @@ -124,6 +126,7 @@ else if (itoa == "GM") return 7; else if (itoa == "EF") return 8; else if (itoa == "LF") return 11; + else if (itoa == "VL") return 12; else return 0; } catch (Error &error) diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h --- bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1UnpackerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,57 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2014 by Andrew JAmeson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_SKA1Unpacker_h +#define __baseband_cuda_SKA1Unpacker_h + +#include + +//#include "dsp/SKA1Unpacker.h" +#ifdef SKA1_ENGINE_IMPLEMENTATION +namespace CUDA +{ + class SKA1UnpackerEngine : public dsp::SKA1Unpacker::Engine + { + public: + + //! 
Default Constructor + SKA1UnpackerEngine (cudaStream_t stream); + + void setup (); + + bool get_device_supported (dsp::Memory* memory) const; + + void set_device (dsp::Memory* memory); + + void unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output); + + protected: + + cudaStream_t stream; + + struct cudaDeviceProp gpu; + + dsp::BitSeries staging; + + }; +} +#else + +#include + +void ska1_unpack_tfp (cudaStream_t stream, uint64_t nval, float scale, + float * into, void * staged, + unsigned nchan, unsigned npol, unsigned ndim, + size_t pol_span); + +void ska1_unpack_fpt (cudaStream_t stream, uint64_t ndat, float scale, + float * into, void * staged, unsigned nchan, + size_t pol_span); +#endif + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1Unpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/ska1/dsp/SKA1Unpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/dsp/SKA1Unpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,88 @@ +/* + + */ + +#ifndef __dsp_SKA1Unpacker_h +#define __dsp_SKA1Unpacker_h + +//#define SKA1_ENGINE_IMPLEMENTATION + +#include "dsp/EightBitUnpacker.h" +#include "ThreadContext.h" + +namespace dsp { + + class SKA1Unpacker : public HistUnpacker + { + public: + + //! Constructor + SKA1Unpacker (const char* name = "SKA1Unpacker"); + + //! Destructor + ~SKA1Unpacker (); + + //! Cloner (calls new) + virtual SKA1Unpacker * clone () const; + + //! Return true if the unpacker can operate on the specified device + bool get_device_supported (Memory*) const; + + //! Set the device on which the unpacker will operate + void set_device (Memory*); + +#ifdef SKA1_ENGINE_IMPLEMENTATION + //! Engine used to perform discrete convolution step + class Engine; + void set_engine (Engine*); +#else + void unpack_on_gpu (); +#endif + + protected: + +#ifdef SKA1_ENGINE_IMPLEMENTATION + //! 
Interface to alternate processing engine (e.g. GPU) + Reference::To engine; +#else + void * gpu_stream; +#endif + + Reference::To table; + + //! Return true if we can convert the Observation + bool matches (const Observation* observation); + + void unpack (); + + //BitSeries staging; + //unsigned get_resolution () const ; + + private: + + unsigned ndim; + + unsigned npol; + + bool device_prepared; + + }; + +#ifdef SKA1_ENGINE_IMPLEMENTATION + + class SKA1Unpacker::Engine : public Reference::Able + { + public: + virtual void unpack(float scale, const BitSeries * input, TimeSeries * output) = 0; + + virtual bool get_device_supported (Memory* memory) const = 0; + + virtual void set_device (Memory* memory) = 0; + + }; + +#endif + +} + +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/ska1/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,21 @@ + +noinst_LTLIBRARIES = libska1.la + +nobase_include_HEADERS = dsp/SKA1Unpacker.h + +libska1_la_SOURCES = SKA1Unpacker.C + +if HAVE_CUDA + +nobase_include_HEADERS += dsp/SKA1UnpackerCUDA.h +libska1_la_SOURCES += SKA1UnpackerCUDA.cu + +endif + +############################################################################# +# + +include $(top_srcdir)/config/Makefile.include +include $(top_srcdir)/config/Makefile.cuda + +AM_CPPFLAGS += @CUDA_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1Unpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1Unpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1Unpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1Unpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,195 @@ +//-*-C++-*- +/*************************************************************************** + 
* + * Copyright (C) 2014 + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "dsp/SKA1Unpacker.h" +#include "dsp/BitTable.h" + +#include "Error.h" + +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#include "dsp/SKA1UnpackerCUDA.h" +#include +#endif + +#include + +using namespace std; + +static void* const undefined_stream = (void *) -1; + +dsp::SKA1Unpacker::SKA1Unpacker (const char* _name) : HistUnpacker (_name) +{ + if (verbose) + cerr << "dsp::SKA1Unpacker ctor" << endl; + + set_nstate (256); + table = new BitTable (8, BitTable::TwosComplement); + + npol = 2; + ndim = 2; +} + +dsp::SKA1Unpacker::~SKA1Unpacker () +{ +} + +dsp::SKA1Unpacker * dsp::SKA1Unpacker::clone () const +{ + return new SKA1Unpacker (*this); +} + +#ifdef SKA1_ENGINE_IMPLEMENTATION +void dsp::SKA1Unpacker::set_engine (Engine* _engine) +{ + engine = _engine; +} +#endif + +//! Return true if the unpacker can operate on the specified device +bool dsp::SKA1Unpacker::get_device_supported (Memory* memory) const +{ + if (verbose) + cerr << "dsp::SKA1Unpacker::get_device_supported" << endl; +#ifdef SKA1_ENGINE_IMPLEMENTATION + if (engine) + return engine->get_device_supported (memory); + else + return false; +#else +#if HAVE_CUDA + if (verbose) + cerr << "dsp::SKA1Unpacker::get_device_supported HAVE_CUDA" << endl; + return dynamic_cast< CUDA::DeviceMemory*> ( memory ); +#else + return false; +#endif + +#endif +} + +//! 
Set the device on which the unpacker will operate +void dsp::SKA1Unpacker::set_device (Memory* memory) +{ + if (verbose) + cerr << "dsp::SKA1Unpacker::set_device" << endl; +#ifdef SKA1_ENGINE_IMPLEMENTATION + if (engine) + engine->set_device(memory); + else + Unpacker::set_device (memory); +#else +#if HAVE_CUDA + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + if (gpu_mem) + { + //cerr << "dsp::SKA1Unpacker::set_device activating GPU" << endl; + gpu_stream = (void *) gpu_mem->get_stream(); + //staging.set_memory( gpu_mem ); + } + else + gpu_stream = undefined_stream; +#else + Unpacker::set_device (memory); +#endif +#endif + device_prepared = true; +} + +bool dsp::SKA1Unpacker::matches (const Observation* observation) +{ + return observation->get_machine()== "SKA1" + && observation->get_ndim() == 2 + && observation->get_npol() == 2 + && observation->get_nbit() == 8; +} + +void dsp::SKA1Unpacker::unpack () +{ + if (verbose) + cerr << "dsp::SKA1Unpacker::unpack()" << endl; + +#ifdef SKA1_ENGINE_IMPLEMENTATION + if (engine) + { + if (verbose) + cerr << "dsp::SKA1Unpacker::unpack using Engine" << endl; + engine->unpack(table->get_scale(), input, output); + return; + } +#else +#if HAVE_CUDA + if (gpu_stream != undefined_stream) + { + unpack_on_gpu (); + return; + } +#endif +#endif + + // some programs (digifil) do not call set_device + if ( ! 
device_prepared ) + set_device ( Memory::get_manager ()); + + // Data format is TFP + + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + + unsigned in_offset = 0; + const unsigned into_stride = ndim; + const unsigned from_stride = nchan * ndim * npol; + const float * lookup = table->get_values (); + + for (unsigned ichan=0; ichanget_datptr (ichan, ipol); + const unsigned char * from = input->get_rawptr() + in_offset; + + for (uint64_t idat=0; idatget_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); + + if (ndat == 0) + return; + + void * from = (void *) input->get_rawptr(); + cudaStream_t stream = (cudaStream_t) gpu_stream; + + uint64_t nval = ndat * nchan * npol; + + float * into = (float *) output->get_datptr(0,0); + size_t pol_span = output->get_datptr(0, 1) - output->get_datptr(0,0); + + ska1_unpack_fpt (stream, ndat, table->get_scale(), into, from, nchan, pol_span); +} +#endif +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu --- bl-dspsr-0+git20160405/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/ska1/SKA1UnpackerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,279 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2010 by Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include +#include + +#include "dsp/SKA1UnpackerCUDA.h" +#include "dsp/Operation.h" +#include "dsp/MemoryCUDA.h" + +#include "Error.h" +#define WARP_SIZE 32 +#define BLOCK_SIZE 1024 +//#define _GDEBUG + +using namespace std; + +void check_error_stream (const char*, cudaStream_t); + +__global__ void 
k_unpack_fpt (float2 * to, const char2 * from, + uint64_t ndat, uint64_t ostride, + float scale) +{ + const uint64_t idat = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idat >= ndat) + return; + + const unsigned ichanpol = blockIdx.y * gridDim.z + blockIdx.z; + const unsigned blk_stride = gridDim.y * gridDim.z * BLOCK_SIZE; + + // iblk ichanpol isamp + const uint64_t idx = (blockIdx.x*blk_stride) + ichanpol*BLOCK_SIZE + threadIdx.x; + const uint64_t odx = (ichanpol * ostride) + idat; + + char2 packed = from[idx]; + float2 unpacked; + unpacked.x = ((float) packed.x + 0.5) * scale; + unpacked.y = ((float) packed.y + 0.5) * scale; + //unpacked.x = (float) scale; + //unpacked.y = (float) scale; + to[odx] = unpacked; +} + +__global__ void k_unpack_tfp (uint64_t nval, float scale, + float2 * to, const int16_t * from, + const unsigned nchan, const unsigned npol, + size_t pol_span, unsigned nval_per_thread, + unsigned nval_per_block) +{ + extern __shared__ int16_t sdata[]; + + // shared memory for this block + const unsigned ndim = 2; + const unsigned warp_num = threadIdx.x / WARP_SIZE; + const unsigned warp_idx = threadIdx.x % WARP_SIZE; + const unsigned block_offset = blockIdx.x * nval_per_block; + unsigned idx = (warp_num * (WARP_SIZE * nval_per_thread)) + warp_idx; + + // read input data as 2 x int8_t pairs into shm + unsigned ival; + for (ival=0; ival= nchanpol) + { + ichanpol = 0; + ichunk++; + isamp += WARP_SIZE; + out_idx = out_block_offset + isamp; + } + else + out_idx += pol_span; + } + } + +} + +#ifdef SKA1_ENGINE_IMPLEMENTATION + +CUDA::SKA1UnpackerEngine::SKA1UnpackerEngine (cudaStream_t _stream) +{ + stream = _stream; +} + +void CUDA::SKA1UnpackerEngine::setup () +{ + // determine cuda device properties for block & grid size + int device; + cudaGetDevice(&device); + cudaGetDeviceProperties (&gpu, device); +} + +bool CUDA::SKA1UnpackerEngine::get_device_supported (dsp::Memory* memory) const +{ + return dynamic_cast< CUDA::DeviceMemory*> ( memory ); +} + 
+void CUDA::SKA1UnpackerEngine::set_device (dsp::Memory* memory) +{ + CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory ); + staging.set_memory( gpu_mem); +} + + +void CUDA::SKA1UnpackerEngine::unpack (float scale, const dsp::BitSeries * input, dsp::TimeSeries * output) +{ + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const unsigned npol = input->get_npol(); + + // gpu staging buffer for input Bitseries Block + staging.Observation::operator=( *input ); + staging.resize(ndat); + + // copy from CPU Bitseries to GPU staging Bitseries + void * from = (void *) input->get_rawptr(); + void * staged = (void *) staging.get_rawptr(); + uint64_t nval = ndat * nchan * npol; + uint64_t nbytes = nval * ndim; + + if (dsp::Operation::verbose) + cerr << "CUDA::SKA1UnpackerEngine::unpack from=" << from + << " to=" << staged << " nbytes=" << nbytes << endl; + + // ensure no GPU related operations are pending on this stream + cudaStreamSynchronize (stream); + + cudaError_t error = cudaMemcpyAsync (staged, from, nbytes, cudaMemcpyHostToDevice, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::SKA1Unpacker::unpack", + "cudaMemcpyAsync %s", cudaGetErrorString (error)); + + float * into = (float *) output->internal_get_buffer(); + size_t pol_span = output->get_datptr(0, 1) - output->get_datptr(0,0); + + unsigned chunk_size = gpu.warpSize; + unsigned nchunk_per_block = gpu.sharedMemPerBlock / (chunk_size * nchan * npol * ndim); + unsigned nval_per_block = nchunk_per_block * chunk_size * nchan * npol; + + unsigned nthreads = gpu.maxThreadsPerBlock; + unsigned nblocks = nval / nval_per_block; + if (nval % nval_per_block > 0) + nblocks++; + + unsigned nval_per_thread = nval_per_block / nthreads; + if (nval_per_block % nthreads) + nval_per_thread++; + + size_t sbytes = nval_per_block * ndim; + + // unpack dem bits + k_unpack<<>> (nval, scale, (float2 *) into, 
(int16_t *) staged, nchan, npol, pol_span, nval_per_thread, nval_per_block); + + cudaStreamSynchronize(stream); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("CUDA::SKA1UnpackerEngine::unpack", stream); + +} +#else +void ska1_unpack_tfp (cudaStream_t stream, uint64_t nval, float scale, + float * into, void * staged, + unsigned nchan, unsigned npol, unsigned ndim, + size_t pol_span) +{ + const unsigned warpSize = 32; + const unsigned sharedMemPerBlock = 49152; + const unsigned maxThreadsPerBlock = 1024; + + unsigned chunk_size = warpSize; + unsigned nchunk_per_block = sharedMemPerBlock / (chunk_size * nchan * npol * ndim); + unsigned nval_per_block = nchunk_per_block * chunk_size * nchan * npol; + + unsigned nthreads = maxThreadsPerBlock; + unsigned nblocks = nval / nval_per_block; + if (nval % nval_per_block > 0) + nblocks++; + + unsigned nval_per_thread = nval_per_block / nthreads; + if (nval_per_block % nthreads) + nval_per_thread++; + + size_t sbytes = nval_per_block * ndim; + +//#ifdef _GDEBUG + cerr << "nval=" << nval << " scale=" << scale << " nchan=" << nchan << " npol=" << npol << " pol_span=" << pol_span << endl; + cerr << "into=" << (void *) into << " staged = " << staged << endl; + cerr << "nblocks=" << nblocks << " nthreads=" << nthreads << " sbytes=" << sbytes << endl; + cerr << "nval_per_thread=" << nval_per_thread << " nval_per_block=" << nval_per_block << endl; +//#endif + + // unpack dem bits + k_unpack_tfp<<>> (nval, scale, (float2 *) into, (int16_t *) staged, nchan, npol, pol_span, nval_per_thread, nval_per_block); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("CUDA::SKA1UnpackerEngine::unpack", stream); + + return; +} + +void ska1_unpack_fpt (cudaStream_t stream, uint64_t ndat, float scale, + float * into, void * from, unsigned nchan, + size_t pol_span) +{ + const unsigned nthreads = 1024; + const unsigned npol = 2; + const unsigned ndim = 2; + + dim3 blocks (ndat / 
nthreads, nchan, npol); + if (ndat % nthreads) + blocks.x++; + + // output pol stride in uints of float2 + const uint64_t pol_stride = (uint64_t) pol_span / ndim; + +#ifdef _GDEBUG + cerr << "ndat=" << ndat << " nchan=" << nchan << " pol_span=" << pol_span << endl; + cerr << "pol_stride=" << pol_stride << endl; + cerr << "into=" << (void *) into << " from=" << from << endl; + cerr << "nblocks=" << blocks.x << " nthreads=" << nthreads << endl; +#endif + + //uint64_t myscale = reinterpret_cast(stream); + //scale = (float) myscale; + //cerr << "stream=" << (void *) stream << " scale=" << scale << endl; + + //const unsigned sdata_bytes = nthreads * ndim; + k_unpack_fpt<<>> ((float2 *) into, (char2 *) from, ndat, pol_stride, scale); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream ("CUDA::SKA1UnpackerEngine::k_unpack_fpt", stream); + + return; +} +#endif diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_File.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_File.h --- bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_File.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_File.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spda1k/dsp/spda1k_File.h,v $ - $Revision: 1.1 $ - $Date: 2009/12/01 07:55:12 $ - $Author: ahotan $ */ +// dspsr/Kernel/Formats/spda1k/dsp/spda1k_File.h #ifndef __SPDA1K_File_h #define __SPDA1K_File_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ 
* ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h,v $ - $Revision: 1.1 $ - $Date: 2009/12/01 07:55:12 $ - $Author: ahotan $ */ +// dspsr/Kernel/Formats/spda1k/dsp/spda1k_Unpacker.h #ifndef __SPDA1K_Unpacker_h #define __SPDA1K_Unpacker_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/ACFUnpack.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/ACFUnpack.h --- bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/ACFUnpack.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/ACFUnpack.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spigot/dsp/ACFUnpack.h,v $ - $Revision: 1.3 $ - $Date: 2006/07/09 13:27:09 $ - $Author: wvanstra $ */ +// dspsr/Kernel/Formats/spigot/dsp/ACFUnpack.h #ifndef __ACFUnpack_h #define __ACFUnpack_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/SpigotFile.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/SpigotFile.h --- bl-dspsr-0+git20160405/Kernel/Formats/spigot/dsp/SpigotFile.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/spigot/dsp/SpigotFile.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/spigot/dsp/SpigotFile.h,v $ - $Revision: 1.4 $ - $Date: 2008/05/28 21:12:43 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/spigot/dsp/SpigotFile.h #ifndef __dsp_SpigotFile_h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/Unpacker_registry.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Unpacker_registry.C --- bl-dspsr-0+git20160405/Kernel/Formats/Unpacker_registry.C 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/Unpacker_registry.C 2018-03-12 23:02:35.000000000 +0000 @@ -60,6 +60,11 @@ static dsp::Unpacker::Register::Enter caspsr; #endif +#if HAVE_ska1 +#include "dsp/SKA1Unpacker.h" +static dsp::Unpacker::Register::Enter ska1; +#endif + #if HAVE_cpsr #include "dsp/CPSRTwoBitCorrection.h" static dsp::Unpacker::Register::Enter cpsr; @@ -104,7 +109,9 @@ #if HAVE_kat #include "dsp/KAT7Unpacker.h" +#include "dsp/MeerKATUnpacker.h" static dsp::Unpacker::Register::Enter kat7; +static dsp::Unpacker::Register::Enter meerkat; #endif #if HAVE_lofar_dal @@ -149,6 +156,11 @@ static dsp::Unpacker::Register::Enter mark5; #endif +#if HAVE_mark5b +#include "dsp/Mark5bUnpacker.h" +static dsp::Unpacker::Register::Enter mark5b; +#endif + #if HAVE_maxim #include "dsp/MaximUnpacker.h" static dsp::Unpacker::Register::Enter maxim; @@ -159,8 +171,14 @@ static dsp::Unpacker::Register::Enter miniunpack; #endif +#if HAVE_mopsr +#include "dsp/MOPSRUnpacker.h" +static dsp::Unpacker::Register::Enter mopsr; +#endif + #if HAVE_mwa -// There is no MWA unpacker checked into the repository +#include "dsp/EDAFourBit.h" +static dsp::Unpacker::Register::Enter eda4bit; #endif #if HAVE_pmdaq @@ -200,11 +218,19 @@ static dsp::Unpacker::Register::Enter fits; #endif + +#if HAVE_emerlin +#include "dsp/EmerlinUnpacker.h" +static dsp::Unpacker::Register::Enter emerlin; +#endif + #if HAVE_vdif #include "dsp/VDIFTwoBitCorrection.h" static dsp::Unpacker::Register::Enter vdif; #include "dsp/VDIFTwoBitCorrectionMulti.h" static dsp::Unpacker::Register::Enter vdif_multi; +#include "dsp/VDIFFourBitUnpacker.h" +static dsp::Unpacker::Register::Enter vdif4; #include "dsp/VDIFEightBitUnpacker.h" static dsp::Unpacker::Register::Enter vdif8; #endif @@ -253,9 +279,17 @@ static dsp::Unpacker::Register::Enter gen8bit; /* + Generic four-bit unpacker is used if no other four-bit unpacker steps up +*/ + +#include "dsp/GenericFourBitUnpacker.h" +static dsp::Unpacker::Register::Enter gen4bit; 
+ +/* get_registry is defined here to ensure that this file is linked */ dsp::Unpacker::Register& dsp::Unpacker::get_register() { return Register::get_registry(); } + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/dsp/VDIFFourBitUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,33 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Paul Demorest + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __VDIFFourBitUnpacker_h +#define __VDIFFourBitUnpacker_h + +#include "dsp/FourBitUnpacker.h" + +namespace dsp { + + //! Unpack 4-bit, single-pol VDIF data + class VDIFFourBitUnpacker : public FourBitUnpacker { + + public: + + //! Constructor + VDIFFourBitUnpacker (const char* name = "VDIFFourBitUnpacker"); + + protected: + + //! 
Return true if we can convert the Observation + virtual bool matches (const Observation* observation); + + }; + +} + +#endif // !defined(__VDIFEightBitUnpacker_h) diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/Makefile.am --- bl-dspsr-0+git20160405/Kernel/Formats/vdif/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -5,12 +5,14 @@ dsp/VDIFTwoBitCorrection.h \ dsp/VDIFTwoBitCorrectionMulti.h \ dsp/VDIFTwoBitTable.h \ + dsp/VDIFFourBitUnpacker.h \ dsp/VDIFEightBitUnpacker.h libvdif_la_SOURCES = VDIFFile.C \ VDIFTwoBitCorrection.C \ VDIFTwoBitCorrectionMulti.C \ VDIFTwoBitTable.C \ + VDIFFourBitUnpacker.C \ VDIFEightBitUnpacker.C \ vdifio.c vdifio.h diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFile.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFile.C --- bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFile.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFile.C 2018-03-12 23:02:35.000000000 +0000 @@ -214,7 +214,7 @@ "Read vdif_nchan=%d, this is currently not supported", vdif_nchan); get_info()->set_npol( vdif_nchan ); get_info()->set_nchan( 1 ); - get_info()->set_rate( (double) get_info()->get_bandwidth() * 1e6 + get_info()->set_rate( fabs((double) get_info()->get_bandwidth()) * 1e6 / (double) get_info()->get_nchan() * (get_info()->get_state() == Signal::Nyquist ? 
2.0 : 1.0)); if (verbose) cerr << "VDIFFile::open_file rate = " << get_info()->get_rate() << endl; diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFourBitUnpacker.C bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFourBitUnpacker.C --- bl-dspsr-0+git20160405/Kernel/Formats/vdif/VDIFFourBitUnpacker.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/vdif/VDIFFourBitUnpacker.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,33 @@ +/*************************************************************************** + * + * Copyright (C) 2008 by Jayanta Roy and Willem van Straten + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/VDIFFourBitUnpacker.h" +#include "dsp/BitTable.h" + +using namespace std; + +//! Constructor +dsp::VDIFFourBitUnpacker::VDIFFourBitUnpacker (const char* name) + : FourBitUnpacker ("VDIFFourBit") +{ + BitTable* table = new BitTable (4, BitTable::OffsetBinary); + table->set_order( BitTable::LeastToMost ); + set_table( table ); +} + +bool dsp::VDIFFourBitUnpacker::matches (const Observation* observation) +{ + if (verbose) + cerr << "dsp::VDIFFourBitUnpacker::matches machine=" + << observation->get_machine() + << " nbit=" << observation->get_nbit() << endl; + + return observation->get_machine() == "VDIF" + && observation->get_nbit() == 4 + && observation->get_npol() == 1; +} + diff -Nru bl-dspsr-0+git20160405/Kernel/Formats/wapp/dsp/WAPPUnpacker.h bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/wapp/dsp/WAPPUnpacker.h --- bl-dspsr-0+git20160405/Kernel/Formats/wapp/dsp/WAPPUnpacker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Kernel/Formats/wapp/dsp/WAPPUnpacker.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Kernel/Formats/wapp/dsp/WAPPUnpacker.h,v $ 
- $Revision: 1.2 $ - $Date: 2006/11/20 16:06:20 $ - $Author: straten $ */ +// dspsr/Kernel/Formats/wapp/dsp/WAPPUnpacker.h #ifndef __WAPPUnpacker_h #define __WAPPUnpacker_h diff -Nru bl-dspsr-0+git20160405/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Makefile.am --- bl-dspsr-0+git20160405/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -18,3 +18,9 @@ backends.list: cp $(top_srcdir)/config/backends.default backends.list +# make clean deletes the local_include directory (and any out-of-date headers) +clean-local: clean-local-include +.PHONY: clean-local-include +clean-local-include: + -rm -rf local_include + diff -Nru bl-dspsr-0+git20160405/More/Makefile.am bl-dspsr-0.0~git20180312.50ea209/More/Makefile.am --- bl-dspsr-0+git20160405/More/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/More/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -1,15 +1,12 @@ -SUBDIRS = - -lib_LTLIBRARIES = libdspsrmore.la -libdspsrmore_la_SOURCES = -libdspsrmore_la_LIBADD = if HAVE_PGPLOT -SUBDIRS += Plotting -libdspsrmore_la_LIBADD += Plotting/libPlotting.la +SUBDIRS = Plotting Applications + +lib_LTLIBRARIES = libdspsrmore.la +libdspsrmore_la_SOURCES = $(top_srcdir)/Signal/General/ +libdspsrmore_la_LIBADD = Plotting/libPlotting.la -SUBDIRS += Applications endif include $(top_srcdir)/config/Makefile.include diff -Nru bl-dspsr-0+git20160405/More/Plotting/dsp/FluxPlot.h bl-dspsr-0.0~git20180312.50ea209/More/Plotting/dsp/FluxPlot.h --- bl-dspsr-0+git20160405/More/Plotting/dsp/FluxPlot.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/More/Plotting/dsp/FluxPlot.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/psrchive/psrchive/More/Plotting/Pulsar/FluxPlot.h,v $ - $Revision: 1.32 $ - $Date: 2009/02/13 12:06:52 $ - $Author: straten $ */ 
+// psrchive/More/Plotting/Pulsar/FluxPlot.h #ifndef __Pulsar_FluxPlot_h #define __Pulsar_FluxPlot_h diff -Nru bl-dspsr-0+git20160405/python/Makefile.am bl-dspsr-0.0~git20180312.50ea209/python/Makefile.am --- bl-dspsr-0+git20160405/python/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/python/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -26,5 +26,5 @@ ############################################################################# # -INCLUDES = @SWIG_PYTHON_CPPFLAGS@ -I$(top_builddir)/local_include @PSRCHIVE_CFLAGS@ +AM_CPPFLAGS = @SWIG_PYTHON_CPPFLAGS@ -I$(top_builddir)/local_include @PSRCHIVE_CFLAGS@ diff -Nru bl-dspsr-0+git20160405/Signal/General/bench_oversample.csh bl-dspsr-0.0~git20180312.50ea209/Signal/General/bench_oversample.csh --- bl-dspsr-0+git20160405/Signal/General/bench_oversample.csh 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/bench_oversample.csh 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,30 @@ +#!/bin/tcsh + +set npt = 8 +set done = 0 + +while ( ( $npt < 5000000 ) && ( $done == 0 ) ) + + set npt_pow2 = `echo $npt | awk '{print ($1 * 4)}'` + + set result = `./undersampling_speed -cuda -f $npt_pow2 -b $npt_pow2 -c 1 -p 1 -t 64|& tail -n 1` + if ( $? == 0) then + set gflops_pow2 = `echo $result | awk -F= '{print $NF}'` + else + set done = 1 + endif + + set npt_fwd = `echo $npt | awk '{print ($1 * 6)}'` + set npt_bwd = `echo $npt | awk '{print ($1 * 5)}'` + set result = `./undersampling_speed -cuda -f $npt_fwd -b $npt_bwd -c 1 -p 1 -t 64 |& tail -n 1` + if ( $? 
== 0) then + set gflops_over = `echo $result | awk -F= '{print $NF}'` + else + set done = 1 + endif + + echo "npts pow2=$npt_pow2 -> gflops=$gflops_pow2 npt_fwd=$npt_fwd -> gflops=$gflops_over" + + @ npt = $npt * 2 + +end diff -Nru bl-dspsr-0+git20160405/Signal/General/Convolution.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Convolution.C --- bl-dspsr-0+git20160405/Signal/General/Convolution.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Convolution.C 2018-03-12 23:02:35.000000000 +0000 @@ -5,6 +5,10 @@ * ***************************************************************************/ +#if HAVE_CONFIG_H +#include +#endif + #include "dsp/Convolution.h" #include "dsp/WeightedTimeSeries.h" #include "dsp/Apodization.h" @@ -14,6 +18,10 @@ #include "dsp/Dedispersion.h" #include "dsp/Scratch.h" +#if HAVE_CUDA +#include "dsp/MemoryCUDA.h" +#endif + #include "FTransform.h" //#define _DEBUG 1 @@ -33,6 +41,28 @@ { } +//! Set the device memory to use +void dsp::Convolution::set_device (Memory* mem) +{ + memory = mem; + +#if HAVE_CUDA + CUDA::DeviceMemory* device_memory = dynamic_cast< CUDA::DeviceMemory*> ( mem); + + if ( device_memory ) + { + Scratch* gpu_scratch = new Scratch; + gpu_scratch->set_memory (device_memory); + set_scratch (gpu_scratch); + } +#endif +} + +void dsp::Convolution::set_engine (Engine * _engine) +{ + engine = _engine; +} + //! 
Set the frequency response function void dsp::Convolution::set_response (Response* _response) { @@ -76,11 +106,11 @@ { if (!response) throw Error (InvalidState, "dsp::Convolution::prepare", - "no frequency response"); + "no frequency response"); if (input->get_detected()) throw Error (InvalidState, "dsp::Convolution::prepare", - "input data are detected"); + "input data are detected"); response->match (input); @@ -90,7 +120,7 @@ // response must have at least two points in it if (response->get_ndat() < 2) throw Error (InvalidState, "dsp::Convolution::prepare", - "invalid response size"); + "invalid response size"); // if the response has 8 dimensions, then perform matrix convolution matrix_convolution = response->get_ndim() == 8; @@ -102,13 +132,13 @@ // if matrix convolution, then there must be two polns if (matrix_convolution && npol != 2) throw Error (InvalidState, "dsp::Convolution::prepare", - "matrix response and input.npol != 2"); + "matrix response and input.npol != 2"); // response must contain a unique kernel for each channel if (response->get_nchan() != nchan) throw Error (InvalidState, "dsp::Convolution::prepare", - "invalid response nsub=%d != nchan=%d", - response->get_nchan(), nchan); + "invalid response nsub=%d != nchan=%d", + response->get_nchan(), nchan); // number of points after first fft n_fft = response->get_ndat(); @@ -123,7 +153,7 @@ if (verbose) cerr << "Convolution::prepare filt=" << n_fft - << " smear=" << nfilt_tot << endl; + << " smear=" << nfilt_tot << endl; // 2 arrays needed: one for each of the forward and backward FFT results // 2 floats per complex number @@ -150,25 +180,34 @@ } else throw Error (InvalidState, "dsp::Convolution::prepare", - "Cannot transform Signal::State=" - + tostring(input->get_state())); + "Cannot transform Signal::State=" + + tostring(input->get_state())); // the FFT size must be greater than the number of discarded points if (nsamp_fft < nsamp_overlap) throw Error (InvalidState, "dsp::Convolution::prepare", 
- "error nfft=%d < nfilt=%d", nsamp_fft, nsamp_overlap); + "error nfft=%d < nfilt=%d", nsamp_fft, nsamp_overlap); if (has_buffering_policy()) { if (verbose) cerr << "dsp::Convolution::prepare" - " reserve=" << nsamp_fft << endl; + " reserve=" << nsamp_fft << endl; get_buffering_policy()->set_minimum_samples (nsamp_fft); } prepare_output (); + if (engine) + { + if (verbose) + cerr << "dsp::Convolution::make_preparations setup engine" << endl; + engine->prepare (this); + prepared = true; + return; + } + using namespace FTransform; if (state == Signal::Nyquist) @@ -191,13 +230,19 @@ if (verbose) cerr << "dsp::Convolution::prepare_output nsamp fft=" << nsamp_fft - << " overlap=" << nsamp_overlap << " step=" << nsamp_step << endl; + << " overlap=" << nsamp_overlap << " step=" << nsamp_step << endl; // number of FFTs for this data block npart = 0; if (ndat >= nsamp_fft) npart = (ndat-nsamp_overlap)/nsamp_step; + if (engine) + { + //scratch_needed = npart * n_fft * 2; + scratch_needed = n_fft * 2 * 2; + } + /* The input must be buffered before the output is modified because the transformation may be inplace @@ -220,6 +265,14 @@ if ( state == Signal::Nyquist ) output->set_rate( 0.5*get_input()->get_rate() ); + + // set the input sample + uint64_t output_ndat = output->get_ndat(); + int64_t input_sample = input->get_input_sample(); + if (output_ndat == 0) + output->set_input_sample (0); + else if (input_sample >= 0) + output->set_input_sample ((input_sample / nsamp_step) * nsamp_step); } //! 
Reserve the maximum amount of output space required @@ -232,17 +285,17 @@ if (verbose) cerr << "Convolution::reserve ndat=" << ndat << " nfft=" << nsamp_fft - << " npart=" << npart << endl; + << " npart=" << npart << endl; uint64_t output_ndat = npart * nsamp_step; if ( state == Signal::Nyquist ) output_ndat /= 2; - + if( input != output ) output->resize (output_ndat); else output->set_ndat (output_ndat); - + // nfilt_pos complex points are dropped from the start of the first FFT output->change_start_time (nfilt_pos); @@ -304,6 +357,12 @@ if (matrix_convolution) spectrum[1] += n_fft * 2; + if (engine) + { + engine->set_scratch (spectrum[0]); + engine->perform (input, output, npart); + return; + } float* complex_time = spectrum[1] + n_fft * 2; // although only two extra points are required, adding 4 ensures that @@ -315,7 +374,7 @@ if (verbose) cerr << "dsp::Convolution::transformation step nsamp=" << nsamp_step - << " bytes=" << nbytes_step << " ndim=" << ndim << endl; + << " bytes=" << nbytes_step << " ndim=" << ndim << endl; const unsigned cross_pol = matrix_convolution ? 
2 : 1; @@ -331,71 +390,71 @@ for (unsigned ipol=0; ipol < npol; ipol++) for (uint64_t ipart=0; ipart < npart; ipart++) { - offset = ipart * step; - - for (jpol=0; jpol(input->get_datptr (ichan, ipol)) + offset; - - if (apodization) - { - apodization -> operate (ptr, complex_time); - ptr = complex_time; - } - - DEBUG("FORWARD: nfft=" << nsamp_fft << " in=" << ptr \ - << " out=" << spectrum[ipol]); - - if (state == Signal::Nyquist) - forward->frc1d (nsamp_fft, spectrum[ipol], ptr); - - else if (state == Signal::Analytic) - forward->fcc1d (nsamp_fft, spectrum[ipol], ptr); - - } - - if (matrix_convolution) { - - response->operate (spectrum[0], spectrum[1], ichan); - - if (passband) - passband->integrate (spectrum[0], spectrum[1], ichan); - - } - - else { - - response->operate (spectrum[ipol], ipol, ichan); - - if (passband) - passband->integrate (spectrum[ipol], ipol, ichan); - - } - - for (jpol=0; jpol +#endif + +using namespace std; + +#if HAVE_CUFFT_CALLBACKS + +// [0] channel offset ( ichan * npt) +// [1] npt +// [2] first_ipt ( nfilt_pos ) +// [3] last_ipt ( npt - nfilt_neg ) +// [4] nfilt_tot +__device__ __constant__ unsigned conv_params[5]; + +///////////////////////////////////////////////////////////////////////// +// +// store with multiplication by dedispersion kernel [no FFT batching] +// +__device__ void CB_convolve_and_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel complex float for this element of the FFT + const cufftComplex k = ((cufftComplex *) callerInfo)[conv_params[0] + offset]; + ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} + +__device__ void CB_convolve_and_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel value for this element of the FFT + const unsigned kernel_offset = conv_params[0] + (offset % conv_params[1]); + const cufftComplex k = ((cufftComplex *) callerInfo)[kernel_offset]; + 
+ ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} +__device__ cufftCallbackStoreC d_store_fwd = CB_convolve_and_store; +__device__ cufftCallbackStoreC d_store_fwd_batch = CB_convolve_and_store_batch; + +///////////////////////////////////////////////////////////////////////// +// +// store with output filtering on +// +__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // if offset < nfilt_pos, discard + if (offset < conv_params[2]) + return; + + // if offset > (npt - nfilt_neg), discard + if (offset >= conv_params[3]) + return; + + ((cufftComplex*)dataOut)[offset - conv_params[2]] = d; +} + +__device__ void CB_filtered_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + const unsigned ibatch = offset / conv_params[1]; + const unsigned ipt = offset - (ibatch * conv_params[1]); + + // if ipt < nfilt_pos, discard + if (ipt < conv_params[2]) + return; + + // if ipt > (npt - nfilt_neg), discard + if (ipt >= conv_params[3]) + return; + + // substract the required offsets + offset -= ((ibatch * conv_params[4]) + conv_params[2]); + + ((cufftComplex*)dataOut)[offset] = d; +} + +__device__ cufftCallbackStoreC d_store_bwd = CB_filtered_store; +__device__ cufftCallbackStoreC d_store_bwd_batch = CB_filtered_store_batch; + +void setup_callbacks_ConvolutionCUDA (cufftHandle plan_fwd, cufftHandle plan_bwd, + cufftHandle plan_fwd_batched, cufftHandle plan_bwd_batched, + cufftComplex * d_kernels, int nbatch, cudaStream_t stream) +{ + cudaError_t error; + cufftResult_t result; + + cufftCallbackStoreC h_store_fwd; + cufftCallbackStoreC h_store_bwd; + cufftCallbackStoreC h_store_fwd_batch; + cufftCallbackStoreC h_store_bwd_batch; + + error = cudaMemcpyFromSymbolAsync(&h_store_fwd, d_store_fwd, + sizeof(h_store_fwd), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks", + 
"cudaMemcpyFromSymbolAsync failed for h_store_fwd"); + + error = cudaMemcpyFromSymbolAsync(&h_store_bwd, d_store_bwd, + sizeof(h_store_bwd), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks", + "cudaMemcpyFromSymbolAsync failed for h_store_bwd"); + + error = cudaMemcpyFromSymbolAsync(&h_store_fwd_batch, d_store_fwd_batch, + sizeof(h_store_fwd_batch), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks", + "cudaMemcpyFromSymbolAsync failed for h_store_fwd_batch"); + + error = cudaMemcpyFromSymbolAsync(&h_store_bwd_batch, d_store_bwd_batch, + sizeof(h_store_bwd_batch), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_callbacks", + "cudaMemcpyFromSymbolAsync failed for h_store_bwd_batch"); + + result = cufftXtSetCallback (plan_fwd, (void **)&h_store_fwd, + CUFFT_CB_ST_COMPLEX, (void **)&d_kernels); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd, h_store_fwd)"); + + result = cufftXtSetCallback (plan_bwd, (void **)&h_store_bwd, + CUFFT_CB_ST_COMPLEX, 0); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_bwd, h_store_bwd)"); + + if (nbatch > 0) + { + result = cufftXtSetCallback (plan_fwd_batched, (void **)&h_store_fwd_batch, + CUFFT_CB_ST_COMPLEX, (void **)&d_kernels); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd_batched, h_store_fwd_batch)"); + + result = cufftXtSetCallback (plan_bwd_batched, (void **)&h_store_bwd_batch, + CUFFT_CB_ST_COMPLEX, 0); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback 
(plan_bwd_batched, h_store_bwd_batch)"); + } +} + +void setup_callbacks_conv_params (unsigned * h_ptr, unsigned h_size, cudaStream_t stream) +{ + cudaError_t error = cudaMemcpyToSymbolAsync (conv_params, (void *) h_ptr, + h_size, 0, + cudaMemcpyHostToDevice, stream); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel", + "could not initialize convolution params in device memory"); + } + +} + +// +// +// + +// [0] first_ipt ( nfilt_pos ) +// [1] last_ipt ( npt - nfilt_neg ) +__device__ __constant__ unsigned conv_params_spectral[2]; + +///////////////////////////////////////////////////////////////////////// +// +// store with multiplication by dedispersion kernel +// +__device__ void CB_convolve_and_store_spectral (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel complex float for this element of the FFT + const cufftComplex k = ((cufftComplex *) callerInfo)[offset]; + ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} +__device__ cufftCallbackStoreC d_store_fwd_spectral = CB_convolve_and_store_spectral; + +///////////////////////////////////////////////////////////////////////// +// +// store with output filtering on +// +__device__ void CB_filtered_store_spectral (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // if offset < nfilt_pos, discard + if (offset < conv_params_spectral[0]) + return; + + // if offset > (npt - nfilt_neg), discard + if (offset >= conv_params_spectral[1]) + return; + + ((cufftComplex*)dataOut)[offset - conv_params_spectral[0]] = d; +} +__device__ cufftCallbackStoreC d_store_bwd_spectral = CB_filtered_store_spectral; + + +void setup_callbacks_ConvolutionCUDASpectral (cufftHandle plan_fwd, cufftHandle plan_bwd, cufftComplex * d_kernels, cudaStream_t stream) +{ + cudaError_t error; + cufftResult_t result; + + cufftCallbackStoreC h_store_fwd; + cufftCallbackStoreC h_store_bwd; + + error = 
cudaMemcpyFromSymbolAsync(&h_store_fwd, d_store_fwd_spectral, + sizeof(h_store_fwd), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_callbacks", + "cudaMemcpyFromSymbolAsync failed for h_store_fwd"); + + error = cudaMemcpyFromSymbolAsync(&h_store_bwd, d_store_bwd_spectral, + sizeof(h_store_bwd), 0, + cudaMemcpyDeviceToHost, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_callbacks", + "cudaMemcpyFromSymbolAsync failed for h_store_bwd"); + + result = cufftXtSetCallback (plan_fwd, (void **)&h_store_fwd, + CUFFT_CB_ST_COMPLEX, (void **)&d_kernels); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_callbacks", + "cufftXtSetCallback (plan_fwd, h_store_fwd)"); + + result = cufftXtSetCallback (plan_bwd, (void **)&h_store_bwd, + CUFFT_CB_ST_COMPLEX, 0); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_callbacks", + "cufftXtSetCallback (plan_bwd, h_store_bwd)"); +} + +void setup_callbacks_conv_params_spectral (unsigned * h_ptr, unsigned h_size, cudaStream_t stream) +{ + cudaError_t error = cudaMemcpyToSymbolAsync (conv_params_spectral, (void *) h_ptr, + h_size, 0, cudaMemcpyHostToDevice, stream); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel", + "could not initialize convolution params in device memory"); + } +} + + + + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,797 @@ +//-*-C++-*- + +/*************************************************************************** + * + * 
Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/ConvolutionCUDA.h" +#include "CUFFTError.h" +#include "debug.h" + +#if HAVE_CUFFT_CALLBACKS +#include "dsp/ConvolutionCUDACallbacks.h" +#include +#endif + +#include +#include + +using namespace std; + +void check_error_stream (const char*, cudaStream_t); + +__global__ void k_multiply_conv (float2* d_fft, const __restrict__ float2 * kernel, unsigned npart) +{ + const unsigned npt = blockDim.x * gridDim.x; + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + + // load the kernel for this fine channel + const float2 k = kernel[i]; + + while (i < npt * npart) + { + d_fft[i] = cuCmulf(d_fft[i], k); + i += npt; + } +} + +__global__ void k_ncopy_conv (float2* output_data, unsigned output_stride, + const float2* input_data, unsigned input_stride, + unsigned to_copy) +{ + // shift the input forward FFT by the required number of batches + input_data += blockIdx.y * input_stride; + + // shift in output forward + output_data += blockIdx.y * output_stride; + + unsigned index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < to_copy) + output_data[index] = input_data[index]; +} + + +#if HAVE_CUFFT_CALLBACKS +/* +// [0] channel offset ( ichan * npt) +// [1] npt +// [2] first_ipt ( nfilt_pos ) +// [3] last_ipt ( npt - nfilt_neg ) +// [4] nfilt_tot +__device__ __constant__ unsigned conv_params[5]; + +///////////////////////////////////////////////////////////////////////// +// +// store with multiplication by dedispersion kernel [no FFT batching] +// +__device__ void CB_convolve_and_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel complex float for this element of the FFT + const cufftComplex k = ((cufftComplex *) callerInfo)[conv_params[0] + offset]; + ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} + 
+__device__ void CB_convolve_and_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel value for this element of the FFT + const unsigned kernel_offset = conv_params[0] + (offset % conv_params[1]); + const cufftComplex k = ((cufftComplex *) callerInfo)[kernel_offset]; + + ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} +__device__ cufftCallbackStoreC d_store_fwd = CB_convolve_and_store; +__device__ cufftCallbackStoreC d_store_fwd_batch = CB_convolve_and_store_batch; + +///////////////////////////////////////////////////////////////////////// +// +// store with output filtering on +// +__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // if offset < nfilt_pos, discard + if (offset < conv_params[2]) + return; + + // if offset > (npt - nfilt_neg), discard + if (offset >= conv_params[3]) + return; + + ((cufftComplex*)dataOut)[offset - conv_params[2]] = d; +} + +__device__ void CB_filtered_store_batch (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + const unsigned ibatch = offset / conv_params[1]; + const unsigned ipt = offset - (ibatch * conv_params[1]); + + // if ipt < nfilt_pos, discard + if (ipt < conv_params[2]) + return; + + // if ipt > (npt - nfilt_neg), discard + if (ipt >= conv_params[3]) + return; + + // substract the required offsets + offset -= ((ibatch * conv_params[4]) + conv_params[2]); + + ((cufftComplex*)dataOut)[offset] = d; +} + +__device__ cufftCallbackStoreC d_store_bwd = CB_filtered_store; +__device__ cufftCallbackStoreC d_store_bwd_batch = CB_filtered_store_batch; +*/ +#endif + +CUDA::ConvolutionEngine::ConvolutionEngine (cudaStream_t _stream) +{ + stream = _stream; + + // create plan handles + cufftResult result; + + result = cufftCreate (&plan_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::ConvolutionEngine", + 
"cufftCreate(plan_fwd)"); + + result = cufftCreate (&plan_fwd_batched); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::ConvolutionEngine", + "cufftCreate(plan_fwd_batched)"); + + result = cufftCreate (&plan_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::ConvolutionEngine", + "cufftCreate(plan_bwd)"); + + result = cufftCreate (&plan_bwd_batched); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::ConvolutionEngine", + "cufftCreate(plan_bwd_batched)"); + + nbatch = 0; + npt_fwd = 0; + npt_bwd = 0; + + work_area = 0; + work_area_size = 0; + + buf = 0; + d_kernels = 0; +} + +CUDA::ConvolutionEngine::~ConvolutionEngine() +{ + cufftResult result; + + result = cufftDestroy (plan_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine", + "cufftDestroy(plan_fwd)"); + + result = cufftDestroy (plan_fwd_batched); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine", + "cufftDestroy(plan_fwd)"); + + result = cufftDestroy (plan_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine", + "cufftDestroy(plan_bwd)"); + + result = cufftDestroy (plan_bwd_batched); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::~ConvolutionEngine", + "cufftDestroy(plan_bwd_batched)"); + + if (work_area) + { + cudaError_t error = cudaFree (work_area); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::~ConvolutionEngine", + "cudaFree(%xu): %s", &work_area, + cudaGetErrorString (error)); + } +} + +void CUDA::ConvolutionEngine::set_scratch (void * scratch) +{ + d_scratch = (cufftComplex *) scratch; +} + +// prepare all relevant attributes for the engine +void CUDA::ConvolutionEngine::prepare (dsp::Convolution * convolution) +{ + const dsp::Response* response = 
convolution->get_response(); + + npt_bwd = response->get_ndat(); + npt_fwd = convolution->get_minimum_samples(); + nsamp_overlap = convolution->get_minimum_samples_lost(); + nsamp_step = npt_fwd - nsamp_overlap; + nfilt_pos = response->get_impulse_pos (); + nfilt_neg = response->get_impulse_neg (); + + if (convolution->get_input()->get_state() == Signal::Nyquist) + type_fwd = CUFFT_R2C; + else + type_fwd = CUFFT_C2C; + + // configure the dedispersion kernel + setup_kernel (convolution->get_response()); + + // configure the singular FFT + setup_singular (); + + // it is only more efficient to batch about to about 1M points + // at least on the TitanX, so lets choose the number of batches + // based on that + unsigned npart = 1048576 / npt_fwd; + + if (npart > 1) + setup_batched (npart); + else + nbatch = 0; + +#if HAVE_CUFFT_CALLBACKS + setup_callbacks_ConvolutionCUDA (plan_fwd, plan_bwd, plan_fwd_batched, plan_bwd_batched, d_kernels, nbatch, stream); +#endif + + // initialize the kernel size configuration + mp.init(); + mp.set_nelement (npt_bwd); +} + +// setup the convolution kernel based on the reposnse +void CUDA::ConvolutionEngine::setup_kernel (const dsp::Response * response) +{ + unsigned nchan = response->get_nchan(); + unsigned ndat = response->get_ndat(); + unsigned ndim = response->get_ndim(); + + assert (ndim == 2); + assert (d_kernels == 0); + + // allocate memory for dedispersion kernel of all channels + unsigned kernels_size = ndat * sizeof(cufftComplex) * nchan; + cudaError_t error = cudaMalloc ((void**)&d_kernels, kernels_size); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel", + "could not allocate device memory for dedispersion kernel"); + } + + // copy all kernels from host to device + const float* kernel = response->get_datptr (0,0); + + cerr << "CUDA::ConvolutionEngine::setup_kernel cudaMemcpy stream=" << stream + << " size=" << kernels_size << endl; + if (stream) + error = cudaMemcpyAsync 
(d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice, stream); + else + error = cudaMemcpy (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel", + "could not copy dedispersion kernel to device"); + } + +#if HAVE_CUFFT_CALLBACKS + error = cudaMallocHost ((void **) h_conv_params, sizeof(unsigned) * 5); + if (error != cudaSuccess) + throw Error (InvalidState, "CUDA::ConvolutionEngine::setup_kernel", + "could not allocate memory for h_conv_params"); + + h_conv_params[0] = 0; + h_conv_params[1] = npt_bwd; + h_conv_params[2] = nfilt_pos; + h_conv_params[3] = npt_bwd - nfilt_neg; + h_conv_params[4] = nfilt_pos + nfilt_neg; + + setup_callbacks_conv_params (h_conv_params, sizeof(h_conv_params), stream); + +#endif +} + +void CUDA::ConvolutionEngine::setup_singular () +{ + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngine::setup_singular fwd=" << npt_fwd + << " bwd=" << npt_bwd << endl; + + // setup forward plan + cufftResult result = cufftPlan1d (&plan_fwd, npt_fwd, type_fwd, 1); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular", + "cufftPlan1d(plan_fwd)"); + + result = cufftSetStream (plan_fwd, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular", + "cufftSetStream(plan_fwd)"); + + // setup backward plan + result = cufftPlan1d (&plan_bwd, npt_bwd, CUFFT_C2C, 1); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular", + "cufftPlan1d(plan_bwd)"); + + result = cufftSetStream (plan_bwd, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_singular", + "cufftSetStream(plan_bwd)"); + + size_t buffer_size = npt_bwd * sizeof (cufftComplex); + cudaError_t error = cudaMalloc ((void **) &buf, buffer_size); + if (error != cudaSuccess) + throw Error (FailedCall, 
"CUDA::ConvolutionEngine::setup_singular", + "cudaMalloc(%x, %u): %s", &buf, buffer_size, + cudaGetErrorString (error)); +} + + +// configure the singular and batched FFT plans +void CUDA::ConvolutionEngine::setup_batched (unsigned _nbatch) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngine::setup_batched npt_fwd=" << npt_fwd + << " npt_bwd=" << npt_bwd << " nbatch=" << _nbatch << endl; + + nbatch = _nbatch; + + int rank = 1; + int inembed[1]; + int onembed[1]; + int istride, ostride, idist, odist; + cufftResult result; + + // now setup the forward batched plan + size_t work_size_fwd, work_size_bwd; + + // complex layout plans for input + inembed[0] = npt_fwd; + onembed[0] = npt_bwd; + + istride = 1; + ostride = 1; + + // the fordward FFT only moves forward a shorter amount + idist = nsamp_step; + odist = npt_bwd; + + // setup forward fft + result = cufftMakePlanMany (plan_fwd_batched, rank, &npt_fwd, + inembed, istride, idist, + onembed, ostride, odist, + type_fwd, nbatch, &work_size_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftMakePlanMany (plan_fwd_batched)"); + + result = cufftSetStream (plan_fwd_batched, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftSetStream(plan_fwd_batched)"); + + // get a rough estimate on work buffer size + work_size_fwd = 0; + result = cufftEstimateMany(rank, &npt_fwd, + inembed, istride, idist, + onembed, ostride, odist, + type_fwd, nbatch, &work_size_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftEstimateMany(plan_fwd)"); + + // complex layout plans for input + inembed[0] = npt_bwd; + onembed[0] = nsamp_step; + + istride = 1; + ostride = 1; + + // the fordward FFT only moves forward a shorter amount + idist = npt_bwd; + odist = nsamp_step; + + // the backward FFT is a has a simple layout (npt_bwd) + 
DEBUG("CUDA::ConvolutionEngine::setup_batched cufftMakePlanMany (plan_bwd_batched)"); + result = cufftMakePlanMany (plan_bwd_batched, rank, &npt_bwd, + inembed, istride, idist, + onembed, ostride, odist, + CUFFT_C2C, nbatch, &work_size_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftMakePlanMany (plan_bwd_batched)"); + + result = cufftSetStream (plan_bwd_batched, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftSetStream(plan_bwd_batched)"); + + DEBUG("CUDA::ConvolutionEngine::setup_batched bwd FFT plan set"); + + work_size_bwd = 0; + result = cufftEstimateMany(rank, &npt_bwd, + inembed, istride, idist, + onembed, ostride, odist, + CUFFT_C2C, nbatch, &work_size_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftEstimateMany(plan_fwd)"); + + work_area_size = (work_size_fwd > work_size_bwd) ? 
work_size_fwd : work_size_bwd; + auto_allocate = work_area_size > 0; + + DEBUG("CUDA::ConvolutionEngine::setup_batched cufftSetAutoAllocation(plan_fwd)"); + result = cufftSetAutoAllocation(plan_fwd_batched, auto_allocate); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftSetAutoAllocation(plan_bwd_batched, %d)", + auto_allocate); + + DEBUG("CUDA::ConvolutionEngine::setup_batched cufftSetAutoAllocation(plan_bwd_batched)"); + result = cufftSetAutoAllocation(plan_bwd_batched, auto_allocate); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_batched", + "cufftSetAutoAllocation(plan_bwd_batched, %d)", auto_allocate); + + // free the space allocated for buf in setup_singular + cudaError_t error = cudaFree (buf); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched", + "cudaFree(%x): %s", &buf, cudaGetErrorString (error)); + + size_t batched_buffer_size = npt_bwd * nbatch * sizeof (cufftComplex); + error = cudaMalloc ((void **) &buf, batched_buffer_size); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup_batched", + "cudaMalloc(%x, %u): %s", &buf, batched_buffer_size, + cudaGetErrorString (error)); + + // allocate device memory for dedispsersion kernel (1 channel) + + if (work_area_size > 0) + { + if (work_area) + { + error = cudaFree (work_area); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngine::setup", + "cudaFree(%xu): %s", &work_area, + cudaGetErrorString (error)); + } + DEBUG("CUDA::ConvolutionEngine::setup cudaMalloc("< 0) + { + result = cufftXtSetCallback (plan_fwd_batched, (void **)&h_store_fwd_batch, + CUFFT_CB_ST_COMPLEX, (void **)&d_kernels); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd_batched, h_store_fwd_batch)"); + + result = cufftXtSetCallback 
(plan_bwd_batched, (void **)&h_store_bwd_batch, + CUFFT_CB_ST_COMPLEX, 0); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_bwd_batched, h_store_bwd_batch)"); + } +} +*/ +#endif + + +// Perform convolution choosing the optimal batched size or if ndat is not as +// was configured, then perform singular +void CUDA::ConvolutionEngine::perform (const dsp::TimeSeries* input, dsp::TimeSeries * output, unsigned npart) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngine::perform (" << npart << ")" << endl; + + if (npart == 0) + return; + + if (type_fwd == CUFFT_C2C) + perform_complex (input, output, npart); + else + perform_real (input, output, npart); + +} + +void CUDA::ConvolutionEngine::perform_complex (const dsp::TimeSeries* input, + dsp::TimeSeries * output, + unsigned npart) +{ + const unsigned npol = input->get_npol(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + + cufftComplex * in; + cufftComplex * out; + cufftResult result; + + const unsigned in_step_batch = nsamp_step * nbatch; + const unsigned out_step_batch = nsamp_step * nbatch; + + unsigned nbp = 0; + if (nbatch > 0) + nbp = npart / nbatch; + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngine::perform_complex npart=" << npart + << " nbatch=" << nbatch + << " npb=" << nbp << " nsamp_step=" << nsamp_step << endl; + +#if !HAVE_CUFFT_CALLBACKS + dim3 blocks = dim3 (nsamp_step, nbatch, 0); + if (nsamp_step % mp.get_nthread()) + blocks.x++; +#endif + + for (unsigned ichan=0; ichanget_datptr (ichan, ipol); + out = (cufftComplex *) output->get_datptr (ichan, ipol); + + // for each batched FFT + for (unsigned i=0; i>> (buf, + d_kernels + k_offset, + nbatch); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd_batched, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, 
"CUDA::ConvolutionEngine::perform_complex", + "cufftExecC2C(plan_bwd_batched)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, nsamp_step, + buf + nfilt_pos, npt_bwd, + out_step_batch); +#endif + + out += out_step_batch; + in += in_step_batch; + } + + for (unsigned ipart=nbp*nbatch; ipart>> (buf, + d_kernels + k_offset, + 1); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::perform", + "cufftExecC2C(plan_bwd_batched)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, nsamp_step, + buf + nfilt_pos, npt_bwd, + nsamp_step); +#endif + + in += nsamp_step; + out += nsamp_step; + } + } + } + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream( "CUDA::ConvolutionEngine::perform_complex", stream ); +} + +void CUDA::ConvolutionEngine::perform_real(const dsp::TimeSeries* input, + dsp::TimeSeries * output, + unsigned npart) +{ + const unsigned npol = input->get_npol(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + + cufftReal * in; + cufftComplex * out; + cufftResult result; + + const unsigned out_nsamp_step = nsamp_step / 2; + + const unsigned in_step_batch = nsamp_step * nbatch; + const unsigned out_step_batch = out_nsamp_step * nbatch; + + unsigned nbp = 0; + if (nbatch > 0) + nbp = npart / nbatch; + + dim3 blocks = dim3 (out_nsamp_step, nbatch, 0); + if (out_nsamp_step % mp.get_nthread()) + blocks.x++; + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngine::perform_real nsamp_step=" << nsamp_step + << " npt_bwd=" << npt_bwd << endl; + + for (unsigned ichan=0; ichanget_datptr (ichan, ipol); + out = (cufftComplex *) output->get_datptr (ichan, ipol); + + // for each batched FFT + for (unsigned i=0; i>> (buf, + d_kernels + k_offset, + nbatch); + + // perform the inverse batched FFT (in-place) + result = 
cufftExecC2C (plan_bwd_batched, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::perform_real", + "cufftExecC2C(plan_bwd_batched)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, out_nsamp_step, + buf + nfilt_pos, npt_bwd, + out_step_batch); + + in += in_step_batch; + out += out_step_batch; + } + + for (unsigned ipart=nbp*nbatch; ipart>> (buf, + d_kernels + k_offset, + 1); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::perform", + "cufftExecC2C(plan_bwd_batched)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, out_nsamp_step, + buf + nfilt_pos, npt_bwd, + out_step_batch); + in += nsamp_step; + out += out_nsamp_step; + } + } + } + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream( "CUDA::ConvolutionEngine::perform_real", stream ); +} diff -Nru bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDASpectral.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDASpectral.cu --- bl-dspsr-0+git20160405/Signal/General/ConvolutionCUDASpectral.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/ConvolutionCUDASpectral.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,618 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/ConvolutionCUDASpectral.h" +#include "CUFFTError.h" +#include "debug.h" + +#if HAVE_CUFFT_CALLBACKS +#include "dsp/ConvolutionCUDACallbacks.h" +#include +#endif + +#include +#include + +using namespace std; + +void check_error_stream (const char*, cudaStream_t); + +// ichan == blockIdx.y 
+// ipt_bwd == blockIdx.x * blockDim.x + threadIdx.x +__global__ void k_multiply_conv_spectral (float2* d_fft, const __restrict__ float2 * kernel, unsigned npt_bwd) +{ + const unsigned idx = (blockIdx.y * npt_bwd) + (blockIdx.x * blockDim.x) + threadIdx.x; + d_fft[idx] = cuCmulf(d_fft[idx], kernel[idx]); +} + +// ichan == blockIdx.y +// ipt_bwd == blockIdx.x * blockDim.x + threadIdx.x +__global__ void k_ncopy_conv_spectral (float2* output_data, uint64_t ostride, + const float2* input_data, uint64_t istride, + unsigned nfilt_pos, unsigned nsamp_step) +{ + + const unsigned idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (idx < nfilt_pos) + return; + + uint64_t in_offset = istride * blockIdx.y; + uint64_t out_offset = ostride * blockIdx.y; + + unsigned isamp = idx; + unsigned osamp = idx - nfilt_pos; + + if (osamp < nsamp_step) + output_data[out_offset + osamp] = input_data[in_offset + isamp]; +} + +CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral (cudaStream_t _stream) +{ + stream = _stream; + + // create plan handles + cufftResult result; + + result = cufftCreate (&plan_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral", + "cufftCreate(plan_fwd)"); + + result = cufftCreate (&plan_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::ConvolutionEngineSpectral", + "cufftCreate(plan_bwd)"); + + fft_configured = false; + nchan = 0; + npt_fwd = 0; + npt_bwd = 0; + + work_area = 0; + work_area_size = 0; + + buf = 0; + d_kernels = 0; +} + +CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral() +{ + cufftResult result; + + result = cufftDestroy (plan_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral", + "cufftDestroy(plan_fwd)"); + + result = cufftDestroy (plan_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, 
"CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral", + "cufftDestroy(plan_bwd)"); + + if (work_area) + { + cudaError_t error = cudaFree (work_area); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral", + "cudaFree(%xu): %s", &work_area, + cudaGetErrorString (error)); + } + + if (buf) + { + cudaError_t error = cudaFree (buf); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::~ConvolutionEngineSpectral", + "cudaFree(%xu): %s", &buf, + cudaGetErrorString (error)); + } +} + +void CUDA::ConvolutionEngineSpectral::regenerate_plans() +{ + cufftResult result; + result = cufftDestroy (plan_fwd); + result = cufftCreate (&plan_fwd); + + result = cufftDestroy (plan_bwd); + result = cufftCreate (&plan_bwd); +} + +void CUDA::ConvolutionEngineSpectral::set_scratch (void * scratch) +{ + d_scratch = (cufftComplex *) scratch; +} + +// prepare all relevant attributes for the engine +void CUDA::ConvolutionEngineSpectral::prepare (dsp::Convolution * convolution) +{ + const dsp::Response* response = convolution->get_response(); + + nchan = response->get_nchan(); + npt_bwd = response->get_ndat(); + npt_fwd = convolution->get_minimum_samples(); + nsamp_overlap = convolution->get_minimum_samples_lost(); + nsamp_step = npt_fwd - nsamp_overlap; + nfilt_pos = response->get_impulse_pos (); + nfilt_neg = response->get_impulse_neg (); + + if (convolution->get_input()->get_state() == Signal::Nyquist) + type_fwd = CUFFT_R2C; + else + type_fwd = CUFFT_C2C; + + // configure the dedispersion kernel + setup_kernel (convolution->get_response()); + + fft_configured = false; + + // initialize the kernel size configuration + mp.init(); + mp.set_nelement (npt_bwd); +} + +// setup the convolution kernel based on the reposnse +void CUDA::ConvolutionEngineSpectral::setup_kernel (const dsp::Response * response) +{ + unsigned nchan = response->get_nchan(); + unsigned ndat = response->get_ndat(); + 
unsigned ndim = response->get_ndim(); + + assert (ndim == 2); + assert (d_kernels == 0); + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::setup_kernel response: " + << "nchan=" << nchan << " ndat=" << ndat << " ndim=" << ndim << endl; + + // allocate memory for dedispersion kernel of all channels + unsigned kernels_size = ndat * sizeof(cufftComplex) * nchan; + cudaError_t error = cudaMalloc ((void**)&d_kernels, kernels_size); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel", + "could not allocate device memory for dedispersion kernel"); + } + + // copy all kernels from host to device + const float* kernel = response->get_datptr (0,0); + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::setup_kernel cudaMemcpy stream=" + << stream << " size=" << kernels_size << endl; + if (stream) + error = cudaMemcpyAsync (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice, stream); + else + error = cudaMemcpy (d_kernels, kernel, kernels_size, cudaMemcpyHostToDevice); + if (error != cudaSuccess) + { + throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel", + "could not copy dedispersion kernel to device"); + } + +#if HAVE_CUFFT_CALLBACKS + error = cudaMallocHost ((void **) h_conv_params, sizeof(unsigned) * 2); + if (error != cudaSuccess) + throw Error (InvalidState, "CUDA::ConvolutionEngineSpectral::setup_kernel", + "could not allocate memory for h_conv_params"); + + h_conv_params[0] = nfilt_pos; + h_conv_params[1] = npt_bwd - nfilt_neg; + setup_callbacks_conv_params_spectral (h_conv_params, sizeof (h_conv_params), stream); +#endif +} + +// configure the batched FFT plans +void CUDA::ConvolutionEngineSpectral::setup_batched (const dsp::TimeSeries* input, + dsp::TimeSeries * output) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::setup_batched npt_fwd=" << npt_fwd + << " npt_bwd=" << npt_bwd << endl; + + nchan = 
input->get_nchan(); + npol = input->get_npol(); + unsigned ndim = input->get_ndim(); + +#ifdef _DEBUG + cerr << "CUDA::ConvolutionEngineSpectral::setup_batched nchan=" << nchan + << " npol=" << npol << " ndat=" << input->get_ndat() << endl; +#endif + + input_stride = (input->get_datptr (1, 0) - input->get_datptr (0, 0)) / ndim; + output_stride = (output->get_datptr (1, 0) - output->get_datptr (0, 0) ) / ndim; + + int rank = 1; + int inembed[1]; + int onembed[1]; + int istride, ostride, idist, odist; + cufftResult result; + + // now setup the forward batched plan + size_t work_size_fwd, work_size_bwd; + + // complex layout plans for input + inembed[0] = npt_fwd; + onembed[0] = npt_bwd; + + istride = 1; + ostride = 1; + + idist = (int) input_stride; + odist = npt_bwd; + +#ifdef _DEBUG + cerr << "CUDA::ConvolutionEngineSpectral::setup_batched npt_fwd=" << npt_fwd + << " nbatch=" << nchan << endl; + cerr << "CUDA::ConvolutionEngineSpectral::setup_batched input_stride=" + << input_stride << " output_stride=" << output_stride << endl; +#endif + + // setup forward fft + result = cufftMakePlanMany (plan_fwd, rank, &npt_fwd, + inembed, istride, idist, + onembed, ostride, odist, + type_fwd, nchan, &work_size_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftMakePlanMany (plan_fwd)"); + + result = cufftSetStream (plan_fwd, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftSetStream(plan_fwd)"); + + // get a rough estimate on work buffer size + work_size_fwd = 0; + result = cufftEstimateMany(rank, &npt_fwd, + inembed, istride, idist, + onembed, ostride, odist, + type_fwd, nchan, &work_size_fwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftEstimateMany(plan_fwd)"); + + istride = 1; + ostride = 1; + +#ifdef HAVE_CUFFT_CALLBACKS + inembed[0] = npt_bwd; + 
onembed[0] = nsamp_step; + + idist = npt_bwd; + odist = (int) output_stride; +#else + inembed[0] = npt_bwd; + onembed[0] = npt_bwd; + + idist = npt_bwd; + odist = npt_bwd; +#endif + + // the backward FFT is a has a simple layout (npt_bwd) + DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftMakePlanMany (plan_bwd)"); + result = cufftMakePlanMany (plan_bwd, rank, &npt_bwd, + inembed, istride, idist, + onembed, ostride, odist, + CUFFT_C2C, nchan, &work_size_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftMakePlanMany (plan_bwd)"); + + result = cufftSetStream (plan_bwd, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftSetStream(plan_bwd)"); + + DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched bwd FFT plan set"); + + work_size_bwd = 0; + result = cufftEstimateMany(rank, &npt_bwd, + inembed, istride, idist, + onembed, ostride, odist, + CUFFT_C2C, nchan, &work_size_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftEstimateMany(plan_fwd)"); + +/* + work_area_size = (work_size_fwd > work_size_bwd) ? 
work_size_fwd : work_size_bwd; + auto_allocate = work_area_size > 0; + + DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftSetAutoAllocation(plan_fwd)"); + result = cufftSetAutoAllocation(plan_fwd, auto_allocate); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftSetAutoAllocation(plan_bwd, %d)", + auto_allocate); + + DEBUG("CUDA::ConvolutionEngineSpectral::setup_batched cufftSetAutoAllocation(plan_bwd)"); + result = cufftSetAutoAllocation(plan_bwd, auto_allocate); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cufftSetAutoAllocation(plan_bwd, %d)", auto_allocate); + +*/ + // free the space allocated for buf in setup_singular + cudaError_t error; + if (buf) + { + error = cudaFree (buf); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cudaFree(%x): %s", &buf, cudaGetErrorString (error)); + } + + size_t batched_buffer_size = npt_bwd * nchan * sizeof (cufftComplex); + error = cudaMalloc ((void **) &buf, batched_buffer_size); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup_batched", + "cudaMalloc(%x, %u): %s", &buf, batched_buffer_size, + cudaGetErrorString (error)); + + // allocate device memory for dedispsersion kernel (1 channel) +/* + if (work_area_size > 0) + { + if (work_area) + { + error = cudaFree (work_area); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::ConvolutionEngineSpectral::setup", + "cudaFree(%xu): %s", &work_area, + cudaGetErrorString (error)); + } + DEBUG("CUDA::ConvolutionEngineSpectral::setup cudaMalloc("<get_datptr (1, 0) - input->get_datptr (0, 0)) / input->get_ndim(); + uint64_t curr_ostride = (output->get_datptr (1, 0) - output->get_datptr (0, 0)) / output->get_ndim(); + + if (dsp::Operation::verbose) + { + cerr << "CUDA::ConvolutionEngineSpectral::perform istride prev=" << 
input_stride << " curr=" << curr_istride << " ndim=" << input->get_ndim() << endl; + cerr << "CUDA::ConvolutionEngineSpectral::perform ostride prev=" << output_stride << " curr=" << curr_ostride << " ndim=" << +output->get_ndim() << endl; + } + + if (curr_istride != input_stride || curr_ostride != output_stride) + { + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::perform reconfiguring FFT batch sizes" << endl; + fft_configured = false; + } + + if (!fft_configured) + { + regenerate_plans (); + setup_batched (input, output); +#if HAVE_CUFFT_CALLBACKS + cerr << "CUDA::ConvolutionEngineSpectral::perform setup_callbacks_ConvolutionCUDASpectral()" << endl; + setup_callbacks_ConvolutionCUDASpectral (plan_fwd, plan_bwd, d_kernels, stream); +#endif + fft_configured = true; + } + + if (type_fwd == CUFFT_C2C) + { + perform_complex (input, output, npart); + } + else + { + cerr << "CUDA::ConvolutionEngineSpectral::perform_real not implemented" << endl; + //perform_real (input, output, npart); + } +} + +void CUDA::ConvolutionEngineSpectral::perform_complex (const dsp::TimeSeries* input, + dsp::TimeSeries * output, + unsigned npart) +{ + const unsigned npol = input->get_npol(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + const uint64_t ipol_stride = input_stride / npol; + const uint64_t opol_stride = output_stride / npol; + + cufftComplex * in; + cufftComplex * out; + cufftResult result; + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::perform_complex npart=" << npart + << " nsamp_step=" << nsamp_step << endl; + +#if !HAVE_CUFFT_CALLBACKS + dim3 blocks = dim3 (npt_bwd / mp.get_nthread(), nchan); + unsigned nthreads = mp.get_nthread(); + + if (npt_bwd <= nthreads) + { + blocks.x = 1; + nthreads = npt_bwd; + } + else + { + if (npt_bwd % nthreads) + blocks.x++; + } +#endif + + cufftComplex * in_t = (cufftComplex *) input->get_datptr (0, 0); + cufftComplex * out_t = (cufftComplex *) 
output->get_datptr (0, 0); + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::perform_complex in=" << in_t << " out=" << out_t << endl; + + for (unsigned ipart=0; ipart>> (buf, d_kernels, npt_bwd); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform_complex", + "cufftExecC2C(plan_bwd)"); + + // copy batches of output from input + k_ncopy_conv_spectral<<>> (out, output_stride, + buf, npt_bwd, + nfilt_pos, nsamp_step); +#endif + in += ipol_stride; + out += opol_stride; + } + + in_t += nsamp_step; + out_t += nsamp_step; + } + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream( "CUDA::ConvolutionEngineSpectral::perform_complex", stream ); +} + +#if 0 +void CUDA::ConvolutionEngineSpectral::perform_real(const dsp::TimeSeries* input, + dsp::TimeSeries * output, + unsigned npart) +{ + const unsigned npol = input->get_npol(); + const unsigned nchan = input->get_nchan(); + const unsigned ndim = input->get_ndim(); + + cufftReal * in; + cufftComplex * out; + cufftResult result; + + const unsigned out_nsamp_step = nsamp_step / 2; + + const unsigned in_step_batch = nsamp_step * nbatch; + const unsigned out_step_batch = out_nsamp_step * nbatch; + + unsigned nbp = 0; + if (nbatch > 0) + nbp = npart / nbatch; + + dim3 blocks = dim3 (out_nsamp_step, nbatch, 0); + if (out_nsamp_step % mp.get_nthread()) + blocks.x++; + + if (dsp::Operation::verbose) + cerr << "CUDA::ConvolutionEngineSpectral::perform_real nsamp_step=" << nsamp_step + << " npt_bwd=" << npt_bwd << endl; + + for (unsigned ichan=0; ichanget_datptr (ichan, ipol); + out = (cufftComplex *) output->get_datptr (ichan, ipol); + + // for each batched FFT + for (unsigned i=0; i>> (buf, + d_kernels + k_offset, + nbatch); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd, buf, buf, 
CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform_real", + "cufftExecC2C(plan_bwd)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, out_nsamp_step, + buf + nfilt_pos, npt_bwd, + out_step_batch); + + in += in_step_batch; + out += out_step_batch; + } + + for (unsigned ipart=nbp*nbatch; ipart>> (buf, + d_kernels + k_offset, + 1); + + // perform the inverse batched FFT (in-place) + result = cufftExecC2C (plan_bwd, buf, buf, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngineSpectral::perform", + "cufftExecC2C(plan_bwd)"); + + // copy batches of output from input + k_ncopy_conv<<>> (out, out_nsamp_step, + buf + nfilt_pos, npt_bwd, + out_step_batch); + in += nsamp_step; + out += out_nsamp_step; + } + } + } + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error_stream( "CUDA::ConvolutionEngineSpectral::perform_real", stream ); +} +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/cross_detect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/cross_detect.h --- bl-dspsr-0+git20160405/Signal/General/cross_detect.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/cross_detect.h 2018-03-12 23:02:35.000000000 +0000 @@ -4,10 +4,7 @@ * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/cross_detect.h,v $ - $Revision: 1.1 $ - $Date: 2006/10/15 18:56:39 $ - $Author: straten $ */ +// dspsr/Signal/General/cross_detect.h #ifndef __cross_detect_h #define __cross_detect_h diff -Nru bl-dspsr-0+git20160405/Signal/General/cufft_callback_bench.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/cufft_callback_bench.cu --- bl-dspsr-0+git20160405/Signal/General/cufft_callback_bench.cu 1970-01-01 00:00:00.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Signal/General/cufft_callback_bench.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,523 @@ +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include + +#include "CUFFTError.h" +#include "CommandLine.h" +#include "RealTimer.h" + +#include +#include +#include +#include + +using namespace std; + +__global__ void k_unpack (cuFloatComplex * output, const __restrict__ char2 * input, const float scale) +{ + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + + char2 element = input[i]; + output[i] = make_cuComplex ((float) element.x/scale, (float) element.y/scale); +} + +__global__ void k_multiply (float2* d_fft, const __restrict__ float2 * kernel, unsigned npart) +{ + const unsigned npt = blockDim.x * gridDim.x; + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + + // load the kernel for this fine channel + const float2 k = kernel[i]; + + while (i < npt * npart) + { + const float2 d = d_fft[i]; + const float x = d.x * k.x - d.y * k.y; + d_fft[i].y = d.x * k.y + d.y * k.x; + d_fft[i].x = x; + i += npt; + } +} + +///////////////////////////////////////////////////////////////////////// +// +// store with multiplication by dedispersion kernel +// +__device__ void CB_convolve_and_storeC (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + // the dedispersion kernel value for this element of the FFT + const cufftComplex k = ((cufftComplex *) callerInfo)[offset]; + ((cufftComplex*)dataOut)[offset] = cuCmulf (d, k); +} + +__device__ cufftCallbackStoreC d_store_fwd_C = CB_convolve_and_storeC; + + +///////////////////////////////////////////////////////////////////////// +// +// convert an 8bit number to 32 bit +// +__device__ 
cufftComplex cufft_callback_load_8bit( + void *dataIn, + size_t offset, + void *callerInfo, + void *sharedPtr) +{ + const __restrict__ char2 in = ((char2 *)dataIn)[offset]; + const float scale = 127.0f; + return make_cuComplex ((float)in.x/scale, (float) in.y/scale); + //char2 in = ((char2*)dataIn)[offset]; + //float2 out; + //out.x = (float) in.x / scale; + //out.y = (float) in.y / scale; + + //return out; + //return make_cuComplex ((float) element.x, (float) element.y); + //return make_cuComplex ((float) element.x/scale, (float) element.y/scale); +} +__device__ cufftCallbackLoadC d_load_8bit_fwd_C = cufft_callback_load_8bit; + + +///////////////////////////////////////////////////////////////////////// +// +// convert an 16bit number to 32 bit +// +__device__ cufftComplex cufft_callback_load_half2( + void *dataIn, + size_t offset, + void *callerInfo, + void *sharedPtr) +{ + half * ptr = (half*) dataIn + (2*offset); + return make_cuComplex ( __half2float(ptr[0]), __half2float(ptr[1])); +} + +__device__ cufftCallbackLoadC d_load_half2_fwd_C = cufft_callback_load_half2; + + +///////////////////////////////////////////////////////////////////////// +// +// store with output filtering on +// +__device__ void CB_filtered_store (void * dataOut, size_t offset, cufftComplex d, void * callerInfo, void *sharedPtr) +{ + unsigned nfilt_pos = ((unsigned *) callerInfo)[0]; + unsigned nsamp_filt = ((unsigned *) callerInfo)[1]; + + offset -= nfilt_pos; + if ((offset > 0) && (offset < nsamp_filt)) + ((cufftComplex*)dataOut)[offset] = d; +} + +__device__ cufftCallbackStoreC d_store_bwd_C = CB_filtered_store; + +class Speed : public Reference::Able +{ +public: + + Speed (); + + // parse command line options + void parseOptions (int argc, char** argv); + + // run the test + void runTest (); + +protected: + + int npt; + int niter; + unsigned gpu_id; + bool cuda; +}; + + +Speed::Speed () +{ + gpu_id = 0; + niter = 16; + npt = 1024; + cuda = false; +} + +int main(int argc, char** argv) 
try +{ + Speed speed; + speed.parseOptions (argc, argv); + speed.runTest (); + return 0; +} +catch (Error& error) +{ + cerr << error << endl; + return -1; +} + +void Speed::parseOptions (int argc, char** argv) +{ + CommandLine::Menu menu; + CommandLine::Argument* arg; + + menu.set_help_header ("undersampling_speed - measure under sampling speed"); + menu.set_version ("undersampling_speed version 1.0"); + + arg = menu.add (npt, 'n', "npt"); + arg->set_help ("number of points in each FFT"); + +#if HAVE_CUFFT + arg = menu.add (gpu_id, 'd'); + arg->set_help ("GPU device ID"); +#endif + + arg = menu.add (niter, 't', "ninter"); + arg->set_help ("number of iterations (batch/loops)"); + +#if HAVE_CUFFT + arg = menu.add (cuda, "cuda"); + arg->set_help ("benchmark CUDA"); +#endif + + menu.parse (argc, argv); +} + +#if HAVE_CUFFT +void check_error_stream (const char*, cudaStream_t); +#endif + +void Speed::runTest () +{ +#ifdef _DEBUG + dsp::Operation::verbose = true; + dsp::Observation::verbose = true; +#endif + + // assume complex FFTs + const unsigned ndim = 2; + + cudaStream_t stream = 0; + if (cuda) + { + cerr << "using GPU " << gpu_id << endl; + cudaError_t err = cudaSetDevice(gpu_id); + if (err != cudaSuccess) + throw Error (InvalidState, "undersampling_speed", + "cudaSetDevice failed: %s", cudaGetErrorString(err)); + + err = cudaStreamCreate( &stream ); + if (err != cudaSuccess) + throw Error (InvalidState, "undersampling_speed", + "cudaStreamCreate failed: %s", cudaGetErrorString(err)); + + } + + const unsigned ndat = npt * niter; + const unsigned raw_size = ndat * ndim * sizeof(int8_t); + const unsigned half2_size = ndat * ndim * sizeof(half); + const unsigned unpacked_size = ndat * ndim * sizeof(float); + const unsigned kernel_size = npt * sizeof (cuFloatComplex); + + char2 * raw; + half2 * input_h2; + cufftComplex * input; + cufftComplex * buffer; + cufftComplex * output; + cufftComplex * d_kernel; + unsigned * d_offsets; + cufftResult result; + size_t work_size; + 
+ cudaMalloc ((void **) &raw, raw_size); + cudaMalloc ((void **) &input_h2, half2_size); + cudaMalloc ((void **) &input, unpacked_size); + cudaMalloc ((void **) &buffer, unpacked_size); + cudaMalloc ((void **) &output, unpacked_size); + cudaMalloc ((void **) &d_kernel, kernel_size); + cudaMalloc ((void **) &d_offsets, 2 * sizeof(unsigned)); + + cudaMemsetAsync ((void *) raw, 0, raw_size, stream); + cudaMemsetAsync ((void *) input, 0, unpacked_size, stream); + cudaMemsetAsync ((void *) input_h2, 0, half2_size, stream); + cudaMemsetAsync ((void *) d_kernel, 0, kernel_size, stream); + + unsigned * h_offsets; + cudaMallocHost((void **) &h_offsets, 2 * sizeof(unsigned)); + h_offsets[0] = (unsigned) (npt / 15); + h_offsets[1] = (unsigned) (npt / 15); + + cudaMemcpyAsync ((void *) d_offsets, (void *) h_offsets, 2 * sizeof(unsigned), cudaMemcpyHostToDevice, stream); + + // all plans are using batched FFTs to ensure at least 1M points + + cufftHandle plan_batch; + result = cufftCreate (&plan_batch); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_batch)"); + + int rank = 1; + result = cufftMakePlanMany (plan_batch, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_batch)"); + + result = cufftSetStream (plan_batch, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_batch)"); + + + cufftHandle plan_callback; + result = cufftCreate (&plan_callback); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_callback)"); + + result = cufftMakePlanMany (plan_callback, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_callback)"); + + result = cufftSetStream (plan_callback, stream); + if (result != 
CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_callback)"); + + cufftHandle plan_half; + result = cufftCreate (&plan_half); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_half)"); + + result = cufftMakePlanMany (plan_half, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_half)"); + + result = cufftSetStream (plan_half, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_half)"); + + cufftHandle plan_bwd; + result = cufftCreate (&plan_bwd); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_bwd)"); + + result = cufftMakePlanMany (plan_bwd, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_bwd)"); + + result = cufftSetStream (plan_bwd, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_bwd)"); + + cufftHandle plan_bwd_cb; + result = cufftCreate (&plan_bwd_cb); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_bwd_cb)"); + + result = cufftMakePlanMany (plan_bwd_cb, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_bwd_cb)"); + + result = cufftSetStream (plan_bwd_cb, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_bwd_cb)"); + + + + + RealTimer timer_batch; + RealTimer timer_callback; + RealTimer timer_half; + RealTimer timer_; + + cufftCallbackLoadC h_load_8bit_fwd_C; + cufftCallbackLoadC h_load_half2_fwd_C; + cufftCallbackStoreC h_store_fwd_C; + cufftCallbackStoreC h_store_bwd_C; + cudaError_t 
error; + + error = cudaMemcpyFromSymbolAsync(&h_load_8bit_fwd_C, + d_load_8bit_fwd_C, + sizeof(h_load_8bit_fwd_C), + 0, + cudaMemcpyDeviceToHost, + stream); + if (error != cudaSuccess) + throw Error (FailedCall, "Speed::runTest", + "cudaMemcpyFromSymbolAsync failed for h_load_8bit_fwd_C"); + + + error = cudaMemcpyFromSymbolAsync(&h_load_half2_fwd_C, + d_load_half2_fwd_C, + sizeof(h_load_half2_fwd_C), + 0, + cudaMemcpyDeviceToHost, + stream); + if (error != cudaSuccess) + throw Error (FailedCall, "Speed::runTest", + "cudaMemcpyFromSymbolAsync failed for h_load_half2_fwd_C"); + + error = cudaMemcpyFromSymbolAsync(&h_store_fwd_C, + d_store_fwd_C, + sizeof(h_store_fwd_C), + 0, + cudaMemcpyDeviceToHost, + stream); + if (error != cudaSuccess) + throw Error (FailedCall, "Speed::runTest", + "cudaMemcpyFromSymbolAsync failed for h_store_fwd_C"); + + error = cudaMemcpyFromSymbolAsync(&h_store_bwd_C, + d_store_bwd_C, + sizeof(h_store_bwd_C), + 0, + cudaMemcpyDeviceToHost, + stream); + if (error != cudaSuccess) + throw Error (FailedCall, "Speed::runTest", + "cudaMemcpyFromSymbolAsync failed for h_store_bwd_C"); + + result = cufftXtSetCallback (plan_callback, + (void **)&h_load_8bit_fwd_C, + CUFFT_CB_LD_COMPLEX, + 0); + if (result == CUFFT_LICENSE_ERROR) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "CUFFT Callback invalid license"); + cerr << "result=" << result << endl; + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd, h_load_8bit_fwd_C)"); + +/* + result = cufftXtSetCallback (plan_half, + (void **)&h_load_half2_fwd_C, + CUFFT_CB_LD_COMPLEX, + 0); + if (result == CUFFT_LICENSE_ERROR) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "CUFFT Callback invalid license"); + cerr << "result=" << result << endl; + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd, 
h_load_half2_fwd_C)"); +*/ + + result = cufftXtSetCallback (plan_callback, + (void **)&h_store_fwd_C, + CUFFT_CB_ST_COMPLEX, + (void **)&d_kernel); + if (result == CUFFT_LICENSE_ERROR) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "CUFFT Callback invalid license"); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_fwd, h_store_fwd_C)"); + + result = cufftXtSetCallback (plan_bwd_cb, + (void **)&h_store_bwd_C, + CUFFT_CB_ST_COMPLEX, + (void **)&d_offsets); + if (result == CUFFT_LICENSE_ERROR) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "CUFFT Callback invalid license"); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::ConvolutionEngine::setup_callbacks", + "cufftXtSetCallback (plan_bwd_cb, h_store_bwd_C)"); + + + cudaStreamSynchronize (stream); +/* + timer_half.start(); + + result = cufftExecC2C (plan_half, (cufftComplex *) input_h2, output, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_half)"); + cudaStreamSynchronize(stream); + + timer_half.stop(); +*/ + + timer_callback.start (); + + result = cufftExecC2C (plan_callback, (cuFloatComplex *) raw, buffer, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_callback)"); + + result = cufftExecC2C (plan_bwd_cb, output, buffer, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_callback)"); + + cudaStreamSynchronize(stream); + + timer_callback.stop (); + double total_time, time_per_fft, time_us; + + total_time = timer_callback.get_elapsed(); + time_per_fft = total_time / niter; + time_us = time_per_fft * 1e6; + cerr << "CALLBACK: total_time=" << total_time << " time_per_fft=" << time_per_fft + << " time_us=" << time_us << endl; + + timer_batch.start (); + + unsigned nthreads = 1024; + 
unsigned nblocks = ndat / nthreads; + if (ndat % nthreads != 0) + nblocks++; + + k_unpack<<>> (input, raw, 127.0f); + + result = cufftExecC2C (plan_batch, input, buffer, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_batch)"); + + nthreads = 1024; + nblocks = npt / nthreads; + if (npt % nthreads) + nblocks++; + + k_multiply<<>> (buffer, d_kernel, niter); + + result = cufftExecC2C (plan_bwd, buffer, buffer, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftExecC2C(plan_callback)"); + + cufftComplex * ou = output; + cufftComplex * in = buffer; + + for (unsigned i=0; i= 6050 + case CUFFT_NOT_IMPLEMENTED: + return "Not Implemented"; + case CUFFT_LICENSE_ERROR: + return "License error"; +#endif } return "unrecognized cufftResult"; } diff -Nru bl-dspsr-0+git20160405/Signal/General/Dedispersion.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Dedispersion.C --- bl-dspsr-0+git20160405/Signal/General/Dedispersion.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Dedispersion.C 2018-03-12 23:02:35.000000000 +0000 @@ -11,6 +11,7 @@ #include "ThreadContext.h" #include "Error.h" + #include using namespace std; @@ -209,12 +210,33 @@ note that each time sample depends upon the preceding impulse_pos points. 
*/ + +unsigned smearing_samples_threshold = 16 * 1024 * 1024; + void dsp::Dedispersion::prepare () { if (!smearing_samples_set) { + unsigned threshold = smearing_samples_threshold / nchan; + supported_channels = vector (nchan, true); + unsigned ichan = 0; + + while( (impulse_neg = smearing_samples (-1)) > threshold ) + { + supported_channels[ichan] = false; + ichan ++; + if (ichan == nchan) + throw Error (InvalidState, + "dsp::Dedispersion::prepare", + "smearing samples=%u exceeds threshold=%u", + impulse_neg, threshold); + } + + if (verbose) + cerr << "dsp::Dedispersion::prepare " + << ichan << " unsupported channels" << endl; + impulse_pos = smearing_samples (1); - impulse_neg = smearing_samples (-1); } if (psrdisp_compatible) @@ -223,13 +245,6 @@ " using symmetric impulse response function" << endl; impulse_pos = impulse_neg; } - -#if 0 - // test the effect of a possibly common error in the interpretation of HR75 - impulse_pos += impulse_neg; - impulse_neg = 0; -#endif - } @@ -354,6 +369,9 @@ //! 
Return the effective number of smearing samples unsigned dsp::Dedispersion::get_effective_smearing_samples () const { + if (verbose) + cerr << "dsp::Dedispersion::get_effective_smearing_samples" << endl; + return smearing_samples (0); } @@ -372,6 +390,13 @@ double ch_abs_bw = abs_bw / double(nchan); double lower_ch_cfreq = centre_frequency - (abs_bw - ch_abs_bw) / 2.0; + unsigned ichan=0; + while (ichan < supported_channels.size() && !supported_channels[ichan]) + { + lower_ch_cfreq += ch_abs_bw; + ichan++; + } + // calculate the smearing (in the specified half of the band) if (half) { @@ -414,7 +439,7 @@ if (verbose) cerr << "dsp::Dedispersion::smearing_samples = " - << int(tsmear * sampling_rate) << endl; + << int64_t(tsmear * sampling_rate) << endl; // add another ten percent, just to be sure that the pollution due // to the cyclical convolution effect is minimized diff -Nru bl-dspsr-0+git20160405/Signal/General/Detection.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/Detection.C --- bl-dspsr-0+git20160405/Signal/General/Detection.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Detection.C 2018-03-12 23:02:35.000000000 +0000 @@ -106,9 +106,6 @@ return; } - if (input->get_ndat() == 0) - return; - if (!inplace) resize_output (); diff -Nru bl-dspsr-0+git20160405/Signal/General/DetectionCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/DetectionCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/DetectionCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/DetectionCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -156,6 +156,9 @@ << " input.span=" << input_span << " output.span=" << output_span << endl; + if (ndat == 0) + return; + dim3 threads (128); dim3 blocks (ndat/threads.x, nchan); @@ -170,7 +173,7 @@ ndat); if (dsp::Operation::record_time || dsp::Operation::verbose) - check_error ("CUDA::DetectionEngine::polarimetry"); + check_error_stream 
("CUDA::DetectionEngine::polarimetry", stream); } // dubiuous about the correctness here... TODO AJ diff -Nru bl-dspsr-0+git20160405/Signal/General/digifits.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/digifits.C --- bl-dspsr-0+git20160405/Signal/General/digifits.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/digifits.C 2018-03-12 23:02:35.000000000 +0000 @@ -87,6 +87,11 @@ arg = menu.add (config->block_size, 'B', "MB"); arg->set_help ("block size in megabytes"); + string ram_limit; + arg = menu.add (ram_limit, 'U', "MB"); + arg->set_help ("upper limit on RAM usage"); + arg->set_long_help ("specify the floating point number of megabytes; e.g. -U 256 \n"); + //arg = menu.add (&config->filterbank, // &dsp::Filterbank::Config::set_freq_res, // 'x', "nfft"); @@ -123,6 +128,9 @@ arg = menu.add (config->nsblk, "nsblk", "N"); arg->set_help ("output block size in samples (default=2048)"); + arg = menu.add (config->integration_length, 'L', "seconds"); + arg->set_help ("set maximum file length"); + arg = menu.add (config->dedisperse, 'K'); arg->set_help ("remove inter-channel dispersion delays"); @@ -138,6 +146,12 @@ menu.parse (argc, argv); + if (!ram_limit.empty()) + { + double MB = fromstring (ram_limit); + config->set_maximum_RAM (uint64_t( MB * 1024.0 * 1024.0 )); + } + //if (revert) // config->order = dsp::TimeSeries::OrderFPT; } diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ACFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ACFilterbank.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ACFilterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ACFilterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ACFilterbank.h,v $ - $Revision: 1.5 $ - $Date: 2006/07/09 13:27:11 $ - $Author: wvanstra $ */ +// 
dspsr/Signal/General/dsp/ACFilterbank.h #ifndef __ACFilterbank_h #define __ACFilterbank_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/AutoCorrelation.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/AutoCorrelation.h --- bl-dspsr-0+git20160405/Signal/General/dsp/AutoCorrelation.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/AutoCorrelation.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/AutoCorrelation.h,v $ - $Revision: 1.2 $ - $Date: 2006/07/09 13:27:11 $ - $Author: wvanstra $ */ +// dspsr/Signal/General/dsp/AutoCorrelation.h #ifndef __AutoCorrelation_h #define __AutoCorrelation_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/BandpassMonitor.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BandpassMonitor.h --- bl-dspsr-0+git20160405/Signal/General/dsp/BandpassMonitor.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BandpassMonitor.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/BandpassMonitor.h,v $ - $Revision: 1.6 $ - $Date: 2008/10/03 05:42:40 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/BandpassMonitor.h #ifndef __baseband_dsp_BandpassMonitor_h #define __baseband_dsp_BandpassMonitor_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/BitStatsPlotter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BitStatsPlotter.h --- bl-dspsr-0+git20160405/Signal/General/dsp/BitStatsPlotter.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/BitStatsPlotter.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: 
/cvsroot/dspsr/dspsr/Signal/General/dsp/BitStatsPlotter.h,v $ - $Revision: 1.5 $ - $Date: 2008/07/13 00:38:54 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/BitStatsPlotter.h #ifndef __BitStatsPlotter_h #define __BitStatsPlotter_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDACallbacks.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDACallbacks.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDACallbacks.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDACallbacks.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,30 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_Convolution_Callbacks_h +#define __baseband_cuda_Convolution_Callbacks_h + +#include +#include + +#if HAVE_CUFFT_CALLBACKS + + void setup_callbacks_ConvolutionCUDA (cufftHandle plan_fwd, cufftHandle plan_bwd, + cufftHandle plan_fwd_batch, cufftHandle plan_bwd_batch, + cufftComplex * d_kernels, int nbatch, cudaStream_t stream); + + void setup_callbacks_conv_params (unsigned * h_ptr, unsigned h_size, cudaStream_t stream); + + void setup_callbacks_ConvolutionCUDASpectral (cufftHandle plan_fwd, cufftHandle plan_bwd, + cufftComplex * d_kernels, cudaStream_t stream); + + void setup_callbacks_conv_params_spectral (unsigned * h_ptr, unsigned h_size, cudaStream_t stream); + +#endif + +#endif //__baseband_cuda_Convolution_Callbacks_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDA.h 
2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,110 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_Convolution_h +#define __baseband_cuda_Convolution_h + +#include +#include + +#include "dsp/Convolution.h" +#include "dsp/LaunchConfig.h" + +namespace CUDA +{ + class ConvolutionEngine : public dsp::Convolution::Engine + { + public: + + //! Default Constructor + ConvolutionEngine (cudaStream_t stream); + ~ConvolutionEngine(); + + void set_scratch (void * scratch); + + //! prepare the required attributes for the engine + void prepare (dsp::Convolution * convolution); + + //! setup the dedispersion kernel from the response + void setup_kernel (const dsp::Response * response); + + //! configure the singular FFTs + void setup_singular (); + + //! configure the batched FFTs + void setup_batched (unsigned nbatch); + +#if HAVE_CUFFT_CALLBACKS + //! 
setup FFT callbacks + //void setup_callbacks (); +#endif + + void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output, + unsigned npart); + + protected: + + void perform_complex (const dsp::TimeSeries* input, dsp::TimeSeries * output, + unsigned npart); + + void perform_real (const dsp::TimeSeries* input, dsp::TimeSeries * output, + unsigned npart); + + cudaStream_t stream; + + LaunchConfig1D mp; + + cufftType type_fwd; + + cufftHandle plan_fwd; + + cufftHandle plan_bwd; + + cufftHandle plan_fwd_batched; + + cufftHandle plan_bwd_batched; + + size_t kernel_size; + + // dedispersion kernel for all input channels in device memory + cufftComplex * d_kernels; + + // device scratch memory + cufftComplex * d_scratch; + + cufftComplex * buf; + + void * work_area; + + size_t work_area_size; + + int auto_allocate; + + int npt_fwd; + + int npt_bwd; + + int nbatch; + + unsigned nsamp_overlap; + + unsigned nsamp_step; + + unsigned nfilt_pos; + + unsigned nfilt_neg; + +#if HAVE_CUFFT_CALLBACKS + unsigned h_conv_params[5]; +#endif + + }; +} + +#endif + diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDASpectral.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDASpectral.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ConvolutionCUDASpectral.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ConvolutionCUDASpectral.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,115 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_ConvolutionSpectral_h +#define __baseband_cuda_ConvolutionSpectral_h + +#include +#include + +#include "dsp/Convolution.h" +#include "dsp/LaunchConfig.h" + +namespace CUDA +{ + class ConvolutionEngineSpectral : public 
dsp::Convolution::Engine + { + public: + + //! Default Constructor + ConvolutionEngineSpectral (cudaStream_t stream); + ~ConvolutionEngineSpectral(); + + void regenerate_plans(); + + void set_scratch (void * scratch); + + //! prepare the required attributes for the engine + void prepare (dsp::Convolution * convolution); + + //! setup the dedispersion kernel from the response + void setup_kernel (const dsp::Response * response); + + //! configure batched FFT + void setup_batched (const dsp::TimeSeries* input, dsp::TimeSeries * output); + +#if HAVE_CUFFT_CALLBACKS + //! setup FFT callbacks + void setup_callbacks (); +#endif + + void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output, + unsigned npart); + + protected: + + void perform_complex (const dsp::TimeSeries* input, dsp::TimeSeries * output, + unsigned npart); + + void perform_real (const dsp::TimeSeries* input, dsp::TimeSeries * output, + unsigned npart); + + cudaStream_t stream; + + LaunchConfig1D mp; + + cufftType type_fwd; + + cufftHandle plan_fwd; + + cufftHandle plan_bwd; + + size_t kernel_size; + + // dedispersion kernel for all input channels in device memory + cufftComplex * d_kernels; + + // device scratch memory + cufftComplex * d_scratch; + + cufftComplex * buf; + + void * work_area; + + size_t work_area_size; + + int auto_allocate; + + int nchan; + + int npol; + + bool fft_configured; + + uint64_t input_stride; + + uint64_t output_stride; + + int npt_fwd; + + int npt_bwd; + + int nbatch; + + unsigned nsamp_overlap; + + unsigned nsamp_step; + + unsigned nfilt_pos; + + unsigned nfilt_neg; + +#if HAVE_CUFFT_CALLBACKS + unsigned h_conv_params[2]; +#endif + + }; +} + +#endif + diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Convolution.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Convolution.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Convolution.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Convolution.h 2018-03-12 
23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Convolution.h,v $ - $Revision: 1.26 $ - $Date: 2011/08/04 21:06:30 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Convolution.h #ifndef __Convolution_h #define __Convolution_h @@ -58,8 +55,7 @@ public: //! Null constructor - Convolution (const char* name = "Convolution", - Behaviour type = anyplace); + Convolution (const char* name = "Convolution", Behaviour type = outofplace); //! Destructor virtual ~Convolution (); @@ -103,6 +99,14 @@ //! Return a pointer to the integrated passband virtual const Response* get_passband() const; + //! Set the memory allocator to be used + void set_device (Memory *); + + //! Engine used to perform discrete convolution step + class Engine; + + void set_engine (Engine*); + protected: //! Perform the convolution transformation on the input TimeSeries @@ -126,6 +130,8 @@ friend class TFPFilterbank; friend class SKFilterbank; + Reference::To memory; + unsigned nfilt_tot; unsigned nfilt_pos; unsigned nfilt_neg; @@ -144,7 +150,22 @@ unsigned scratch_needed; uint64_t npart; unsigned n_fft; + + //! Interface to alternate processing engine (e.g. 
GPU) + Reference::To engine; }; + + class Convolution::Engine : public Reference::Able + { + public: + + virtual void set_scratch (void *) = 0; + + virtual void prepare (dsp::Convolution * convolution) = 0; + + virtual void perform (const TimeSeries* in, TimeSeries* out, unsigned npart) = 0; + }; + } diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Dedispersion.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Dedispersion.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Dedispersion.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Dedispersion.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Dedispersion.h,v $ - $Revision: 1.29 $ - $Date: 2010/04/11 05:21:43 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Dedispersion.h #ifndef __Dedispersion_h #define __Dedispersion_h @@ -183,6 +180,10 @@ //! Flag that the response and bandpass attributes reflect the state bool built; + //! Supported frequency channels + /*! Set to false when the dispersive smearing is too large */ + std::vector supported_channels; + //! 
Return the effective smearing time in seconds (worker function) double smearing_time (int half) const; diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/DedispersionSampleDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DedispersionSampleDelay.h --- bl-dspsr-0+git20160405/Signal/General/dsp/DedispersionSampleDelay.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DedispersionSampleDelay.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/DedispersionSampleDelay.h,v $ - $Revision: 1.2 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/DedispersionSampleDelay.h #ifndef __Dedispersion_SampleDelay_h #define __Dedispersion_SampleDelay_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/DetectionCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DetectionCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/DetectionCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/DetectionCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/DetectionCUDA.h,v $ - $Revision: 1.3 $ - $Date: 2010/06/01 10:46:29 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/DetectionCUDA.h #ifndef __baseband_cuda_Detection_h #define __baseband_cuda_Detection_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Detection.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Detection.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Detection.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Detection.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* 
$Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Detection.h,v $ - $Revision: 1.20 $ - $Date: 2010/06/01 09:12:18 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Detection.h #ifndef __Detection_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Example.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Example.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Example.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Example.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Example.h,v $ - $Revision: 1.3 $ - $Date: 2006/07/09 13:27:12 $ - $Author: wvanstra $ */ +// dspsr/Signal/General/dsp/Example.h #ifndef __Example_h #define __Example_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ExcisionStatsPlotter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ExcisionStatsPlotter.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ExcisionStatsPlotter.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ExcisionStatsPlotter.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ExcisionStatsPlotter.h,v $ - $Revision: 1.1 $ - $Date: 2008/07/16 07:00:16 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/ExcisionStatsPlotter.h #ifndef __ExcisionStatsPlotter_h #define __ExcisionStatsPlotter_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankConfig.h --- bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankConfig.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankConfig.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FilterbankConfig.h,v $ - $Revision: 1.2 $ - $Date: 2011/07/15 04:18:11 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/FilterbankConfig.h #ifndef __FilterbankConfig_h #define __FilterbankConfig_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_cuda.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_cuda.h --- bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_cuda.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_cuda.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,11 +6,8 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/filterbank_cuda.h,v $ - $Revision: 1.1 $ +// dspsr/Signal/General/dsp/filterbank_cuda.h - $Date: 2011/10/07 11:10:14 $ - $Author: straten $ */ #ifndef __filterbank_cuda_h #define __filterbank_cuda_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/FilterbankCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FilterbankCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -7,10 +7,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FilterbankCUDA.h,v $ - $Revision: 1.17 $ - $Date: 2011/10/07 11:10:14 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/FilterbankCUDA.h #ifndef __FilterbankCUDA_h #define __FilterbankCUDA_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_engine.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_engine.h --- bl-dspsr-0+git20160405/Signal/General/dsp/filterbank_engine.h 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/filterbank_engine.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/filterbank_engine.h,v $ - $Revision: 1.1 $ - $Date: 2011/10/07 11:01:50 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/filterbank_engine.h #ifndef __filterbank_engine_h #define __filterbank_engine_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Filterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Filterbank.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Filterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Filterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Filterbank.h,v $ - $Revision: 1.26 $ - $Date: 2011/10/07 11:01:50 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Filterbank.h #ifndef __Filterbank_h #define __Filterbank_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/FourthMoment.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FourthMoment.h --- bl-dspsr-0+git20160405/Signal/General/dsp/FourthMoment.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/FourthMoment.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/FourthMoment.h,v $ - $Revision: 1.2 $ - $Date: 2009/06/08 19:45:01 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/FourthMoment.h #ifndef __FourthMoment_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/GeometricDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/GeometricDelay.h --- bl-dspsr-0+git20160405/Signal/General/dsp/GeometricDelay.h 2018-03-12 
08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/GeometricDelay.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/GeometricDelay.h,v $ - $Revision: 1.2 $ - $Date: 2010/06/27 10:56:21 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/GeometricDelay.h #ifndef __Geometric_SampleDelay_h #define __Geometric_SampleDelay_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LaunchConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LaunchConfig.h --- bl-dspsr-0+git20160405/Signal/General/dsp/LaunchConfig.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LaunchConfig.h 2018-03-12 23:02:35.000000000 +0000 @@ -7,6 +7,9 @@ * ***************************************************************************/ +#ifndef __LaunchConfig_h +#define __LaunchConfig_h + #include namespace CUDA @@ -24,6 +27,10 @@ //! 
gets the current device ID and calls cudaGetDeviceProperties void init (); + + size_t get_max_threads_per_block (); + + size_t get_max_shm (); }; @@ -54,3 +61,5 @@ unsigned get_nthread() { return nthread; } }; } + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LevelMonitor.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LevelMonitor.h --- bl-dspsr-0+git20160405/Signal/General/dsp/LevelMonitor.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LevelMonitor.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/LevelMonitor.h,v $ - $Revision: 1.9 $ - $Date: 2010/03/22 06:06:58 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/LevelMonitor.h #ifndef __LevelMonitor_h #define __LevelMonitor_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFil.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFil.h --- bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFil.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFil.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/LoadToFil.h,v $ - $Revision: 1.4 $ - $Date: 2011/12/21 06:02:20 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/LoadToFil.h #ifndef __dspsr_LoadToFil_h #define __dspsr_LoadToFil_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFITS.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFITS.h --- bl-dspsr-0+git20160405/Signal/General/dsp/LoadToFITS.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/LoadToFITS.h 2018-03-12 23:02:35.000000000 +0000 @@ -70,6 +70,10 @@ // Sets default values Config (); + // set block_size to result in at least this much RAM 
usage + void set_maximum_RAM (uint64_t); + uint64_t get_maximum_RAM () const { return maximum_RAM; } + // input data block size in MB double block_size; @@ -105,6 +109,9 @@ //! hold offset and scale constant after first update bool rescale_constant; + + //! set maximum length for a file + double integration_length; //! number of bits used to re-digitize the floating point time series int nbits; diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/MultiThread.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/MultiThread.h --- bl-dspsr-0+git20160405/Signal/General/dsp/MultiThread.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/MultiThread.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/MultiThread.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/24 22:16:04 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/MultiThread.h #ifndef __dspsr_MultiThread_h #define __dspsr_MultiThread_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/on_host.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/on_host.h --- bl-dspsr-0+git20160405/Signal/General/dsp/on_host.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/on_host.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/on_host.h,v $ - $Revision: 1.4 $ - $Date: 2010/04/24 14:13:38 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/on_host.h #ifndef __on_host_h #define __on_host_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFFT.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFFT.h --- bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFFT.h 2018-03-12 08:31:57.000000000 +0000 +++ 
bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFFT.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/OptimalFFT.h,v $ - $Revision: 1.3 $ - $Date: 2010/05/18 15:39:58 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/OptimalFFT.h #ifndef __OptimalFFT_h #define __OptimalFFT_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFilterbank.h --- bl-dspsr-0+git20160405/Signal/General/dsp/OptimalFilterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/OptimalFilterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/OptimalFilterbank.h,v $ - $Revision: 1.1 $ - $Date: 2010/05/18 15:23:17 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/OptimalFilterbank.h #ifndef __OptimalFilterbank_h #define __OptimalFilterbank_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Pipeline.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Pipeline.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Pipeline.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Pipeline.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Pipeline.h,v $ - $Revision: 1.1 $ - $Date: 2011/08/23 20:55:19 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Pipeline.h #ifndef __dspsr_Pipeline_h #define __dspsr_Pipeline_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelectCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelectCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelectCUDA.h 
1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelectCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,47 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_PolnSelect_h +#define __baseband_cuda_PolnSelect_h + +#include "dsp/PolnSelect.h" +#include "dsp/LaunchConfig.h" + +namespace CUDA +{ + class PolnSelectEngine : public dsp::PolnSelect::Engine + { + public: + + //! Default Constructor + PolnSelectEngine (cudaStream_t stream); + + ~PolnSelectEngine (); + + void setup (); + + void fpt_polnselect (int ipol, + const dsp::TimeSeries* in, + dsp::TimeSeries* out); + + void tfp_polnselect (int ipol, + const dsp::TimeSeries* in, + dsp::TimeSeries* out); + + protected: + + cudaStream_t stream; + + //! gpu configuration + LaunchConfig gpu_config; + + }; +} + +#endif + diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelect.h --- bl-dspsr-0+git20160405/Signal/General/dsp/PolnSelect.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PolnSelect.h 2018-03-12 23:02:35.000000000 +0000 @@ -34,11 +34,32 @@ // Get the currently selected poln index int get_ipol () const { return ipol_keep; } + class Engine; + + void set_engine (Engine*); + protected: //! 
The polarization to keep int ipol_keep; }; + + class PolnSelect::Engine : public OwnStream + { + public: + + virtual void setup () = 0; + + virtual void fpt_polnselect (int ipol, + const dsp::TimeSeries * in, + dsp::TimeSeries * out) = 0; + + virtual void tfp_polnselect (int ipol, + const dsp::TimeSeries* in, + dsp::TimeSeries* out) = 0; + + }; + } #endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PScrunchCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunchCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/PScrunchCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunchCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,45 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_PScrunch_h +#define __baseband_cuda_PScrunch_h + +#include "dsp/PScrunch.h" +#include "dsp/LaunchConfig.h" + +namespace CUDA +{ + class PScrunchEngine : public dsp::PScrunch::Engine + { + public: + + //! Default Constructor + PScrunchEngine (cudaStream_t stream); + + ~PScrunchEngine (); + + void setup (); + + void fpt_pscrunch (const dsp::TimeSeries* in, + dsp::TimeSeries* out); + + void tfp_pscrunch (const dsp::TimeSeries* in, + dsp::TimeSeries* out); + + protected: + + cudaStream_t stream; + + //! 
gpu configuration + LaunchConfig gpu_config; + + }; +} + +#endif + diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/PScrunch.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunch.h --- bl-dspsr-0+git20160405/Signal/General/dsp/PScrunch.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/PScrunch.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/PScrunch.h,v $ - $Revision: 1.1 $ - $Date: 2008/07/01 12:23:21 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/PScrunch.h #ifndef __baseband_dsp_PScrunch_h #define __baseband_dsp_PScrunch_h @@ -32,7 +29,31 @@ //! PScrunch to zero mean and unit variance void transformation (); + + class Engine; + + void set_engine (Engine*); + + protected: + + Reference::To engine; + }; + + class PScrunch::Engine : public OwnStream + { + public: + + virtual void setup () = 0; + + virtual void fpt_pscrunch (const dsp::TimeSeries * in, + dsp::TimeSeries * out) = 0; + + virtual void tfp_pscrunch (const dsp::TimeSeries* in, + dsp::TimeSeries* out) = 0; + + }; + } #endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Rescale.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Rescale.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Rescale.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Rescale.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Rescale.h,v $ - $Revision: 1.9 $ - $Date: 2010/02/02 11:18:41 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Rescale.h #ifndef __baseband_dsp_Rescale_h #define __baseband_dsp_Rescale_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Response.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Response.h --- 
bl-dspsr-0+git20160405/Signal/General/dsp/Response.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Response.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Response.h,v $ - $Revision: 1.33 $ - $Date: 2011/01/06 05:16:55 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Response.h #ifndef __Response_h #define __Response_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/ResponseProduct.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ResponseProduct.h --- bl-dspsr-0+git20160405/Signal/General/dsp/ResponseProduct.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/ResponseProduct.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/ResponseProduct.h,v $ - $Revision: 1.6 $ - $Date: 2009/06/12 06:18:56 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/ResponseProduct.h #ifndef __ResponseProduct_h #define __ResponseProduct_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/RFIFilter.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/RFIFilter.h --- bl-dspsr-0+git20160405/Signal/General/dsp/RFIFilter.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/RFIFilter.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/RFIFilter.h,v $ - $Revision: 1.4 $ - $Date: 2009/06/17 10:16:54 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/RFIFilter.h #ifndef __RFIFilter_h #define __RFIFilter_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelayFunction.h 
bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelayFunction.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelayFunction.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelayFunction.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SampleDelayFunction.h,v $ - $Revision: 1.6 $ - $Date: 2010/06/27 10:56:25 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/SampleDelayFunction.h #ifndef __baseband_dsp_SampleDelayFunction_h #define __baseband_dsp_SampleDelayFunction_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelay.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelay.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SampleDelay.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SampleDelay.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SampleDelay.h,v $ - $Revision: 1.5 $ - $Date: 2010/05/21 07:29:37 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/SampleDelay.h #ifndef __baseband_dsp_SampleDelay_h #define __baseband_dsp_SampleDelay_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/Shape.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Shape.h --- bl-dspsr-0+git20160405/Signal/General/dsp/Shape.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/Shape.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/Shape.h,v $ - $Revision: 1.12 $ - $Date: 2009/06/07 01:22:34 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/Shape.h #ifndef __Shape_h #define __Shape_h diff 
-Nru bl-dspsr-0+git20160405/Signal/General/dsp/SingleThread.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SingleThread.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SingleThread.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SingleThread.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SingleThread.h,v $ - $Revision: 1.7 $ - $Date: 2012/01/19 21:46:17 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/SingleThread.h #ifndef __dspsr_SingleThread_h #define __dspsr_SingleThread_h @@ -151,6 +148,7 @@ Reference::To device_memory; void* gpu_stream; + int gpu_device; }; diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKComputerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputerCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKComputerCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,50 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_SKComputer_h +#define __baseband_cuda_SKComputer_h + +#include "dsp/SKComputer.h" +#include "dsp/MemoryCUDA.h" + +namespace CUDA +{ + class Memory; + + class SKComputerEngine : public dsp::SKComputer::Engine + { + public: + + //! 
Default Constructor + SKComputerEngine (dsp::Memory * memory); + + void setup (); + + void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output, + dsp::TimeSeries *output_tscr, unsigned tscrunch); + + void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* output, + unsigned tscrunch); + + protected: + + DeviceMemory * device_memory; + + cudaStream_t stream; + + // device work buffer + float * work_buffer; + + size_t work_buffer_size; + + int max_threads_per_block; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKComputer.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputer.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKComputer.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKComputer.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,56 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/TimeSeries.h" +#include "dsp/Transformation.h" + +#ifndef __SKComputer_h +#define __SKComputer_h + +namespace dsp { + + class SKComputer: public Transformation { + + public: + + //! Null constructor + SKComputer (); + + ~SKComputer(); + + //! Engine used to perform discrete convolution step + class Engine; + + void set_engine (Engine*); + + protected: + + //! Perform the transformation on the input time series + void transformation (); + + //! Interface to alternate processing engine (e.g. 
GPU) + Reference::To engine; + + }; + + class SKComputer::Engine : public Reference::Able + { + public: + Engine () {} + + virtual void setup () = 0; + + virtual void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output, + dsp::TimeSeries *output_tscr, unsigned tscrunch) = 0; + + virtual void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* output, + unsigned tscrunch) = 0; + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKDetectorCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetectorCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKDetectorCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetectorCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,76 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_SKDetector_h +#define __baseband_cuda_SKDetector_h + +#include "dsp/SKDetector.h" +#include "dsp/MemoryCUDA.h" + +#include "dsp/TransferCUDA.h" +#include "dsp/TransferBitSeriesCUDA.h" + +namespace CUDA +{ + class SKDetectorEngine : public dsp::SKDetector::Engine + { + public: + + //! 
Default Constructor + SKDetectorEngine (dsp::Memory * memory); + + void setup (); + + void reset_mask (dsp::BitSeries* output); + + void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output, + float upper_thresh, float lower_thresh); + + void detect_fscr(const dsp::TimeSeries* input, dsp::BitSeries* output, + const float lower, const float upper, + unsigned schan, unsigned echan); + + void detect_tscr (const dsp::TimeSeries* input, const dsp::TimeSeries* input_tscr, dsp::BitSeries* output, + float upper_thresh, float lower_thresh); + + int count_mask (const dsp::BitSeries* output); + + float * get_estimates (const dsp::TimeSeries* input); + + unsigned char * get_zapmask (const dsp::BitSeries* input); + + + protected: + + DeviceMemory * device_memory; + + cudaStream_t stream; + + unsigned nchan; + + unsigned npol; + + //! DDFB span, i.e. n floats between channels from raw base ptr + unsigned span; + + int max_threads_per_block; + + PinnedMemory * pinned_memory; + + dsp::TimeSeries * estimates_host; + + dsp::BitSeries * zapmask_host; + + dsp::TransferCUDA* transfer_estimates; + + dsp::TransferBitSeriesCUDA* transfer_zapmask; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKDetector.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetector.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKDetector.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKDetector.h 2018-03-12 23:02:35.000000000 +0000 @@ -67,6 +67,11 @@ //! The arrays will be reset when count_zapped is next called void reset_count () { unfiltered_hits = 0; } + //! Engine used to perform detection + class Engine; + + void set_engine (Engine*); + protected: //! Reserve the required amount of output space required @@ -75,6 +80,9 @@ //! Perform the transformation on the input time series void transformation (); + //! Interface to alternate processing engine (e.g. 
GPU) + Reference::To engine; + void reset_mask (); void detect_tscr (); @@ -99,7 +107,7 @@ //! Tsrunched SK statistic timeseries for the current block Reference::To input_tscr; - + //! Number of time samples integrated into tscr SK estimates unsigned tscr_M; @@ -149,6 +157,35 @@ }; + class SKDetector::Engine : public Reference::Able + { + public: + + virtual void setup () = 0; + + virtual void reset_mask (dsp::BitSeries* output) = 0; + + virtual void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output, + float upper_thresh, float lower_thresh) = 0; + + virtual void detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output, + const float lower, const float upper, + unsigned s_chan, unsigned e_chan) = 0; + + virtual void detect_tscr (const dsp::TimeSeries* input, + const dsp::TimeSeries * input_tscr, + dsp::BitSeries* output, + float upper_thresh, float lower_thresh) = 0; + + virtual int count_mask (const dsp::BitSeries* output) = 0; + + virtual float * get_estimates (const TimeSeries* input) = 0; + + virtual unsigned char * get_zapmask (const BitSeries* input) = 0; + + }; + + } #endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbankCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbankCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbankCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbankCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,75 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_SKFilterbank_h +#define __baseband_cuda_SKFilterbank_h + +#include +#include + +#include "dsp/SKFilterbank.h" +#include "dsp/MemoryCUDA.h" +#include "dsp/LaunchConfig.h" + +namespace CUDA +{ + class 
SKFilterbankEngine : public dsp::SKFilterbank::Engine + { + public: + + //! Default Constructor + SKFilterbankEngine (dsp::Memory * _memory, unsigned _tscrunch); + + ~SKFilterbankEngine(); + + void setup (); + + void prepare (const dsp::TimeSeries* input, unsigned _nfft); + + void perform (const dsp::TimeSeries* input, + dsp::TimeSeries* output, + dsp::TimeSeries* output_tscr); + + protected: + + DeviceMemory * memory; + + void fft_real (cufftReal *in, cufftComplex * out); + + void fft_complex (cufftComplex *in, cufftComplex * out); + + cudaStream_t stream; + + cufftType type; + + cufftHandle plan; + + void * buffer; + + size_t buffer_size; + + void * sums; + + size_t sums_size; + + int nchan; + + int npol; + + int npt; + + int nbatch; + + int tscrunch; + + int max_threads_per_block; + }; +} + +#endif + diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbank.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKFilterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKFilterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/SKFilterbank.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:06:12 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/SKFilterbank.h #ifndef __SKFilterbank_h #define __SKFilterbank_h @@ -27,7 +24,7 @@ public: //! Null constructor - SKFilterbank ( unsigned _n_threads ); + SKFilterbank ( unsigned _n_threads=1 ); ~SKFilterbank (); //! 
Engine used to perform discrete convolution step @@ -107,7 +104,14 @@ class SKFilterbank::Engine : public Reference::Able { public: - Engine () {} + + virtual void setup () = 0; + + virtual void prepare (const dsp::TimeSeries* input, unsigned _nfft) = 0; + + virtual void perform (const dsp::TimeSeries* input, dsp::TimeSeries* output, + dsp::TimeSeries *output_tscr) = 0; + }; } diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKMaskerCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMaskerCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKMaskerCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMaskerCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -10,6 +10,7 @@ #define __baseband_cuda_SKMasker_h #include "dsp/SKMasker.h" +#include "dsp/MemoryCUDA.h" namespace CUDA { @@ -18,22 +19,18 @@ public: //! Default Constructor - SKMaskerEngine (cudaStream_t stream); + SKMaskerEngine (dsp::Memory * memory); - void setup (unsigned nchan, unsigned npol, unsigned span); + void setup (); - void perform (dsp::BitSeries* mask, unsigned mask_offset, dsp::TimeSeries* out, - unsigned offset, unsigned end); + void perform (dsp::BitSeries* mask, const dsp::TimeSeries* input, + dsp::TimeSeries* out, unsigned M); protected: - cudaStream_t stream; - - unsigned nchan; - unsigned npol; + DeviceMemory * device_memory; - //! DDFB span, i.e. 
n floats between channels from raw base ptr - unsigned span; + cudaStream_t stream; int max_threads_per_block; diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SKMasker.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMasker.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SKMasker.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SKMasker.h 2018-03-12 23:02:35.000000000 +0000 @@ -74,9 +74,9 @@ class SKMasker::Engine : public Reference::Able { public: - virtual void setup (unsigned nchan, unsigned npol, unsigned span) = 0; + virtual void setup () = 0; - virtual void perform (BitSeries* mask, unsigned mask_offset, TimeSeries* out, unsigned offset, unsigned end) = 0; + virtual void perform (BitSeries* mask, const TimeSeries* in, TimeSeries* out, unsigned M) = 0; }; diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosisCUDA.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosisCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosisCUDA.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosisCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,82 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __baseband_cuda_SpectralKurtosis_h +#define __baseband_cuda_SpectralKurtosis_h + +#include "dsp/SpectralKurtosis.h" + +#include "dsp/MemoryCUDA.h" +#include "dsp/SKComputerCUDA.h" +#include "dsp/SKDetectorCUDA.h" +#include "dsp/SKMaskerCUDA.h" + +#include "dsp/TransferCUDA.h" +#include "dsp/TransferBitSeriesCUDA.h" + +namespace CUDA +{ + + class SpectralKurtosisEngine : public dsp::SpectralKurtosis::Engine + { + public: + + //! 
Default Constructor + SpectralKurtosisEngine (dsp::Memory * memory); + + void setup (); + + void compute (const dsp::TimeSeries* input, dsp::TimeSeries* output, + dsp::TimeSeries *output_tscr, unsigned tscrunch); + + void reset_mask (dsp::BitSeries* output); + + void detect_ft (const dsp::TimeSeries* input, dsp::BitSeries* output, + float upper_thresh, float lower_thresh); + + void detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output, + const float lower, const float upper, + unsigned schan, unsigned echan); + + void detect_tscr (const dsp::TimeSeries* input, + const dsp::TimeSeries * input_tscr, + dsp::BitSeries* output, + float upper, float lower); + + int count_mask (const dsp::BitSeries* output); + + float * get_estimates (const dsp::TimeSeries * estimates); + + unsigned char * get_zapmask (const dsp::BitSeries * zapmask); + + void mask (dsp::BitSeries* mask, const dsp::TimeSeries *in, dsp::TimeSeries* out, unsigned M); + + void insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* out, unsigned M); + + protected: + + DeviceMemory * device_memory; + + cudaStream_t stream; + + SKComputerEngine * computer; + + SKDetectorEngine * detector; + + SKMaskerEngine * masker; + + float * work_buffer; + + size_t work_buffer_size; + + int max_threads_per_block; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosis.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosis.h --- bl-dspsr-0+git20160405/Signal/General/dsp/SpectralKurtosis.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/SpectralKurtosis.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,215 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2016 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/Transformation.h" 
+#include "dsp/TimeSeries.h" +#include "dsp/BitSeries.h" +#include "dsp/Memory.h" + +#ifndef __SpectralKurtosis_h +#define __SpectralKurtosis_h + +#define ZAP_ALL 0 +#define ZAP_SKFB 1 +#define ZAP_FSCR 2 +#define ZAP_TSCR 3 + +namespace dsp { + + //! Perform Spectral Kurtosis on Input Timeseries, creating output Time Series + /*! Output will be in time, frequency, polarization order */ + + class SpectralKurtosis: public Transformation { + + public: + + //! Default constructor + SpectralKurtosis (); + + //! Destructor + ~SpectralKurtosis (); + + bool get_order_supported (TimeSeries::Order order) const; + + void set_M (unsigned _M) { M = _M; } + + //! Set the RFI thresholds with the specified factor + void set_thresholds (unsigned _M, unsigned _std_devs); + + //! Set the channel range to conduct detection + void set_channel_range (unsigned start, unsigned end); + + //! Set various options for Specral Kurtosis + void set_options (bool _disable_fscr, bool _disable_tscr, bool _disable_ft); + + void reserve (); + + void prepare (); + + void prepare_output (); + + //! The number of time samples used to calculate the SK statistic + unsigned get_M () const { return M; } + + //! The excision threshold in number of standard deviations + unsigned get_excision_threshold () const { return std_devs; } + + //! Total SK statistic for each poln/channel, post filtering + void get_filtered_sum (std::vector& sum) const + { sum = filtered_sum; } + + //! Hits on filtered average for each channel + void get_filtered_hits (std::vector& hits) const + { hits = filtered_hits; } + + //! Total SK statistic for each poln/channel, before filtering + void get_unfiltered_sum (std::vector& sum) const + { sum = unfiltered_sum; } + + //! Hits on unfiltered SK statistic, same for each channel + uint64_t get_unfiltered_hits () const { return unfiltered_hits; } + + //! The arrays will be reset when count_zapped is next called + void reset_count () { unfiltered_hits = 0; } + + + //! 
Engine used to perform discrete convolution step + class Engine; + + void set_engine (Engine*); + + protected: + + //! Perform the transformation on the input time series + void transformation (); + + //! Interface to alternate processing engine (e.g. GPU) + Reference::To engine; + + private: + + void compute (); + + void detect (); + void detect_tscr (); + void detect_skfb (); + void detect_fscr (); + void count_zapped (); + + void mask (); + void reset_mask (); + + void insertsk (); + + unsigned debugd; + + //! number of samples used in each SK estimate + unsigned M; + + unsigned nchan; + + unsigned npol; + + unsigned ndim; + + uint64_t npart; + + uint64_t output_ndat; + + //! SK Estimates + Reference::To estimates; + + //! Tscrunched SK Estimate for block + Reference::To estimates_tscr; + + //! Zap mask + Reference::To zapmask; + + //! accumulation arrays for S1 and S2 in t scrunch + std::vector S1_tscr; + std::vector S2_tscr; + + //! Total SK statistic for each poln/channel, post filtering + std::vector filtered_sum; + + //! Hits on filtered average for each channel + std::vector filtered_hits; + + //! Total SK statistic for each poln/channel, before filtering + std::vector unfiltered_sum; + + //! Hits on unfiltered SK statistic, same for each channel + uint64_t unfiltered_hits; + + //! number of std devs used to calculate excision limits + unsigned std_devs; + + //! lower and upper thresholds of excision limits + std::vector thresholds; + + float one_sigma; + + //! Number of samples integrated into tscr + unsigned M_tscr; + + //! exicision thresholds for tscr + std::vector thresholds_tscr; + + //! channel range to compute and apply SK excisions + std::vector channels; + + //! samples zapped by type [0:all, 1:sk, 2:fscr, 3:tscr] + std::vector zap_counts; + + //! total number of samples processed + uint64_t npart_total; + + //! 
flags for detection types [0:fscr, 1:tscr, 2:tscr] + std::vector detection_flags; + + bool prepared; + + }; + + class SpectralKurtosis::Engine : public Reference::Able + { + public: + + virtual void setup () = 0; + + virtual void compute (const TimeSeries* input, TimeSeries* output, + TimeSeries *output_tscr, unsigned tscrunch) = 0; + + virtual void reset_mask (BitSeries* output) = 0; + + virtual void detect_ft (const TimeSeries* input, BitSeries* output, + float upper_thresh, float lower_thresh) = 0; + + virtual void detect_fscr (const TimeSeries* input, BitSeries* output, + const float lower, const float upper, + unsigned schan, unsigned echan) = 0; + + virtual void detect_tscr (const TimeSeries* input, + const TimeSeries * input_tscr, + BitSeries* output, + float upper, float lower) = 0; + + virtual int count_mask (const BitSeries* output) = 0; + + virtual float * get_estimates (const TimeSeries* input) = 0; + + virtual unsigned char * get_zapmask (const BitSeries* input) = 0; + + virtual void mask (BitSeries* mask, const TimeSeries * in, TimeSeries* out, unsigned M) = 0; + + virtual void insertsk (const TimeSeries* input, TimeSeries* out, unsigned M) = 0; + + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/TFPFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TFPFilterbank.h --- bl-dspsr-0+git20160405/Signal/General/dsp/TFPFilterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TFPFilterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/dsp/TFPFilterbank.h,v $ - $Revision: 1.3 $ - $Date: 2011/08/23 21:00:38 $ - $Author: straten $ */ +// dspsr/Signal/General/dsp/TFPFilterbank.h #ifndef __TFPFilterbank_h #define __TFPFilterbank_h diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/TScrunchCUDA.h 
bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TScrunchCUDA.h --- bl-dspsr-0+git20160405/Signal/General/dsp/TScrunchCUDA.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/TScrunchCUDA.h 2018-03-12 23:02:35.000000000 +0000 @@ -23,8 +23,8 @@ TScrunchEngine (cudaStream_t stream); void fpt_tscrunch (const dsp::TimeSeries * input, - dsp::TimeSeries * output, - unsigned sfactor); + dsp::TimeSeries * output, + unsigned sfactor); protected: diff -Nru bl-dspsr-0+git20160405/Signal/General/dsp/UnderSamplingBench.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/UnderSamplingBench.h --- bl-dspsr-0+git20160405/Signal/General/dsp/UnderSamplingBench.h 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/dsp/UnderSamplingBench.h 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,39 @@ +//-*-C++-*- +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#ifndef __UnderSamplingBench_h_ +#define __UnderSamplingBench_h_ + +#include "FTransformBench.h" + +namespace dsp { + + //! Stores UnderSampling benchmark data + class UnderSamplingBench : public FTransform::Bench + { + public: + + static bool verbose; + + //! Construct from installed benchmarks + UnderSamplingBench (const std::string& library); + + //! 
Set the number of channels + void set_nchan (unsigned); + + protected: + + unsigned nchan; + std::string library; + + void load () const; + void load (const std::string& library, const std::string& filename) const; + }; +} + +#endif diff -Nru bl-dspsr-0+git20160405/Signal/General/fftbatch_speed.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/fftbatch_speed.C --- bl-dspsr-0+git20160405/Signal/General/fftbatch_speed.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/fftbatch_speed.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,210 @@ +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include "CUFFTError.h" + +#include "CommandLine.h" +#include "RealTimer.h" + +#include +#include +#include +#include + +using namespace std; + +class Speed : public Reference::Able +{ +public: + + Speed (); + + // parse command line options + void parseOptions (int argc, char** argv); + + // run the test + void runTest (); + +protected: + + int npt; + int niter; + unsigned gpu_id; + bool cuda; +}; + + +Speed::Speed () +{ + gpu_id = 0; + niter = 16; + npt = 1024; + cuda = false; +} + +int main(int argc, char** argv) try +{ + Speed speed; + speed.parseOptions (argc, argv); + speed.runTest (); + return 0; +} +catch (Error& error) +{ + cerr << error << endl; + return -1; +} + +void Speed::parseOptions (int argc, char** argv) +{ + CommandLine::Menu menu; + CommandLine::Argument* arg; + + menu.set_help_header ("undersampling_speed - measure under sampling speed"); + menu.set_version ("undersampling_speed version 1.0"); + + arg = menu.add (npt, 'n', "npt"); + arg->set_help ("number of points in each FFT"); + + arg = menu.add (gpu_id, 'd'); + arg->set_help ("GPU device ID"); + + arg 
= menu.add (niter, 't', "ninter"); + arg->set_help ("number of iterations (batch/loops)"); + + arg = menu.add (cuda, "cuda"); + arg->set_help ("benchmark CUDA"); + + menu.parse (argc, argv); +} + +void check_error_stream (const char*, cudaStream_t); + +void Speed::runTest () +{ +#ifdef _DEBUG + dsp::Operation::verbose = true; + dsp::Observation::verbose = true; +#endif + + // assume complex FFTs + const unsigned ndim = 2; + + cudaStream_t stream = 0; + if (cuda) + { + cerr << "using GPU " << gpu_id << endl; + cudaError_t err = cudaSetDevice(gpu_id); + if (err != cudaSuccess) + throw Error (InvalidState, "undersampling_speed", + "cudaSetDevice failed: %s", cudaGetErrorString(err)); + + err = cudaStreamCreate( &stream ); + if (err != cudaSuccess) + throw Error (InvalidState, "undersampling_speed", + "cudaStreamCreate failed: %s", cudaGetErrorString(err)); + + } + + unsigned ndat = npt * niter; + unsigned nbytes = ndat * sizeof (cufftComplex); + + cufftComplex * input; + cufftComplex * output; + cufftResult result; + size_t work_size; + + cudaMalloc ((void **) &input, nbytes); + cudaMalloc ((void **) &output, nbytes); + + cudaMemsetAsync ((void *) input, 0, nbytes, stream); + cudaMemsetAsync ((void *) output, 0, nbytes, stream); + + // setup loop based FFT plan + cufftHandle plan_loop; + + result = cufftCreate (&plan_loop); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_loop)"); + + result = cufftMakePlan1d (plan_loop, npt, CUFFT_C2C, 1, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlan1D (plan_loop)"); + + result = cufftSetStream (plan_loop, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_loop)"); + + // setup batch based FFT plan + cufftHandle plan_batch; + + result = cufftCreate (&plan_batch); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftCreate(plan_batch)"); + + int rank = 
1; + result = cufftMakePlanMany (plan_batch, rank, &npt, NULL, 0, 0, NULL, 0, 0, + CUFFT_C2C, niter, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "Speed::runTest", "cufftMakePlanMany (plan_batch)"); + + result = cufftSetStream (plan_batch, stream); + if (result != CUFFT_SUCCESS) + CUFFTError (result, "Speed::runTest", "cufftSetStream (plan_batch)"); + + RealTimer timer_loop; + RealTimer timer_batch; + + cudaStreamSynchronize (stream); + + timer_loop.start (); + + for (unsigned i=0; iget_ndat() == 0) - return; - if( !input->get_detected() ) throw Error(InvalidState,"dsp::FScrunch::transformation()", "invalid input state: " + tostring(input->get_state())); diff -Nru bl-dspsr-0+git20160405/Signal/General/LaunchConfig.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LaunchConfig.C --- bl-dspsr-0+git20160405/Signal/General/LaunchConfig.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LaunchConfig.C 2018-03-12 23:02:35.000000000 +0000 @@ -17,6 +17,31 @@ cudaGetDeviceProperties (&device_properties, device); } +size_t CUDA::LaunchConfig::get_max_threads_per_block () +{ + if (device < 0) + { + throw Error (InvalidState, "CUDA::LaunchConfig::get_max_threads_per_block", + "not initialized"); + return -1; + } + else + return device_properties.maxThreadsPerBlock; +} + +size_t CUDA::LaunchConfig::get_max_shm () +{ + if (device < 0) + { + throw Error (InvalidState, "CUDA::LaunchConfig::get_max_shm", + "not initialized"); + return -1; + } + else + return device_properties.sharedMemPerBlock; +} + + void CUDA::LaunchConfig1D::set_nelement (unsigned N) { unsigned max_nthread = device_properties.maxThreadsPerBlock; diff -Nru bl-dspsr-0+git20160405/Signal/General/LoadToFil.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFil.C --- bl-dspsr-0+git20160405/Signal/General/LoadToFil.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFil.C 2018-03-12 23:02:35.000000000 
+0000 @@ -176,12 +176,12 @@ if ( config->filterbank.get_nchan() ) { if (verbose) - cerr << "digifil: creating " << config->filterbank.get_nchan() - << " channel filterbank" << endl; + cerr << "digifil: creating " << config->filterbank.get_nchan() + << " channel filterbank" << endl; if ( config->coherent_dedisp ) { - cerr << "digifil: using coherent dedispersion" << endl; + cerr << "digifil: using coherent dedispersion" << endl; kernel = new Dedispersion; @@ -198,35 +198,37 @@ || config->coherent_dedisp || (config->npol>2) ) { - cerr << "digifil: using convolving filterbank" << endl; + cerr << "digifil: using convolving filterbank" << endl; - filterbank = new Filterbank; + filterbank = new Filterbank; - filterbank->set_nchan( config->filterbank.get_nchan() ); - filterbank->set_input( timeseries ); + filterbank->set_nchan( config->filterbank.get_nchan() ); + filterbank->set_input( timeseries ); filterbank->set_output( timeseries = new_TimeSeries() ); if (kernel) filterbank->set_response( kernel ); - if ( config->filterbank.get_freq_res() ) + if ( config->filterbank.get_freq_res() ) filterbank->set_frequency_resolution ( config->filterbank.get_freq_res() ); - operations.push_back( filterbank.get() ); - do_detection = true; + operations.push_back( filterbank.get() ); + do_detection = true; } else { - filterbank = new TFPFilterbank; + filterbank = new TFPFilterbank; - filterbank->set_nchan( config->filterbank.get_nchan() ); - filterbank->set_input( timeseries ); - filterbank->set_output( timeseries = new_TimeSeries() ); + filterbank->set_nchan( config->filterbank.get_nchan() ); + filterbank->set_input( timeseries ); + filterbank->set_output( timeseries = new_TimeSeries() ); - operations.push_back( filterbank.get() ); + operations.push_back( filterbank.get() ); } } + else + do_detection = true; if ( config->dedisperse ) { @@ -245,9 +247,9 @@ if (do_detection) { if (verbose) - cerr << "digifil: creating detection operation (npol=" << + cerr << "digifil: creating detection 
operation (npol=" << config->npol << ")" << endl; - + Detection* detection = new Detection; detection->set_input( timeseries ); diff -Nru bl-dspsr-0+git20160405/Signal/General/LoadToFITS.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFITS.C --- bl-dspsr-0+git20160405/Signal/General/LoadToFITS.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/LoadToFITS.C 2018-03-12 23:02:35.000000000 +0000 @@ -82,6 +82,8 @@ rescale_seconds = -1; rescale_constant = false; + integration_length = 0; + nbits = 2; npol = 4; @@ -93,6 +95,12 @@ weighted_time_series = false; } +// set block_size to result in approximately this much RAM usage +void dsp::LoadToFITS::Config::set_maximum_RAM (uint64_t ram) +{ + maximum_RAM = ram; +} + void dsp::LoadToFITS::Config::set_quiet () { SingleThread::Config::set_quiet(); @@ -135,7 +143,6 @@ if (!config->dedisperse && unpacker->get_order_supported (config->order)) unpacker->set_output_order (config->order); - // get basic information about the observation Observation* obs = manager->get_info(); @@ -149,31 +156,46 @@ cerr << "Source = " << obs->get_source() << endl; cerr << "Frequency = " << obs->get_centre_frequency() << endl; cerr << "Bandwidth = " << obs->get_bandwidth() << endl; + cerr << "Channels = " << nchan << endl; cerr << "Sampling rate = " << rate << endl; cerr << "State = " << tostring(obs->get_state()) <set_dispersion_measure( config->dispersion_measure ); - // Strategy will be to tscrunch from Nyquist resolution to desired reso. - - // voltage samples per filterbank sample - double samp_per_fb = config->tsamp * rate; - if (verbose) - cerr << "voltage samples per filterbank sample="<get_state() == Signal::Nyquist? 
0.5 : 1.0; unsigned fb_nchan = config->filterbank.get_nchan(); - unsigned tres_factor = round(factor*samp_per_fb/fb_nchan); - double tsamp = tres_factor/factor*fb_nchan/rate; + unsigned nsample; + double tsamp, samp_per_fb; + unsigned tres_factor; + double factor = obs->get_state() == Signal::Nyquist? 0.5 : 1.0; - cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl << " actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl; + if (fb_nchan > 0) + { + // Strategy will be to tscrunch from Nyquist resolution to desired reso. + // voltage samples per filterbank sample + samp_per_fb = config->tsamp * rate; + if (verbose) + cerr << "voltage samples per filterbank sample="<nsblk); + } + else + { + samp_per_fb = 1.0; + tres_factor = round(rate * config->tsamp); + tsamp = tres_factor/factor * 1/rate; + nsample = config->nsblk * tres_factor; + } + + cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl + << " actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl; if (verbose) cerr << "digifits: nsblk=" << config->nsblk << endl; - // voltage samples per output block - uint64_t nsample = round(samp_per_fb * config->nsblk); - // the unpacked input will occupy nbytes_per_sample double nbytes_per_sample = sizeof(float) * nchan * npol * ndim; double MB = 1024.0 * 1024.0; @@ -181,6 +203,10 @@ // ideally, block size would be a full output block, but this is too large // pick a nice fraction that will divide evently into maximum RAM // NB this doesn't account for copies (yet) + + if (verbose) + cerr << "digifits: nsample * nbytes_per_sample=" << nsample * nbytes_per_sample + << " config->maximum_RAM=" << config->maximum_RAM << endl; while (nsample * nbytes_per_sample > config->maximum_RAM) nsample /= 2; if (verbose) @@ -202,65 +228,74 @@ if (!obs->get_detected()) { - - if ( !config->filterbank.get_nchan() ) - throw Error(InvalidParam,"dsp::LoadToFITS::construct", - "must specify filterbank 
scheme if data are not detected"); - - // If user specifies -FN:D, enable coherent dedispersion - if ( config->filterbank.get_convolve_when() == - Filterbank::Config::During ) - config->coherent_dedisp = true; - - if ( (config->coherent_dedisp) && (config->dispersion_measure != 0.0) ) + // if no filterbank specified + if (fb_nchan == 0) { - cerr << "digifits: using coherent dedispersion" << endl; - - // "During" is the only option, my friends - config->filterbank.set_convolve_when( Filterbank::Config::During ); - - kernel = new Dedispersion; - kernel->set_dispersion_measure( config->dispersion_measure ); - - if (config->filterbank.get_freq_res()) - kernel -> set_times_minimum_nfft (config->filterbank.get_freq_res () ); - //kernel->set_frequency_resolution ( - // config->filterbank.get_freq_res()); - + if (nchan == 1) + throw Error(InvalidParam,"dsp::LoadToFITS::construct", + "must specify filterbank scheme if single channel data"); + else + if (verbose) + cerr << "digifits: no filterbank specified" << endl; } - else config->coherent_dedisp = false; + else + { + // If user specifies -FN:D, enable coherent dedispersion + if ( config->filterbank.get_convolve_when() == + Filterbank::Config::During ) + config->coherent_dedisp = true; + + if ( (config->coherent_dedisp) && (config->dispersion_measure != 0.0) ) + { + cerr << "digifits: using coherent dedispersion" << endl; + + // "During" is the only option, my friends + config->filterbank.set_convolve_when( Filterbank::Config::During ); + + kernel = new Dedispersion; + kernel->set_dispersion_measure( config->dispersion_measure ); + + if (config->filterbank.get_freq_res()) + kernel -> set_times_minimum_nfft (config->filterbank.get_freq_res () ); + //kernel->set_frequency_resolution ( + // config->filterbank.get_freq_res()); + + } + else + config->coherent_dedisp = false; # if HAVE_CUDA - if (run_on_gpu) - { - timeseries->set_memory (device_memory); - config->filterbank.set_device ( device_memory.ptr() ); - 
config->filterbank.set_stream ( gpu_stream ); - } + if (run_on_gpu) + { + timeseries->set_memory (device_memory); + config->filterbank.set_device ( device_memory.ptr() ); + config->filterbank.set_stream ( gpu_stream ); + } #endif - filterbank = config->filterbank.create (); + filterbank = config->filterbank.create (); + + filterbank->set_nchan( config->filterbank.get_nchan() ); + filterbank->set_input( timeseries ); + filterbank->set_output( timeseries = new_TimeSeries() ); - filterbank->set_nchan( config->filterbank.get_nchan() ); - filterbank->set_input( timeseries ); - filterbank->set_output( timeseries = new_TimeSeries() ); # if HAVE_CUDA - if (run_on_gpu) - timeseries->set_memory (device_memory); + if (run_on_gpu) + timeseries->set_memory (device_memory); #endif - if (kernel) - filterbank->set_response( kernel ); + if (kernel) + filterbank->set_response( kernel ); - if ( !config->coherent_dedisp ) - { - unsigned freq_res = config->filterbank.get_freq_res(); - if (freq_res > 1) - filterbank->set_frequency_resolution ( freq_res ); - } - - operations.push_back( filterbank.get() ); + if ( !config->coherent_dedisp ) + { + unsigned freq_res = config->filterbank.get_freq_res(); + if (freq_res > 1) + filterbank->set_frequency_resolution ( freq_res ); + } + operations.push_back( filterbank.get() ); + } if (verbose) cerr << "digifits: creating detection operation" << endl; @@ -441,6 +476,7 @@ FITSOutputFile* outputfile = new FITSOutputFile (output_filename); outputfile->set_nsblk (config->nsblk); outputfile->set_nbit (config->nbits); + outputfile->set_max_length (config->integration_length); outputFile = outputfile; outputFile->set_input (bitseries); @@ -462,8 +498,11 @@ unsigned freq_res = config->coherent_dedisp? 
kernel->get_frequency_resolution() : config->filterbank.get_freq_res(); if (freq_res == 0) freq_res = 1; - cerr << "digifits: creating " << config->filterbank.get_nchan() - << " by " << freq_res << " back channel filterbank" << endl; + if (config->filterbank.get_nchan()) + cerr << "digifits: creating " << config->filterbank.get_nchan() + << " by " << freq_res << " back channel filterbank" << endl; + else + cerr << "digifits: processing " << manager->get_info()->get_nchan() << " channels" << endl; // TODO -- set an optimal block size for search mode diff -Nru bl-dspsr-0+git20160405/Signal/General/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Signal/General/Makefile.am --- bl-dspsr-0+git20160405/Signal/General/Makefile.am 2018-03-12 08:32:59.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -21,7 +21,8 @@ dsp/TFPFilterbank.h dsp/RFIZapper.h dsp/SKFilterbank.h \ dsp/Resize.h dsp/SKDetector.h dsp/SKMasker.h \ dsp/Pipeline.h dsp/SingleThread.h dsp/MultiThread.h \ - dsp/PolnSelect.h dsp/PolnReshape.h + dsp/PolnSelect.h dsp/PolnReshape.h dsp/SpectralKurtosis.h \ + dsp/SKComputer.h libdspdsp_la_SOURCES = optimize_fft.c cross_detect.c cross_detect.h \ cross_detect.ic stokes_detect.c stokes_detect.h \ @@ -38,25 +39,54 @@ TFPFilterbank.C RFIZapper.C SKFilterbank.C \ Resize.C SKDetector.C SKMasker.C \ SingleThread.C MultiThread.C dsp_verbosity.C \ - PolnSelect.C PolnReshape.C + PolnSelect.C PolnReshape.C SpectralKurtosis.C + +bin_PROGRAMS = dmsmear digitxt digimon digihist filterbank_speed libdspdsp_la_LIBADD = if HAVE_CUFFT nobase_include_HEADERS += CUFFTError.h dsp/LaunchConfig.h \ - dsp/FilterbankCUDA.h dsp/filterbank_cuda.h \ - dsp/TransferCUDA.h dsp/TransferBitSeriesCUDA.h \ - dsp/SKMaskerCUDA.h dsp/DetectionCUDA.h dsp/FZoomCUDA.h \ - dsp/FScrunchCUDA.h dsp/TScrunchCUDA.h + dsp/FilterbankCUDA.h dsp/filterbank_cuda.h \ + dsp/TransferCUDA.h dsp/TransferBitSeriesCUDA.h \ + dsp/SKMaskerCUDA.h 
dsp/DetectionCUDA.h dsp/FZoomCUDA.h \ + dsp/FScrunchCUDA.h dsp/TScrunchCUDA.h \ + dsp/PScrunchCUDA.h dsp/PolnSelectCUDA.h \ + dsp/ConvolutionCUDA.h dsp/ConvolutionCUDASpectral.h \ + dsp/ConvolutionCUDACallbacks.h dsp/SpectralKurtosisCUDA.h \ + dsp/SKComputerCUDA.h dsp/SKDetectorCUDA.h dsp/SKFilterbankCUDA.h libdspdsp_la_SOURCES += CUFFTError.C LaunchConfig.C FilterbankCUDA.cu \ - TransferCUDA.C TransferBitSeriesCUDA.C DetectionCUDA.cu \ - SKMaskerCUDA.cu FZoomCUDA.cu FScrunchCUDA.cu TScrunchCUDA.cu + TransferCUDA.C TransferBitSeriesCUDA.C DetectionCUDA.cu \ + SKMaskerCUDA.cu FZoomCUDA.cu FScrunchCUDA.cu TScrunchCUDA.cu \ + PScrunchCUDA.cu PolnSelectCUDA.cu SpectralKurtosisCUDA.cu \ + SKComputerCUDA.cu SKDetectorCUDA.cu SKFilterbankCUDA.cu \ + ConvolutionCUDA.cu ConvolutionCUDASpectral.cu ConvolutionCUDACallbacks.cu + +bin_PROGRAMS += fftbatch_speed +fftbatch_speed_SOURCES = fftbatch_speed.C + +if HAVE_CUFFT_CALLBACKS +bin_PROGRAMS += cufft_callback_bench + + +cufft_callback_bench_DC.o: cufft_callback_bench.o + $(CUDA_NVCC) -o cufft_callback_bench_DC.o -dlink cufft_callback_bench.o -lcufft_static + +ConvolutionCUDACallbacks.lo: ConvolutionCUDACallbacks.cu + $(top_srcdir)/config/cudalt.py $(top_builddir)/libtool $@ $(CUDA_NVCC) -dc -c $< + $(CUDA_NVCC) -o ConvolutionCUDACallbacks_DC.o -dlink ConvolutionCUDACallbacks.o -lcufft_static + +cufft_callback_bench.lo: cufft_callback_bench.cu + $(top_srcdir)/config/cudalt.py $(top_builddir)/libtool $@ $(CUDA_NVCC) -dc -c $< + $(CUDA_NVCC) -o cufft_callback_bench_DC.o -dlink cufft_callback_bench.o -lcufft_static + +cufft_callback_bench_LDADD = $(LDADD) cufft_callback_bench_DC.o -lcudart -lcufft_static -lculibos endif +endif -bin_PROGRAMS = dmsmear digitxt digimon digihist filterbank_speed dmsmear_SOURCES = dmsmear.C digitxt_SOURCES = digitxt.C @@ -86,6 +116,10 @@ passband_SOURCES = passband.C passband_LDADD = @PSRPLOT_LIBS@ $(LDADD) +if HAVE_CUFFT_CALLBACKS + digistat_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart 
-lcufft_static -lculibos + passband_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart -lcufft_static -lculibos +endif # # end PGPLOT-specific code # @@ -128,11 +162,17 @@ bin_PROGRAMS += digifil digifil_SOURCES = digifil.C + if HAVE_dada bin_PROGRAMS += the_decimator the_decimator_SOURCES = the_decimator.C the_decimator_LDADD = $(LDADD) @OPENSSL_LIBS@ @PSRXML_LIBS@ @PSRDADA_LIBS@ the_decimator_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS) @PSRXML_CFLAGS@ @PSRDADA_CFLAGS@ + +if HAVE_CUFFT_CALLBACKS + the_decimator_LDADD += ConvolutionCUDACallbacks_DC.o -lcudart -lcufft_static -lculibos +endif + endif # @@ -151,9 +191,8 @@ LDADD = libdspdsp.la \ $(top_builddir)/Kernel/libdspbase.la \ $(top_builddir)/Signal/Statistics/libdspstats.la \ - @PGPLOT_LIBS@ @CUFFT_LIBS@ @CUDA_LIBS@ + @PGPLOT_LIBS@ @CUDA_LIBS@ @CUFFT_LIBS@ AM_CPPFLAGS += @PGPLOT_CFLAGS@ @CUFFT_CFLAGS@ @CFITSIO_CFLAGS@ AM_CXXFLAGS = @OPENMP_CFLAGS@ - diff -Nru bl-dspsr-0+git20160405/Signal/General/passband.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/passband.C --- bl-dspsr-0+git20160405/Signal/General/passband.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/passband.C 2018-03-12 23:02:35.000000000 +0000 @@ -5,6 +5,10 @@ * ***************************************************************************/ +#if HAVE_CONFIG_H +#include +#endif + #include "dsp/Bandpass.h" #include "dsp/RFIFilter.h" @@ -23,6 +27,11 @@ #include "pgutil.h" #include "Error.h" +#if HAVE_fits +#include "dsp/FITSFile.h" +#include "dsp/FITSUnpacker.h" +#endif + #include #include #include @@ -36,11 +45,13 @@ "Options:\n" " -b plot frequency bins (histogram style) \n" " -c cmap set the colour map (0 to 7) \n" + " -D set the PGPLOT device\n" " -d produce dynamic spectrum (greyscale) \n" " -F min,max set the min,max x-value (e.g. frequency zoom) \n" " -r min,max set the min,max y-value (e.g. 
saturate birdies) \n" " -n nchan number of frequency channels in each spectrum \n" " -t seconds integration interval for each spectrum \n" + " -s quit after a single integration \n" " -p detect the full-polarization bandpass \n" " -R test RFIFilter class \n" << endl; @@ -66,6 +77,9 @@ // integration length double integrate = 1.0; + // quit after a single integration (for quicklook) + bool single_quit = false; + // seek into file double seek_seconds = 0.0; @@ -101,7 +115,7 @@ int width_pixels = 0; int height_pixels = 0; - static const char* args = "ibB:c:dD:f:F:G:g:lr:n:pRS:T:t:hvV"; + static const char* args = "ibB:c:dD:f:F:G:g:lr:n:pRS:T:t:shvV"; while ((c = getopt(argc, argv, args)) != -1) switch (c) { @@ -181,6 +195,10 @@ integrate = atof (optarg); break; + case 's': + single_quit = true; + break; + case 'h': usage (); return 0; @@ -304,6 +322,30 @@ cerr << "opening data file " << filenames[ifile] << endl; manager->open (filenames[ifile]); + +#if HAVE_fits + // Use callback to handle scales/offsets for read-in + if (manager->get_info()->get_machine() == "FITS") + { + if (dsp::Operation::verbose) + cerr << "Using callback to read PSRFITS file." << endl; + // connect a callback + bool success = false; + dsp::FITSUnpacker* funp = dynamic_cast ( + manager->get_unpacker()); + dsp::FITSFile* ffile = dynamic_cast ( + manager->get_input()); + cerr << funp << endl; + cerr << ffile << endl; + if (funp && ffile) + { + ffile->update.connect ( funp, &dsp::FITSUnpacker::set_parameters ); + success = true; + } + if (not success) + cerr << "dspsr: WARNING: FITS input input but unable to apply scales and offsets." 
<< endl; + } +#endif if (verbose) cerr << "data file " << filenames[ifile] << " opened" << endl; @@ -409,6 +451,8 @@ passband->reset_output(); } + if (single_quit) + break; } if (dynamic) diff -Nru bl-dspsr-0+git20160405/Signal/General/PolnSelectCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/PolnSelectCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/PolnSelectCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/PolnSelectCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,151 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/PolnSelectCUDA.h" + +#include "Error.h" +#include "debug.h" + +#include + +#include + +using namespace std; + +void check_error (const char*); + +CUDA::PolnSelectEngine::PolnSelectEngine (cudaStream_t _stream) +{ + stream = _stream; +} + +CUDA::PolnSelectEngine::~PolnSelectEngine () +{ +} + +//! 
get cuda device properties +void CUDA::PolnSelectEngine::setup() +{ + gpu_config.init(); +} + + +// +// each thread reads a single value from both polarisation +// and adds them together +// +__global__ void fpt_polnselect_kernel (float * in, float * out, + uint64_t in_chan_span, + uint64_t out_chan_span, + uint64_t in_ndat) +{ + // ichan: blockIdx.y + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= in_ndat) + return; + + out[blockIdx.y * out_chan_span + idx] = in[blockIdx.y * in_chan_span + idx]; +} + +void CUDA::PolnSelectEngine::fpt_polnselect (int ipol, + const dsp::TimeSeries* input, + dsp::TimeSeries* output) +{ + if (input == output) + throw Error (InvalidParam, "CUDA::PolnSelectEngine::fpt_polnselect", + "cannot handle in-place data"); + + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + + if (npol != 2) + throw Error (InvalidParam, "CUDA::PolnSelectEngine::fpt_polnselect", + "number of input polarisations must be two"); + + uint64_t in_chan_span = 0; + uint64_t out_chan_span = 0; + if (nchan > 1) + { + in_chan_span = input->get_datptr (1, 0) - input->get_datptr (0, 0); + out_chan_span = output->get_datptr (1, 0) - output->get_datptr (0, 0); + } + + // TODO (idea) this could be changed to a bunch of memcpy's in low nchan case + + float * in = (float *) input->get_datptr (0, ipol); + float * out = output->get_datptr (0, 0); + +#ifdef _DEBUG + cerr << "CUDA::PolnSelectEngine::fpt_polnselect channel spans: input=" << in_chan_span << " output=" << out_chan_span << endl; +#endif + + dim3 threads (gpu_config.get_max_threads_per_block()); + dim3 blocks (ndat / threads.x, nchan); + + if (ndat % threads.x) + blocks.x ++; + + // pass span as number of complex values + fpt_polnselect_kernel<<>> (in, out, in_chan_span, out_chan_span, ndat); + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error ("CUDA::PolnSelectEngine::fpt_polnselect"); +} 
+ + +// each block polnselectes 1 time sample for many channels +__global__ void tfp_polnselect_kernel (float * in, float * out, unsigned nchan) +{ + // isamp == blockIdx.y + // ichan == blockIdx.x * blockDim.x + threadIdx.x + + const unsigned isamp = blockIdx.y; + const unsigned ichan = (blockIdx.x * blockDim.x + threadIdx.x); + const unsigned npol = 2; + + if (ichan >= nchan) + return; + + const unsigned int idx = (isamp * nchan * npol) + (ichan * npol); + const unsigned int odx = (isamp * nchan) + ichan; + + out[odx] = in[idx]; +} + +void CUDA::PolnSelectEngine::tfp_polnselect (int ipol, + const dsp::TimeSeries* input, + dsp::TimeSeries* output) +{ + if (input == output) + throw Error (InvalidParam, "CUDA::PolnSelectEngine::tfp_polnselect" + "cannot handle in-place data"); + + const uint64_t ndat = input->get_ndat(); + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + + if (npol != 2) + throw Error (InvalidParam, "CUDA::PolnSelectEngine::fpt_scrunch", + "number of input polarisations must be two"); + + dim3 threads (gpu_config.get_max_threads_per_block()); + if (nchan < gpu_config.get_max_threads_per_block()) + threads.x = nchan; + + dim3 blocks (nchan/threads.x, ndat); + if (nchan % threads.x) + blocks.x++; + + // offset into the TFP array by ipol + float * in_base = (float *) input->get_dattfp () + ipol; + float * out_base = (float *) output->get_dattfp (); + + tfp_polnselect_kernel<<>> (in_base, out_base, nchan); +} diff -Nru bl-dspsr-0+git20160405/Signal/General/PScrunchCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/PScrunchCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/PScrunchCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/PScrunchCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,159 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic 
Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/PScrunchCUDA.h"
+
+#include "Error.h"
+#include "debug.h"
+// NOTE(review): the two system header names below were stripped from this text; restore them from upstream
+#include
+
+#include
+
+using namespace std;
+
+void check_error (const char*);
+
+CUDA::PScrunchEngine::PScrunchEngine (cudaStream_t _stream)  // only records the stream; no device work here
+{
+  stream = _stream;
+}
+
+CUDA::PScrunchEngine::~PScrunchEngine ()  // nothing to release
+{
+}
+
+//! get cuda device properties (gpu_config is asked for the max threads per block at launch time)
+void CUDA::PScrunchEngine::setup()
+{
+  gpu_config.init();
+}
+
+
+// FPT order: blockIdx.y selects the channel, each thread handles one sample index idx.
+// out[idx] = (in_p0[idx] + in_p1[idx]) * M_SQRT1_2 within that channel;
+// in_chan_span / out_chan_span are the element offsets between consecutive channels,
+// in_ndat bounds idx because the grid is rounded up to whole blocks.
+__global__ void fpt_pscrunch_kernel (float * in_p0, float * in_p1,
+    float * out, uint64_t in_chan_span,
+    uint64_t out_chan_span, uint64_t in_ndat)
+{
+  // ichan: blockIdx.y
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx >= in_ndat)
+    return;
+
+  // increment the input/output base pointers to this chan/pol
+  in_p0 += (blockIdx.y * in_chan_span);
+  in_p1 += (blockIdx.y * in_chan_span);
+  out += (blockIdx.y * out_chan_span);
+
+  out[idx] = (in_p0[idx] + in_p1[idx]) * M_SQRT1_2;
+}
+//! Sum the two polarisations of an FPT-ordered input into output (out of place); throws Error (InvalidParam) otherwise.
+void CUDA::PScrunchEngine::fpt_pscrunch (const dsp::TimeSeries* input,
+    dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::fpt_pscrunch",
+      "cannot handle in-place data");
+
+  const uint64_t input_ndat = input->get_ndat();
+  const unsigned input_nchan = input->get_nchan();
+  const unsigned input_npol = input->get_npol();
+
+  if (input_npol != 2)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::fpt_pscrunch",
+      "number of input polarisations must be two");
+
+  uint64_t in_chan_span = 0;   // stays 0 for a single channel: the kernel then never leaves channel 0
+  uint64_t out_chan_span = 0;
+  if (input_nchan > 1)
+  {
+    in_chan_span = input->get_datptr (1, 0) - input->get_datptr (0, 0);
+    out_chan_span = output->get_datptr (1, 0) - output->get_datptr (0, 0);
+  }
+
+  float * in_pol0 = (float *) input->get_datptr (0, 0);
+  float * in_pol1 = (float *) input->get_datptr (0, 1);
+  float * out = output->get_datptr (0, 0);
+
+#ifdef _DEBUG
+  cerr << "CUDA::PScrunchEngine::fpt_pscrunch channel spans: input=" << in_chan_span << " output=" << out_chan_span << endl;
+#endif
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  dim3 blocks (input_ndat / threads.x, input_nchan);
+
+  if (input_ndat % threads.x)
+    blocks.x ++;
+  // NOTE(review): the launch configuration was stripped from this text ("<<>>"); rebuilt from blocks/threads/stream - confirm against upstream
+  // spans are differences of float pointers, i.e. counted in floats
+  fpt_pscrunch_kernel<<<blocks,threads,0,stream>>> (in_pol0, in_pol1, out, in_chan_span, out_chan_span, input_ndat);
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error ("CUDA::PScrunchEngine::fpt_pscrunch");
+}
+
+// TFP order: each block pscrunches 1 time sample (blockIdx.y) for many channels; one thread per (chan,pol) value.
+// Needs blockDim.x floats of dynamic shared memory and an even blockDim.x so both pols of a channel share a block.
+__global__ void tfp_pscrunch_kernel (float * in, float * out, unsigned nchan)
+{
+  extern __shared__ float pscr_shm[];
+
+  // isamp == blockIdx.y
+  // ipol == even/odd threads
+  // ichan == (blockIdx.x * blockDim.x + threadIdx.x) / 2
+
+  const unsigned isamp = blockIdx.y;
+  const unsigned ichanpol = (blockIdx.x * blockDim.x + threadIdx.x);
+  const unsigned ichan = ichanpol / 2;
+  const unsigned ipol = ichanpol & 0x1; // % 2
+  const unsigned npol = 2;
+
+  // threads past the end must still reach __syncthreads(), so mask them instead of returning early
+  const bool active = (ichanpol < nchan*npol);
+
+  const unsigned int idx = (isamp * nchan * npol) + ichan * npol + ipol;
+  const unsigned int odx = (isamp * nchan) + ichan;
+
+  pscr_shm[threadIdx.x] = active ? in[idx] : 0.0f;
+
+  __syncthreads();
+
+  if (active && ipol == 0)
+    out[odx] = (pscr_shm[threadIdx.x] + pscr_shm[threadIdx.x+1]) * M_SQRT1_2;
+}
+//! Sum the two polarisations of a TFP-ordered input into output (out of place); throws Error (InvalidParam) otherwise.
+void CUDA::PScrunchEngine::tfp_pscrunch (const dsp::TimeSeries* input,
+    dsp::TimeSeries* output)
+{
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::tfp_pscrunch",
+      "cannot handle in-place data");
+
+  const uint64_t input_ndat = input->get_ndat();
+  const unsigned input_nchan = input->get_nchan();
+  const unsigned input_npol = input->get_npol();
+
+  if (input_npol != 2)
+    throw Error (InvalidParam, "CUDA::PScrunchEngine::tfp_pscrunch",
+      "number of input polarisations must be two");
+
+  dim3 threads (gpu_config.get_max_threads_per_block());
+  dim3 blocks (input_nchan*input_npol/threads.x, input_ndat);
+  if (input_nchan*input_npol % threads.x)
+    blocks.x++;
+
+  float * in_base = (float *) input->get_dattfp ();
+  float * out_base = (float *) output->get_dattfp ();
+  size_t shm_bytes = threads.x * sizeof(float);  // the kernel indexes pscr_shm by threadIdx.x: one float per thread
+  // NOTE(review): the launch configuration was stripped from this text ("<<>>"); rebuilt from blocks/threads/shm_bytes/stream - confirm against upstream
+  tfp_pscrunch_kernel<<<blocks,threads,shm_bytes,stream>>> (in_base, out_base, input_nchan);
+}
diff -Nru bl-dspsr-0+git20160405/Signal/General/SingleThread.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SingleThread.C --- bl-dspsr-0+git20160405/Signal/General/SingleThread.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SingleThread.C 2018-03-12 23:02:35.000000000 +0000 @@ -24,6 +24,7 @@ #if HAVE_CUDA #include "dsp/MemoryCUDA.h" #include "dsp/TransferCUDA.h" +#include "dsp/TimeSeriesCUDA.h" #endif #include "dsp/ObservationChange.h" @@ -58,6 +59,7 @@ input_context = 0; gpu_stream = undefined_stream; + gpu_device = 0; } dsp::SingleThread::~SingleThread () @@ -216,35 +218,47 @@ if (run_on_gpu) { - // disable input buffering when data must be copied between devices - if (config->get_total_nthread() > 1) - config->input_buffering = false; - - int device = config->cuda_device[thread_id]; + gpu_device = config->cuda_device[thread_id]; cerr << "dspsr: thread " << thread_id - << " using CUDA device " << device << endl; + << " using CUDA device " << gpu_device << endl; int ndevice = 0; cudaError err = cudaGetDeviceCount(&ndevice); - if (err != cudaSuccess || device >= ndevice) + if (err != cudaSuccess || gpu_device >= ndevice) throw Error (InvalidParam, "dsp::SingleThread::initialize", - "device=%d >= ndevice=%d cudaError=%s", device, ndevice, cudaGetErrorString(err)); + "device=%d >= ndevice=%d cudaError=%s", gpu_device, ndevice, cudaGetErrorString(err)); - err = cudaSetDevice (device); + err = cudaSetDevice (gpu_device); if (err != cudaSuccess) throw Error (InvalidState, "dsp::SingleThread::initialize", "cudaMalloc failed: %s",
cudaGetErrorString(err)); - unsigned nstream = count (config->cuda_device, (unsigned)device); + unsigned nstream = count (config->cuda_device, (unsigned) gpu_device); // always create a stream, even for 1 thread cudaStreamCreate( &stream ); cerr << "dspsr: thread " << thread_id << " on stream " << stream << endl; gpu_stream = stream; + device_memory = new CUDA::DeviceMemory (stream, gpu_device); + if (config->input_buffering) + cerr << "dspsr: input_buffering enabled" << endl; + else + cerr << "dspsr: input_buffering disabled" << endl; + if (unpacker->get_device_supported( device_memory )) + cerr << "dspsr: unpacker supports device memory" << endl; - device_memory = new CUDA::DeviceMemory (stream); + if ((thread_id == 0) && (!config->input_buffering) && unpacker->get_device_supported( device_memory )) + { + dsp::Seekable * seekable = dynamic_cast( manager->get_input() ); + if (seekable) + { + cerr << "dspsr: disabling input buffering, using overlap memory instead" << endl; + // overlap memory on stream/device of thread_id 0 + seekable->set_overlap_buffer_memory (device_memory); + } + } if (unpacker->get_device_supported( device_memory )) { @@ -253,9 +267,10 @@ unpacker->set_device( device_memory ); unpacked->set_memory( device_memory ); - + unpacked->set_engine (new CUDA::TimeSeriesEngine (device_memory)); + BitSeries* bits = new BitSeries; - bits->set_memory (new CUDA::PinnedMemory); + bits->set_memory (device_memory); manager->set_output (bits); } else @@ -339,10 +354,19 @@ //! 
Run through the data void dsp::SingleThread::run () try { - if (Operation::verbose) - cerr << "dsp::SingleThread::run this=" << this + + if (Operation::verbose) { + + cerr << "dsp::SingleThread::run this=" << this << " nops=" << operations.size() << endl; + for (unsigned iop=0; iop < operations.size(); iop++){ + cerr << "dsp::SingleThread::run operation (" << iop << "): " + << operations[iop]->get_name() << endl; + } + + } + if (log) scratch->set_cerr (*log); diff -Nru bl-dspsr-0+git20160405/Signal/General/SKComputerCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKComputerCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/SKComputerCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKComputerCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,544 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2016 by Andre Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/SKComputerCUDA.h" +#include "dsp/MemoryCUDA.h" + +#include "Error.h" +#include "templates.h" +#include "debug.h" + +#include +#include +#include + +#include + +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + #define HAVE_SHFL + #else + #define NO_SHFL + #endif +#endif + +using namespace std; + +void check_error (const char*); +void check_error_stream (const char*, cudaStream_t); + +/* + * Important Note, this engine is only efficient for larger strides (256-512) + * stride == nbeam for molongolo + */ + +CUDA::SKComputerEngine::SKComputerEngine (dsp::Memory * memory) +{ + device_memory = dynamic_cast(memory); + stream = device_memory->get_stream(); + + work_buffer_size = 0; + work_buffer = 0; +} + +void CUDA::SKComputerEngine::setup () +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKComputerEngine::setup ()" << endl; + + // determine GPU capabilities + int device = 0; 
+ cudaGetDevice(&device); + struct cudaDeviceProp device_properties; + cudaGetDeviceProperties (&device_properties, device); + max_threads_per_block = device_properties.maxThreadsPerBlock; +} + +// each +__global__ void reduce_sqld_new (float2 * in, float2 * sums, float * skestimates, uint64_t in_stride, unsigned M) +{ + extern __shared__ float s1s[]; + float * s2s = s1s + 32; + + // each block integrates M samples + const unsigned ichanpol = blockIdx.y; + const unsigned nchanpol = gridDim.y; + + // offset to current channel, pol + in += (ichanpol * in_stride) + (blockIdx.x * M); + + float power; + float2 val; + float s1 = 0; + float s2 = 0; + + // in case M is > blockDim.x + for (unsigned i=threadIdx.x; i 0; offset >>= 1) + { + if (threadIdx.x < offset) + { + s1s[threadIdx.x] += s1s[threadIdx.x + offset]; + s2s[threadIdx.x] += s2s[threadIdx.x + offset]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + val.x = s1s[0]; + val.y = s2s[0]; + unsigned odx = blockIdx.x*nchanpol + ichanpol; + sums [odx] = val; + skestimates[odx] = ((M+1) / (M-1)) * (M * (val.y / (val.x * val.x)) - 1); + } + + +#endif + + // now we need to a reduction across the block +} + + +/* Perform a reduction including SQLD calculations */ +__global__ void reduce_sqld (float * in, float * out, const uint64_t ndat) +{ + extern __shared__ float sdata[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int s1 = (threadIdx.x*2); + unsigned int s2 = (threadIdx.x*2) + 1; + + float re = 0; + float im = 0; + if (i < ndat) + { + re = in[(2*i)]; + im = in[(2*i) + 1]; + } + + sdata[s1] = (re * re) + (im * im); + sdata[s2] = sdata[s1] * sdata[s1]; + + __syncthreads(); + + int last_offset = blockDim.x/2 + blockDim.x % 2; + + for (int offset = blockDim.x/2; offset > 0; offset >>= 1) + { + // add a partial sum upstream to our own + if (threadIdx.x < offset) + { + sdata[s1] += sdata[s1 + (2*offset)]; + sdata[s2] += sdata[s2 + (2*offset)]; + } + __syncthreads(); + + // special case for 
non power of 2 reductions + if ((last_offset % 2) && (last_offset > 2) && (threadIdx.x == offset)) + { + sdata[0] += sdata[s1 + (2*offset)]; + sdata[1] += sdata[s2 + (2*offset)]; + } + + last_offset = offset; + + // wait until all threads in the block have updated their partial sums + __syncthreads(); + } + + // thread 0 writes the final result + if (threadIdx.x == 0) + { + out[(2*blockIdx.x)] = sdata[0]; + out[(2*blockIdx.x)+1] = sdata[1]; + } +} + +/* sum each set of S1 and S2 and compute SK estimate for whole block */ +__global__ void reduce_sk_estimate_new (float2* input, float * output, unsigned nchanpol, unsigned ndat, float M) +{ + // input are stored in TFP order + const float M_fac = (M+1) / (M-1); + + for (unsigned ichanpol=threadIdx.x; ichanpol 0; offset >>= 1) + { + // add a partial sum upstream to our own + if (threadIdx.x < offset) + { + sdata[s1] += sdata[s1 + (2*offset)]; + sdata[s2] += sdata[s2 + (2*offset)]; + } + + __syncthreads(); + + // special case for non power of 2 reductions + if ((last_offset % 2) && (last_offset > 2) && (threadIdx.x == offset)) + { + sdata[0] += sdata[s1 + (2*offset)]; + sdata[1] += sdata[s2 + (2*offset)]; + } + + last_offset = offset; + + // wait until all threads in the block have updated their partial sums + __syncthreads(); + } + + // thread 0 writes the final result + if (threadIdx.x == 0) + { + if (sdata[0] == 0) + out[0] = 0; + else + { + float M_fac = (M+1) / (M-1); + out[0] = M_fac * (M * (sdata[1] / (sdata[0]*sdata[0])) - 1); + } + } +} + +__global__ void calc_sk_estimate (float * in, float * out, float M_fac, unsigned int M, size_t out_span) +{ + unsigned int i = threadIdx.x; + float S1_sum = in[(2*i)]; + float S2_sum = in[(2*i)+1]; + if (S1_sum == 0) + out[out_span*i] = 0; + else + out[out_span*i] = M_fac * (M * (S2_sum / (S1_sum * S1_sum)) - 1); +} + +// calculate SK statistics +void CUDA::SKComputerEngine::compute (const dsp::TimeSeries* input, + dsp::TimeSeries* output, dsp::TimeSeries *output_tscr, 
unsigned M) +{ + if (dsp::Operation::verbose) + std::cerr << "CUDA::SKComputerEngine::compute()" << std::endl; + + const uint64_t ndat = output->get_ndat() * M; + const unsigned nchan = input->get_nchan (); + const unsigned npol = input->get_npol (); + const unsigned nchanpol = nchan * npol; + + if (dsp::Operation::verbose) + std::cerr << "CUDA::SKComputerEngine::compute ndat=" << ndat << " nchan=" + << nchan << " npol=" << npol << " M=" << M << std::endl; + + float * outdat = output->get_dattfp(); + float * outdat_tscr = output_tscr->get_dattfp(); + if (dsp::Operation::verbose) + { + std::cerr << "CUDA::SKComputerEngine::compute outdat=" << (void *) outdat << endl; + std::cerr << "CUDA::SKComputerEngine::compute outdat_tscr=" << (void *) outdat_tscr << endl; + } + + // TODO: currently only support FPT on GPU due to FoldCUDA + switch (input->get_order()) + { + case dsp::TimeSeries::OrderFPT: + { + if (dsp::Operation::verbose) + std::cerr << "CUDA::SKComputerEngine::compute OrderFPT" << std::endl; + + float2 * indat = (float2*) input->get_datptr (0, 0); + + unsigned nthreads = 1024; + if (M < nthreads) + nthreads = M; + dim3 blocks (ndat / M, nchanpol); + + // this is by design, due to input buffering + assert (ndat % M == 0); + + // work buffer for S1 and S2 values for each set of M samples + size_t bytes_required = nchanpol * blocks.x * sizeof(float2); + if (bytes_required > work_buffer_size) + { + if (work_buffer) + { + cudaFree(work_buffer); + } + work_buffer_size = bytes_required; + cudaMalloc (&work_buffer, work_buffer_size); + } + + if (dsp::Operation::verbose) + cerr << "CUDA::SKComputerEngine::compute ndat=" << ndat + << " blocks=(" << blocks.x << "," << blocks.y << ")" + << " nthreads=" << nthreads << endl; + + // require an S1 and S2 value for each warp in each block + size_t shm_bytes_1 = 32 * sizeof(float2); + + if (dsp::Operation::verbose) + cerr << "CUDA::SKComputerEngine::compute work_buffer=" << (void *) work_buffer << endl; + + uint64_t in_stride; 
+ if (npol > 1) + in_stride = input->get_datptr (0, 1) - input->get_datptr (0, 0); + else + in_stride = input->get_datptr (1, 0) - input->get_datptr (0, 0); + + // for float2 + in_stride /= 2; + + reduce_sqld_new<<>> ((float2 *) indat, (float2 *) work_buffer, outdat, in_stride, M); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld_new [first]", stream); + else + check_error ("CUDA::SKComputerEngine::compute reduce_sqld_new [first]"); + + // compute a tscrunched output SK + nthreads = 1024; + if (nchanpol < nthreads) + nthreads = nchanpol; + reduce_sk_estimate_new<<<1,nthreads,0,stream>>>((float2*) work_buffer, outdat_tscr, nchanpol, blocks.x, ndat); + +#if 0 + + + // TODO consider making ichan a ydim? + for (unsigned ichan=0; ichan(input->get_datptr (ichan, ipol)); + + //cerr << "CUDA::SKComputerEngine::compute ichan=" << ichan << " pol=" << ipol << " indat=" << indat << endl; + + // foreach block reduce to S1, S2 sums [out of place] + //cerr << "CUDA::SKComputerEngine::compute [1] [" << ichan << ", " << ipol << "] shm_bytes=" << shm_bytes_1 << endl; + reduce_sqld<<>> (indat, work_buffer, ndat_proc); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld [first]", stream); + else + check_error ("CUDA::SKComputerEngine::compute reduce_sqld [first]"); + + // calculate S1, S2 sums for tscr [in place] + //cerr << "CUDA::SKComputerEngine::compute [2] [" << ichan << ", " << ipol << "] shm_bytes=" << shm_bytes_2 << endl; + reduce_sk_estimate<<<1,nblocks,shm_bytes_2,stream>>> (work_buffer, outdat_tscr, nblocks, ndat_proc, ichan); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + check_error_stream ("CUDA::SKComputerEngine::compute reduce_sqld [second]", stream); + else + check_error ("CUDA::SKComputerEngine::compute reduce_sqld [second]"); + + // caculate SK estimator 
for each block in place [out of place] + calc_sk_estimate<<<1,nblocks,0,stream>>> (work_buffer, outdat, M_fac, M, nchan*npol); + if (dsp::Operation::record_time || dsp::Operation::verbose) + if (stream) + check_error_stream ("CUDA::SKComputerEngine::compute sk_estimate", stream); + else + check_error ("CUDA::SKComputerEngine::compute sk_estimate"); + + outdat ++; + outdat_tscr ++; + } + } +#endif + + // now calculate the SK limit for the tscrunched data + break; + } + + case dsp::TimeSeries::OrderTFP: + { + throw Error (InvalidState, "CUDA::SKComputerEngine::compute", + "OrderTFP is unsupported input order"); + } + + default: + { + throw Error (InvalidState, "CUDA::SKComputerEngine::compute", + "unsupported input order"); + } + } +} + + +__global__ void copy1sample ( const float * in_base, + float2 * out_base, + uint64_t out_stride, + uint64_t ndat, + unsigned M) +{ + const unsigned idat = blockIdx.x * blockDim.x + threadIdx.x; + if (idat >= ndat) + return; + + const unsigned ipol = blockIdx.z; + const unsigned ichan = blockIdx.y; + const unsigned isk = idat / M; + + const unsigned nchan = gridDim.y; + const unsigned npol = gridDim.z; + + // forward pointer to pol0 for this chan + out_base += (ichan * npol + ipol) * out_stride; + + // get the SK estimate (TFP order) for this sample/pol + const float sk = in_base[isk * nchan * npol + ichan*npol + ipol]; + + out_base[idat].x = sk; + out_base[idat].y = sk; +} + + + +void CUDA::SKComputerEngine::insertsk (const dsp::TimeSeries* input, + dsp::TimeSeries* output, + unsigned M) +{ + // copy the SK estimates to the output timesseries + if (dsp::Operation::verbose) + cerr << "CUDA::SKMaskerEngine::insertsk M=" << M << endl; + + uint64_t ndat = output->get_ndat(); + unsigned nchan = output->get_nchan(); + unsigned npol = output->get_npol(); + + // order is FPT + const float * in_base = (float *) input->get_dattfp (); + float2 * out_base = (float2 *) output->get_datptr (0, 0); + + uint64_t out_stride; + if (npol == 1) + { + 
out_stride = output->get_datptr (1, 0) - output->get_datptr (0, 0); + } + else + { + out_stride = output->get_datptr (0, 1) - output->get_datptr (0, 0); + } + + out_stride /= 2; + + unsigned threads = max_threads_per_block; + dim3 blocks (ndat / threads, nchan, npol); + if (ndat % threads) + blocks.x++; + + cerr << "CUDA::SKComputerEngine::insertsk ndat=" << ndat << " nchan=" << nchan << " npol=" << npol << endl; + cerr << "CUDA::SKComputerEngine::insertsk out_stride=" << out_stride << endl; + cerr << "CUDA::SKComputerEngine::insertsk blocks=(" << blocks.x << ", " << blocks.y << ") threads=" << threads << endl; + + copy1sample<<>> (in_base, out_base, out_stride, ndat, M); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error( "CUDA::SKComputerEngine::insertsk" ); +} diff -Nru bl-dspsr-0+git20160405/Signal/General/SKDetector.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetector.C --- bl-dspsr-0+git20160405/Signal/General/SKDetector.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKDetector.C 2018-03-12 23:02:35.000000000 +0000 @@ -14,8 +14,6 @@ #include #include -//#define USE_MEGA_THRESHOLDS 1 - using namespace std; dsp::SKDetector::SKDetector () @@ -26,18 +24,11 @@ n_std_devs = 3; upper_thresh = 0; lower_thresh = 0; -#ifdef USE_MEGA_THRESHOLDS - mega_upper_thresh = 0; - mega_lower_thresh = 0; -#endif debugd = 1; s_chan = 0; e_chan = 0; ndat_zapped = 0; ndat_zapped_skfb = 0; -#ifdef USE_MEGA_THRESHOLDS - ndat_zapped_mega = 0; -#endif ndat_zapped_fscr = 0; ndat_zapped_tscr = 0; ndat_total = 0; @@ -60,14 +51,17 @@ cerr << "Zapped: " << " total=" << (100 * (float) ndat_zapped / (float) ndat_total) << "\%" << " skfb=" << (100 * (float) ndat_zapped_skfb / (float) ndat_total) << "\%" -#ifdef USE_MEGA_THRESHOLDS - << " mega=" << (100 * (float) ndat_zapped_mega / (float) ndat_total) << "\%" -#endif << " tscr=" << (100 * (float) ndat_zapped_tscr / (float) ndat_total) << "\%" << " fscr=" << (100 * 
(float) ndat_zapped_fscr / (float) ndat_total) << "\%" << endl; } +void dsp::SKDetector::set_engine (Engine * _engine) +{ + engine = _engine; + engine->setup(); +} + void dsp::SKDetector::set_thresholds (unsigned _M, unsigned _n_std_devs) { M = _M; @@ -81,16 +75,6 @@ upper_thresh = (float) limits.get_upper_threshold(); lower_thresh = (float) limits.get_lower_threshold(); -#ifdef USE_MEGA_THRESHOLDS - if (verbose) - cerr << "dsp::SKDetector::set_thresholds SKlimits(" << M << ", " << n_std_devs + 3 << ")" << endl; - limits.set_std_devs(6); - limits.calc_limits(); - - mega_upper_thresh = (float) limits.get_upper_threshold(); - mega_lower_thresh = (float) limits.get_lower_threshold(); -#endif - if (verbose) cerr << "dsp::SKDetector::set_thresholds M=" << M << " n_std_devs=" << n_std_devs << " [" << lower_thresh << " - " << upper_thresh @@ -190,10 +174,12 @@ unsigned char * outdat = 0; const unsigned npol = input->get_npol(); - assert(npol == 2); + unsigned zap_chan; + float V; + //assert(npol == 2); - float V_p0 = 0; - float V_p1 = 0; + //float V_p0 = 0; + //float V_p1 = 0; if (ndat && (tscr_M != M * ndat)) { @@ -214,20 +200,27 @@ << "]" << endl; } + if (engine) + { + engine->detect_tscr(input, input_tscr, output, tscr_upper, tscr_lower); + return; + } + for (uint64_t ichan=s_chan; ichan < e_chan; ichan++) { - // check the tscrunched value for this idat - V_p0 = indat[2*ichan]; - V_p1 = indat[2*ichan+1]; - - if ( V_p0 > tscr_upper || - V_p0 < tscr_lower || - V_p1 > tscr_upper || - V_p1 < tscr_lower ) + zap_chan = 0; + for (unsigned ipol=0; ipol < npol; ipol++) + { + V = indat[ichan*npol + ipol]; + if (V > tscr_upper || V < tscr_lower) + zap_chan = 1; + + } + if (zap_chan) { if (verbose) - cerr << "dsp::SKDetector::detect_tscr zap [" << V_p0 << ", " - << V_p1 << "] ichan=" << ichan << endl; + cerr << "dsp::SKDetector::detect_tscr zap V=" << V << ", " + << "ichan=" << ichan << endl; outdat = output->get_datptr(); for (unsigned idat=0; idat < ndat; idat++) { @@ -245,66 
+238,45 @@ if (verbose) cerr << "dsp::SKDetector::detect_skfb()" << endl; + if (engine) + { + engine->detect_ft(input, output, upper_thresh, lower_thresh); + return; + } + const unsigned nchan = input->get_nchan(); const unsigned npol = input->get_npol(); uint64_t ndat = input->get_ndat(); const float * indat = input->get_dattfp(); unsigned char * outdat = output->get_datptr(); -#ifdef USE_MEGA_THRESHOLDS - uint64_t zapped_mega = 0; -#endif - float V_p0 = 0; - float V_p1 = 0; + float V = 0; + char zap; // compare SK estimator for each pol to expected values for (uint64_t idat=0; idat < ndat; idat++) { -#ifdef USE_MEGA_THRESHOLDS - zapped_mega = 0; -#endif - // for each channel and pol in the SKFB for (unsigned ichan=0; ichan < nchan; ichan++) { - V_p0 = indat[ichan*2]; - V_p1 = indat[ichan*2+1]; - - if ( V_p0 > upper_thresh || - V_p0 < lower_thresh || - V_p1 > upper_thresh || - V_p1 < lower_thresh ) + zap = 0; + for (unsigned ipol=0; ipol < npol; ipol++) + { + V = indat[npol*ichan + ipol]; + if (V > upper_thresh || V < lower_thresh) + { + zap = 1; + } + } + if (zap) { outdat[ichan] = 1; // only count skfb zapped channels in the in-band region if (ichan > s_chan && ichan < e_chan) ndat_zapped_skfb++; - -#ifdef USE_MEGA_THRESHOLDS - if ( V_p0 > mega_upper_thresh || - V_p0 < mega_lower_thresh || - V_p1 > mega_upper_thresh || - V_p1 < mega_lower_thresh ) - { - zapped_mega ++; - } -#endif } } -#ifdef USE_MEGA_THRESHOLDS - if (zapped_mega > 10) - { - if (verbose) - cerr << "ZAP mega n_bad_chan=" << zapped_mega << endl; - for (unsigned ichan=0; ichanreset_mask(output); + return; + } + unsigned nchan = output->get_nchan(); uint64_t ndat = output->get_ndat(); unsigned char * outdat = output->get_datptr(); @@ -332,6 +310,15 @@ if (verbose) cerr << "dsp::SKDetector::count_zapped hits=" << unfiltered_hits << endl; + int zapped = 0; + + if (engine) + { + zapped = engine->count_mask (output); + ndat_zapped += zapped; + return; + } + unsigned npol = input->get_npol(); const float 
* indat = input->get_dattfp(); @@ -355,6 +342,7 @@ for (uint64_t idat=0; idat < ndat; idat++) { + // number of SK idats (same for each channel) unfiltered_hits ++; for (unsigned ichan=s_chan; ichan < e_chan; ichan++) @@ -362,19 +350,20 @@ uint64_t index = (idat*nchan + ichan) * npol; unsigned outdex = ichan * npol; + // sum of all SK values unfiltered_sum[outdex] += indat[index]; if (npol == 2) - unfiltered_sum[outdex+1] += indat[index+1]; - + unfiltered_sum[outdex+1] += indat[index+1]; + if (outdat[(idat*nchan) + ichan] == 1) { ndat_zapped ++; - continue; + continue; } filtered_sum[outdex] += indat[index]; if (npol == 2) - filtered_sum[outdex+1] += indat[index+1]; + filtered_sum[outdex+1] += indat[index+1]; filtered_hits[ichan] ++; } @@ -386,66 +375,81 @@ if (verbose) cerr << "dsp::SKDetector::detect_fscr()" << endl; + float _M = (float) M; + float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3)); unsigned nchan = input->get_nchan(); + + if (engine) + { + float one_sigma_idat = sqrt(mu2 / float((e_chan-s_chan)+1)); + const float upper = 1 + ((1+n_std_devs) * one_sigma_idat); + const float lower = 1 - ((1+n_std_devs) * one_sigma_idat); + engine->detect_fscr(input, output, lower, upper, s_chan, e_chan); + return; + } + const unsigned npol = input->get_npol(); const uint64_t ndat = input->get_ndat(); const float * indat = input->get_dattfp(); unsigned char * outdat = output->get_datptr(); - float sk_avg_p0 = 0; - float sk_avg_p1 = 0; + float sk_avg; unsigned sk_avg_cnt = 0; + unsigned zap_idat; + uint64_t nzap = 0; + // foreach SK integration for (uint64_t idat=0; idat < ndat; idat++) { - sk_avg_p0 = 0; - sk_avg_p1 = 0; - sk_avg_cnt = 0; - - if (verbose) - cerr << "dsp::SKDetector::detect_fscr idat=" << idat << endl; - // accumulate the avg values for p0 and p1 - for (unsigned ichan=s_chan; ichan < e_chan; ichan++) + zap_idat = 0; + for (unsigned ipol=0; ipol < npol; ipol++) { - if (outdat[ichan] == 0) + sk_avg = 0; + sk_avg_cnt = 0; + + for (unsigned 
ichan=s_chan; ichan < e_chan; ichan++) + { + if (outdat[ichan] == 0) + { + sk_avg += indat[ichan*npol + ipol]; + sk_avg_cnt++; + } + } + + if (sk_avg_cnt > 0) { - sk_avg_p0 += indat[ichan*2]; - sk_avg_p1 += indat[ichan*2+1]; - sk_avg_cnt++; + sk_avg /= (float) sk_avg_cnt; + + float one_sigma_idat = sqrt(mu2 / (float) sk_avg_cnt); + float avg_upper_thresh = 1 + ((1+n_std_devs) * one_sigma_idat); + float avg_lower_thresh = 1 - ((1+n_std_devs) * one_sigma_idat); + if ((sk_avg > avg_upper_thresh) || (sk_avg < avg_lower_thresh)) + { + if (verbose) + cerr << "Zapping idat=" << idat << " ipol=" << ipol << " sk_avg=" << sk_avg + << " [" << avg_lower_thresh << " - " << avg_upper_thresh + << "] cnt=" << sk_avg_cnt << endl; + zap_idat = 1; + } } } - if (sk_avg_cnt > 0) + if (zap_idat) { - sk_avg_p0 /= (float) sk_avg_cnt; - sk_avg_p1 /= (float) sk_avg_cnt; - - float _M = (float) M; - float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3)); - float one_sigma_idat = sqrt(mu2 / (float) sk_avg_cnt); - - float avg_upper_thresh = 1 + ((n_std_devs) * one_sigma_idat); - float avg_lower_thresh = 1 - ((n_std_devs) * one_sigma_idat); - - if ((sk_avg_p0 > avg_upper_thresh) || (sk_avg_p0 < avg_lower_thresh) || - (sk_avg_p1 > avg_upper_thresh) || (sk_avg_p1 < avg_lower_thresh)) + for (unsigned ichan=0; ichan +#include +#include +#include + +#include +//#define _DEBUG 1 + +// TODO consider having schan / echan in mask represented by values other than 0, 1 + +using namespace std; + +void check_error (const char*); + +CUDA::SKDetectorEngine::SKDetectorEngine (dsp::Memory * memory) +{ + device_memory = dynamic_cast(memory); + stream = device_memory->get_stream(); + + estimates_host = new dsp::TimeSeries(); + zapmask_host = new dsp::BitSeries(); + + pinned_memory = new PinnedMemory (); + estimates_host->set_memory ((dsp::Memory *) pinned_memory); + zapmask_host->set_memory ((dsp::Memory *) pinned_memory); + + transfer_estimates = new dsp::TransferCUDA (stream); + transfer_estimates->set_kind 
(cudaMemcpyDeviceToHost); + transfer_estimates->set_output( estimates_host ); + + transfer_zapmask = new dsp::TransferBitSeriesCUDA (stream); + transfer_zapmask->set_kind (cudaMemcpyDeviceToHost); + transfer_zapmask->set_output( zapmask_host ); +} + +void CUDA::SKDetectorEngine::setup () +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKDetectorEngine::setup ()" << endl; + + // determine GPU capabilities + int device = 0; + cudaGetDevice(&device); + struct cudaDeviceProp device_properties; + cudaGetDeviceProperties (&device_properties, device); + max_threads_per_block = device_properties.maxThreadsPerBlock; +} + + +// faster kernel for npol=1 +__global__ void detect_one_pol (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower) +{ + unsigned idat = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idat < nval) + { + float V = indat[idat]; + if (V < lower || V > upper) + outdat[idat] = 1; + } +} + +__global__ void detect_two_pol (const float2 * indat, unsigned char * outdat, uint64_t nval, float upper, float lower) +{ + unsigned idat = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idat < nval) + { + const float2 V = indat[idat]; + if (V.x < lower || V.x > upper || V.y < lower || V.y > upper) + { + outdat[idat] = 1; + } + } +} + + +// detect SK limits for N polarisations +__global__ void detect_one_sample (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower, unsigned npol) +{ + unsigned idat = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (idat < nval) + { + unsigned zap = 0; + float V; + + for (int ipol=0; ipol upper) + { + zap = 1; + } + } + if (zap) + outdat[idat] = 1; + } +} + +void CUDA::SKDetectorEngine::detect_ft (const dsp::TimeSeries* input, + dsp::BitSeries* output, float upper_thresh, float lower_thresh) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKDetectorEngine::detect_ft()" << endl; + + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + const 
int64_t ndat = input->get_ndat(); + + const float * indat = input->get_dattfp(); // TFP + unsigned char * outdat = output->get_datptr(); // TFP also! + + uint64_t nval = nchan * ndat; + uint64_t nblocks = nval / max_threads_per_block; + if (nval % max_threads_per_block) + nblocks++; + + dim3 threads (max_threads_per_block); + dim3 blocks (nblocks); + + if (dsp::Operation::verbose) + { + cerr << "CUDA::SKDetectorEngine::detect_ft nval=" << nval << " nblocks=" << nblocks << " max_threads_per_block=" << max_threads_per_block << endl; + cerr << "CUDA::SKDetectorEngine::detect_ft thresholds [" << lower_thresh << " - " << upper_thresh << "]" << endl; + cerr << "CUDA::SKDetectorEngine::detect_ft npol=" << npol << endl; + } + + if (npol == 1) + detect_one_pol<<>> (indat, outdat, nval, upper_thresh, lower_thresh); + else if (npol == 2) + detect_two_pol<<>> ((const float2 *) indat, outdat, nval, upper_thresh, lower_thresh); + else + detect_one_sample<<>> (indat, outdat, nval, upper_thresh, lower_thresh, npol); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error( "CUDA::SKDetectorEngine::detect_ft detect_one_xxx" ); + +#ifdef _DEBUG + int sum = count_mask(output); + cerr << "CUDA::SKDetectorEngine::detect_ft sum now " << sum << endl; +#endif +} + +// each block reads 1 time sample, all channels/pols +// then do a block-wide sum + +// input data are stored TFP, 1 warp per time sample, 32 warps / block to sum across channels +__global__ void reduce_sum_fscr_1pol (const float * input, unsigned char * out, + const unsigned nchan, float lower, float upper, + unsigned schan, unsigned echan) +{ + extern __shared__ float sdata[]; + + unsigned idat = blockIdx.x; + const float * in = input + (idat * nchan); + + float sum = 0; + for (unsigned ichan=threadIdx.x; ichan= schan && ichan < echan) + sum += in[ichan]; + } + + sdata[threadIdx.x] = sum; + __syncthreads(); + + // now do a block wide sum across all threads + int last_offset = blockDim.x / 2 ; + for (int 
offset = last_offset; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + + __syncthreads(); + } + + if (threadIdx.x == 0) + { + float val = sdata[0] / float((echan - schan) + 1); + if (val < lower || val > upper) + out[idat] = 1; + } +} + +__global__ void reduce_sum_fscr_2pol (const float2 * input, unsigned char * out, + const unsigned nchan, float lower, float upper, + unsigned schan, unsigned echan) +{ + extern __shared__ float2 sdata2[]; + + // idat = blockIdx.x + const float2 * in = input + (blockIdx.x * nchan); + + float2 sum = make_cuComplex(0,0); + for (unsigned ichan=threadIdx.x; ichan= schan && ichan < echan) + sum = cuCaddf(sum, in[ichan]); + } + + sdata2[threadIdx.x] = sum; + __syncthreads(); + + // now do a block wide sum across all threads + int last_offset = blockDim.x / 2; + for (int offset = last_offset; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + sdata2[threadIdx.x] = cuCaddf(sdata2[threadIdx.x], sdata2[threadIdx.x + offset]); + __syncthreads(); + } + + if (threadIdx.x == 0) + { + float nvalidchan = float((echan - schan) + 1); + float p0 = sdata2[0].x / nvalidchan; + float p1 = sdata2[0].y / nvalidchan; + + if (p0 < lower || p0 > upper || p1 < lower || p1 > upper) + out[blockIdx.x] = 1; + } +} + + +void CUDA::SKDetectorEngine::detect_fscr (const dsp::TimeSeries* input, dsp::BitSeries* output, const float lower, const float upper, unsigned schan, unsigned echan) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKDetectorEngine::detect_fscr()" << endl; + + const unsigned nchan = input->get_nchan(); + const unsigned npol = input->get_npol(); + const int64_t ndat = input->get_ndat(); + + const unsigned nblocks = ndat; + unsigned nthreads = max_threads_per_block; + if (nchan < nthreads) + nthreads = nchan; + const size_t shared_bytes = nthreads * npol * sizeof(float); + + // indat is the SK estimatesestimates + const float * indat = input->get_dattfp(); + + // outdat is the 
bitmask + unsigned char * outdat = output->get_datptr(); + + if (dsp::Operation::verbose) + { + cerr << "CUDA::SKDetectorEngine::detect_fscr nchan=" << nchan << " ndat=" << ndat << endl; + cerr << "CUDA::SKDetectorEngine::detect_fscr nblocks=" << nblocks << " nthreads=" << nthreads << " shared_bytes=" << shared_bytes << endl; + cerr << "CUDA::SKDetectorEngine::detect_fscr thresholds [" << lower << " - " << upper << "]" << endl; + } + + if (npol == 1) + reduce_sum_fscr_1pol<<>>(indat, outdat, nchan, lower, upper, schan, echan); + else + reduce_sum_fscr_2pol<<>>((float2*) indat, outdat, nchan, lower, upper, schan, echan); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error( "CUDA::SKDetectorEngine::detect_fscr_element" ); + +#ifdef _DEBUG + int sum = count_mask(output); + cerr << "CUDA::SKDetectorEngine::detect_fscr mask_sum=" << sum << endl; +#endif + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error( "CUDA::SKDetectorEngine::detect detect_fscr" ); +} + +__global__ void detect_tscr_element (const float * indat, unsigned char * outdat, uint64_t nval, float upper, float lower, unsigned npol, unsigned nchan) +{ + extern __shared__ char sk_tscr[]; + + unsigned int idat = (blockIdx.x * blockDim.x + threadIdx.x); + + if (idat < nval) + { + const unsigned nchanpol = nchan * npol; + const unsigned ichanpol = idat % nchanpol; + + // first nchan threads to fill shared mem with the tscr SK estimates for each chan & pol (TFP) + if (threadIdx.x < nchanpol) + { + sk_tscr[threadIdx.x] = (char) (indat[threadIdx.x] > upper || indat[threadIdx.x] < lower); + } + __syncthreads(); + + outdat[idat/npol] = sk_tscr[ichanpol]; + } +} + + +void CUDA::SKDetectorEngine::detect_tscr (const dsp::TimeSeries* input, + const dsp::TimeSeries* input_tscr, dsp::BitSeries* output, + float upper_thresh, float lower_thresh) +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKDetectorEngine::detect_tscr()" << endl; + const unsigned nchan = 
input->get_nchan(); + const unsigned npol = input->get_npol(); + const int64_t ndat = output->get_ndat(); + + // indat is the tscr mask [nchan vals] + const float * indat = input_tscr->get_dattfp(); + + // outdat is the bitmask + unsigned char * outdat = output->get_datptr(); + + // this kernel is indexed on output rather than input + const uint64_t nval = ndat * nchan; + uint64_t nblocks = nval / max_threads_per_block; + if (nval % max_threads_per_block) + nblocks++; + + dim3 threads (max_threads_per_block); + dim3 blocks (nblocks); + unsigned shared_bytes = nchan*npol*sizeof(char); + + if (dsp::Operation::verbose) + cerr << "CUDA::SKDetectorEngine::detect_tscr_element ndat=" << ndat + << " nchan=" << nchan << " nval=" << nval + << " max_threads=" << max_threads_per_block + << " nblocks=" << nblocks << endl; + + detect_tscr_element<<>>(indat, outdat, nval, upper_thresh, lower_thresh, npol, nchan); + + if (dsp::Operation::record_time || dsp::Operation::verbose) + check_error( "CUDA::SKDetectorEngine::detect_tscr_element" ); + +#ifdef _DEBUG + int sum = count_mask(output); + cerr << "CUDA::SKDetectorEngine::detect_tscr mask_sum=" << sum << endl; +#endif +} + + +void CUDA::SKDetectorEngine::reset_mask (dsp::BitSeries* output) +{ + unsigned nchan = output->get_nchan(); + int64_t ndat = output->get_ndat(); + unsigned char * outdat = output->get_datptr(); + + size_t nbytes = nchan * ndat; + + cudaError error = cudaMemsetAsync (outdat, 0, nbytes, stream); + if (error != cudaSuccess) + throw Error (FailedCall, "CUDA::SKDetectorEngine::reset_mask ", + "cudaMemset (%p, 0, %u): %s", outdat, nbytes, + cudaGetErrorString (error)); +#ifdef _DEBUG + int sum = count_mask(output); + cerr << "CUDA::SKDetectorEngine::reset_mask sum now " << sum << endl; +#endif +} + +int CUDA::SKDetectorEngine::count_mask (const dsp::BitSeries* output) +{ + unsigned char * outdat = const_cast(output->get_datptr()); + const unsigned nchan = output->get_nchan(); + const int64_t ndat = 
output->get_ndat(); + int sum = 0; +/* + const uint64_t nval = (uint64_t) ndat * nchan; + cudaStreamSynchronize(stream); + thrust::device_ptr d = thrust::device_pointer_cast(outdat); + int sum = thrust::reduce(thrust::cuda::par.on(stream), d, d+nval, (int) 0, thrust::plus()); + cudaStreamSynchronize(stream); +*/ + + return sum; +} + +float * CUDA::SKDetectorEngine::get_estimates (const dsp::TimeSeries * input) +{ + transfer_estimates->set_input (input); + transfer_estimates->operate (); + cudaStreamSynchronize (stream); + return estimates_host->get_dattfp(); +} + +unsigned char * CUDA::SKDetectorEngine::get_zapmask (const dsp::BitSeries * input) +{ + transfer_zapmask->set_input (input); + transfer_zapmask->operate (); + cudaStreamSynchronize (stream); + return zapmask_host->get_datptr(); +} + diff -Nru bl-dspsr-0+git20160405/Signal/General/SKFilterbank.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbank.C --- bl-dspsr-0+git20160405/Signal/General/SKFilterbank.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKFilterbank.C 2018-03-12 23:02:35.000000000 +0000 @@ -63,6 +63,13 @@ void dsp::SKFilterbank::custom_prepare () { output->set_order( TimeSeries::OrderTFP ); + if (engine) + { + cerr << "dsp::SKFilterbank::custom_prepare engine->setup()" << endl; + engine->setup(); + cerr << "dsp::SKFilterbank::custom_prepare engine->prepare (input, " << nsamp_fft << ")" << endl; + engine->prepare (input, nsamp_fft); + } } /* @@ -71,7 +78,6 @@ */ uint64_t dsp::SKFilterbank::get_skfb_inc (uint64_t blocksize) { - if (verbose) cerr << "dsp::SKFilterbank::get_skfb_inc M=" << tscrunch << " nsamp_fft=" << nsamp_fft << " blocksize=" << blocksize << endl; @@ -139,101 +145,110 @@ output_tscr->set_npol(npol); output_tscr->set_ndim(1); output_tscr->resize(1); - - if (verbose) - cerr << "dsp::SKFilterbank::filterbank S?_tscr.resize(" << nchan*npol << ")" << endl; - S1_tscr.resize(nchan * npol); - S2_tscr.resize(nchan * npol); } - // 
initialise tscr - if (output_tscr) + if (engine) + { + engine->perform (input, output, output_tscr); + } + else { - for (unsigned i=0; i output_tscr->get_nchan()) + { + if (verbose) + cerr << "dsp::SKFilterbank::filterbank S?_tscr.resize(" << nchan*npol << ")" << endl; + S1_tscr.resize(nchan * npol); + S2_tscr.resize(nchan * npol); + } + + // initialise tscr + if (output_tscr) { - S1_tscr[i]=0; - S2_tscr[i]=0; + for (unsigned i=0; iget_dattfp(); + cerr << "dsp::SKFilterbank::filterbank calculating tscrunch SK estimates" << endl; - if (debugd < 1) - cerr << "dsp::SKFilterbank::filterbank tscr M=" << M <<" M_fac=" << M_fac << endl; - for (unsigned ichan=0; ichanget_dattfp(); + + if (debugd < 1) + cerr << "dsp::SKFilterbank::filterbank tscr M=" << M <<" M_fac=" << M_fac << endl; + for (unsigned ichan=0; ichan +#include +#include + +//#define _DEBUG 1 + +using namespace std; + +void check_error_stream (const char*, cudaStream_t); + +/* Perform a reduction including SQLD calculations */ +__global__ void reduce_sqld (cufftComplex* input, cufftComplex* output, float * skout, unsigned nchan, unsigned npol, unsigned M) +{ + // each block is a tsrunch, threads are channels + + // increment input and output pointer + input += (blockIdx.x * nchan * M); + output += (blockIdx.x * nchan); + skout += (blockIdx.x * nchan); + + const float M_fac = (M+1) / (M-1); + + cufftComplex val; + for (unsigned ichan=threadIdx.x; ichan(_memory); + stream = memory->get_stream(); + tscrunch = _tscrunch; + + cufftResult result = cufftCreate (&plan); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::SKFilterbankEngine::SKFilterbankEngine", + "cufftCreate(plan)"); + npt = 0; +} + +CUDA::SKFilterbankEngine::~SKFilterbankEngine () +{ +} + +void CUDA::SKFilterbankEngine::setup () +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SKFilterbankEngine::setup ()" << endl; + + // determine GPU capabilities + int device = 0; + cudaGetDevice(&device); + struct cudaDeviceProp device_properties; 
+ cudaGetDeviceProperties (&device_properties, device); + max_threads_per_block = device_properties.maxThreadsPerBlock; +} + +void CUDA::SKFilterbankEngine::prepare (const dsp::TimeSeries * input, unsigned _npt) +{ + // real or complex input + cufftType type = CUFFT_C2C; + if (input->get_state() == Signal::Nyquist) + type = CUFFT_R2C; + + npt = _npt; + + unsigned ndim = input->get_ndim(); + uint64_t ndat = input->get_ndat(); + unsigned nbatch = (ndat / npt); + + // 1D transform + int rank = 1; + int inembed[1] = { npt }; + int onembed[1] = { npt / ndim }; + + // distance between successive elements + int istride = 1; + int ostride = 1; + + // distance between sucessive batches + int idist = npt; + int odist = npt / ndim; + nchan = odist; + + size_t work_size; + + cufftResult result = cufftMakePlanMany (plan, rank, &npt, + inembed, istride, idist, + onembed, ostride, odist, + type, nbatch, &work_size); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::SKFilterbankEngine::prepare", + "cufftMakePlanMany(plan)"); + + result = cufftSetStream (plan, stream); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::SKFilterbankEngine::prepare", + "cufftSetStream(plan)"); + + size_t bytes_required = nbatch * nchan * sizeof (cufftComplex); + if (bytes_required > buffer_size) + { + if (buffer) + memory->do_free (buffer); + buffer = memory->do_allocate (bytes_required); + buffer_size = bytes_required; + } + + bytes_required = (nbatch / tscrunch) * nchan * sizeof(cufftComplex); + if (bytes_required > sums_size) + { + if (sums) + memory->do_free (sums); + sums = memory->do_allocate (bytes_required); // allocate the new size; sums_size still holds the old (too small) value here + sums_size = bytes_required; + } +} + +void CUDA::SKFilterbankEngine::perform (const dsp::TimeSeries* input, + dsp::TimeSeries* output, + dsp::TimeSeries* output_tscr) +{ + if (dsp::Operation::verbose) + std::cerr << "CUDA::SKFilterbankEngine::perform()" << std::endl; + + uint64_t ndat = input->get_ndat(); + unsigned npol = input->get_npol (); + unsigned npart = (unsigned) (ndat / npt); + + if (input->get_order() != dsp::TimeSeries::OrderFPT) + throw Error(InvalidState,
"CUDA::SKFilterbankEngine::perform", + "Only OrderFPT input order is supported"); + + // TODO decide what to do about multi-input channel data + + // adjust FFT plan if required, TODO work on how npt is passed + if (npart != nbatch) + prepare (input, npt); + + // FFT output buffer from batched FFT + cufftComplex * buf = (cufftComplex *) buffer; + + unsigned input_nchan = input->get_nchan (); + if (dsp::Operation::verbose) + std::cerr << "CUDA::SKFilterbankEngine::perform ndat=" << ndat + << " input_nchan=" << input_nchan << " output_nchan=" << nchan + << " npol=" << npol << " tscrunch=" << tscrunch << std::endl; + + for (unsigned ipol=0; ipolget_datptr (0, ipol); + + // output SK estimates at (1/M) time sampling + float * out = (float *) output->get_dattfp(); + + // output SK estimates at block resolution + float * out_tscr = (float *) output_tscr->get_dattfp(); + + // batch FFT all the input data + if (type == CUFFT_R2C) + fft_real ((cufftReal *) in, buf); + else + fft_complex ((cufftComplex *)in, buf); + + // specta now exist in out in TF format + int nthread = nchan; + int nblocks = nbatch; + + // convert the spectra into tscrunched S1 and S2 sums in Re and Im + reduce_sqld<<>> (buf, (cufftComplex *) sums, out + ipol, nchan, npol, tscrunch); + + // compute a tscrunched output SK + reduce_sk_estimate<<<1,nthread,0,stream>>>((cufftComplex *) sums, out_tscr + ipol, nchan, npol, npart, tscrunch); + } + + check_error_stream("CUDA::SKFilterBank::perform", stream); + +} + +void CUDA::SKFilterbankEngine::fft_real (cufftReal *in, cufftComplex * out) +{ + cufftResult result = cufftExecR2C (plan, in, out); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, "CUDA::SKFilterbankEngine::fft_real", + "cufftExecR2C(plan)"); +} + +void CUDA::SKFilterbankEngine::fft_complex (cufftComplex *in, cufftComplex * out) +{ + cufftResult result = cufftExecC2C (plan, in, out, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) + throw CUFFTError (result, 
"CUDA::SKFilterbankEngine::fft_complex", + "cufftExecC2C(plan)"); +} + diff -Nru bl-dspsr-0+git20160405/Signal/General/SKMasker.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMasker.C --- bl-dspsr-0+git20160405/Signal/General/SKMasker.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SKMasker.C 2018-03-12 23:02:35.000000000 +0000 @@ -63,6 +63,7 @@ const unsigned ddfb_nchan = input->get_nchan(); const uint64_t ddfb_ndat = input->get_ndat(); const uint64_t ddfb_npol = input->get_npol(); + const unsigned ddfb_ndim = input->get_ndim(); const unsigned mask_nchan = mask_input->get_nchan(); const unsigned mask_npol = mask_input->get_npol(); @@ -71,8 +72,6 @@ const uint64_t ddfb_input_sample = input->get_input_sample(); const uint64_t mask_input_sample = mask_input->get_input_sample(); - const unsigned output_ndim = output->get_ndim(); - if (mask_npol != 1) throw Error (InvalidParam, "dsp::SKMasker::transformation", "mask_npol != 1"); @@ -92,8 +91,13 @@ // indicate the output timeseries contains zeroed data output->set_zeroed_data (true); - // and resize the output to ensure the hits array is reallocated - output->resize (output->get_ndat()); + // resize the output to ensure the hits array is reallocated + if (engine) + { + if (verbose) + cerr << "dsp::SKMasker::transformation output->resize(" << output->get_ndat() << ")" << endl; + output->resize (output->get_ndat()); + } // get base pointer to mask bitseries unsigned char * mask = mask_input->get_datptr (); @@ -162,7 +166,11 @@ uint64_t ddfb_end_idat; if (engine) - engine->setup (ddfb_nchan, ddfb_npol, output->get_nfloat_span()); + { + if (verbose) + cerr << "dsp::SKMasker::transformation engine->setup()" << endl; + engine->setup (); + } for (uint64_t idat=0; idat < mask_ndat; idat++) { @@ -208,10 +216,10 @@ if (engine) { unsigned mask_offset = mask_nchan * mask_npol * idat; - unsigned offset = ddfb_start_idat*output_ndim; - unsigned end = ddfb_nsamples*output_ndim; + unsigned 
offset = ddfb_start_idat*ddfb_ndim; + unsigned end = ddfb_nsamples*ddfb_ndim; - engine->perform (mask_input, mask_offset, output, offset, end); + //engine->perform (mask_input, mask_offset, output, offset, end); } else { @@ -220,13 +228,11 @@ { if (mask[ichan]) { - float * zerop0 = output->get_datptr(ichan, 0) + (ddfb_start_idat*output_ndim); - float * zerop1 = output->get_datptr(ichan, 1) + (ddfb_start_idat*output_ndim); - - for (unsigned j=0; jget_datptr(ichan, ipol) + (ddfb_start_idat*ddfb_ndim); + for (unsigned j=0; j(memory); + stream = device_memory->get_stream(); } -void CUDA::SKMaskerEngine::setup (unsigned _nchan, unsigned _npol, unsigned _span) +void CUDA::SKMaskerEngine::setup () { - if (dsp::Operation::verbose) - cerr << "CUDA::SKMaskerEngine::setup nchan=" << _nchan << " npol=" << _npol - << " span=" << _span << endl; - - nchan = _nchan; - npol = _npol; - span = _span; - // determine GPU capabilities int device = 0; cudaGetDevice(&device); @@ -38,88 +31,102 @@ max_threads_per_block = device_properties.maxThreadsPerBlock; } - -/* cuda kernel to mask 1 channel for both polarisations */ -__global__ void mask1chan (unsigned char * mask_base, - float * out_base, - unsigned npol, - unsigned end, - unsigned span) -{ - // ichan = blockIdx.x * blockDim.x + threadIdx.x - - float * p0 = out_base + span * npol * (blockIdx.x * blockDim.x + threadIdx.x); - float * p1 = out_base + span * npol * (blockIdx.x * blockDim.x + threadIdx.x) + span; - - mask_base += (blockIdx.x * blockDim.x + threadIdx.x); - - if (mask_base[0]) - { - for (unsigned j=0; j= ndat) + return; - __shared__ char mask; + const unsigned ichan = blockIdx.y; + const unsigned imask = idat / M; - if (threadIdx.x == 0) - mask = mask_base[ichan]; + // load the mask + const unsigned char mask = mask_base[imask * gridDim.y + ichan]; - __syncthreads(); + // forward pointer to pol0 for this chan + out_base += ichan * npol * out_stride; + in_base += ichan * npol * in_stride; - // zap if mask - if (mask) - { - 
int idat = threadIdx.x; - int out_offset = (span * npol * ichan) + idat; - out_base[out_offset] = 0; // p0 - out_base[out_offset +span ] = 0; // p1 + for (unsigned ipol=0; ipol +#include +#include + +using namespace std; + +dsp::SpectralKurtosis::SpectralKurtosis() : Transformation("SpectralKurtosis", outofplace) +{ + M = 128; + debugd = 1; + + estimates = new TimeSeries; + estimates_tscr = new TimeSeries; + zapmask = new BitSeries; + + // SK Detector + std_devs = 3; + channels.resize(2); + npart_total = 0; + thresholds.resize(2); + thresholds_tscr.resize(2); + zap_counts.resize(4); + detection_flags.resize(3); + std::fill (detection_flags.begin(), detection_flags.end(), false); + detection_flags.resize(3); + M_tscr = 0; + + unfiltered_hits = 0; + + prepared = false; + + set_buffering_policy(new InputBuffering(this)); +} + +dsp::SpectralKurtosis::~SpectralKurtosis () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::SpectralKurtosis~" << endl; + + float percent_all = 0; + float percent_skfb = 0; + float percent_tscr = 0; + float percent_fscr = 0; + + if (npart_total) + { + percent_all = (100 * (float) zap_counts[ZAP_ALL] / (float) npart_total); + percent_skfb = (100 * (float) zap_counts[ZAP_SKFB] / (float) npart_total); + percent_tscr = (100 * (float) zap_counts[ZAP_TSCR] / (float) npart_total); + percent_fscr = (100 * (float) zap_counts[ZAP_FSCR] / (float) npart_total); + } + + cerr << "Zapped: " + << " total=" << percent_all << "\%" << " skfb=" << percent_skfb << "\%" + << " tscr=" << percent_tscr << "\%" << " fscr=" << percent_fscr << "\%" + << endl; + + delete estimates; + delete estimates_tscr; + delete zapmask; +} + +bool dsp::SpectralKurtosis::get_order_supported (TimeSeries::Order order) const +{ + // true only for FPT and TFP; every path returns a value (the old code fell off the end otherwise) + return (order == TimeSeries::OrderFPT || order == TimeSeries::OrderTFP); +} + + +void dsp::SpectralKurtosis::set_engine (Engine* _engine) +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::set_engine()" << endl; + engine = _engine; +} + + +/* + *
These are preparations that could be performed once at the start of + * the data processing + */ +void dsp::SpectralKurtosis::prepare () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::prepare()" << endl; + + nchan = input->get_nchan(); + npol = input->get_npol(); + ndim = input->get_ndim(); + + Memory * memory = const_cast(input->get_memory()); + estimates->set_memory (memory); + estimates_tscr->set_memory (memory); + zapmask->set_memory (memory); + + if (has_buffering_policy()) + { + get_buffering_policy()->set_minimum_samples (M); + } + + if (engine) + { + engine->setup (); + } + else + { + if (!detection_flags[1]) + { + S1_tscr.resize(nchan * npol); + S2_tscr.resize(nchan * npol); + } + } + + // ensure output containers are configured correctly + prepare_output (); + + prepared = true; +} + +/*! ensure output parameters are configured correctly */ +void dsp::SpectralKurtosis::prepare_output () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::prepare_output()" << endl; + + double mask_rate = input->get_rate() / M; + + estimates->copy_configuration (get_input()); + estimates->set_ndim (1); // SK estimates have only single dimension + estimates->set_order (TimeSeries::OrderTFP); // stored in TFP order + estimates->set_scale (1.0); // no scaling + estimates->set_rate (mask_rate); // rate is /= M + + if (input->get_npol() == 2) + estimates->set_state (Signal::PPQQ); + else + estimates->set_state (Signal::Intensity); + + double tscrunch_mask_rate = mask_rate; + if (npart > 0) + tscrunch_mask_rate /= npart; + + // tscrunched estimates have same configuration, except number of samples + estimates_tscr->copy_configuration (estimates); + estimates_tscr->set_order (TimeSeries::OrderTFP); // stored in TFP order + estimates_tscr->set_rate (tscrunch_mask_rate); + + // zap mask has same configuration as estimates with following changes + zapmask->copy_configuration (estimates); + zapmask->set_nbit (8); + zapmask->set_npol (1); + + // configure output timeseries 
(out-of-place) to match input + output->copy_configuration (get_input()); + output->set_input_sample (input->get_input_sample ()); +} + +/* ensure containers have correct dynamic size */ +void dsp::SpectralKurtosis::reserve () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::reserve()" << endl; + + const uint64_t ndat = input->get_ndat(); + npart = ndat / M; + output_ndat = npart * M; + + if (verbose) + cerr << "dsp::SpectralKurtosis::reserve input_ndat=" << ndat + << " npart=" << npart << " output_ndat=" << output_ndat << endl; + + // use resize since out of place operation + estimates->resize (npart); + estimates_tscr->resize (npart > 0); // 1 if npart != 0 + zapmask->resize (npart); + output->resize (output_ndat); +} + +/* call set of transformations */ +void dsp::SpectralKurtosis::transformation () +{ + if (!prepared) + prepare(); + + const uint64_t ndat = input->get_ndat(); + if (verbose || debugd < 1) + cerr << "dsp::SpectralKurtosis::transformation input ndat=" << ndat + << " tscrunch=" << M << endl; + + npart = ndat / M; + output_ndat = npart * M; + + if (verbose || debugd < 1) + cerr << "dsp::SpectralKurtosis::transformation input npart=" << npart + << " output_ndat=" << output_ndat << endl; + + if (has_buffering_policy()) + { + if (verbose || debugd < 1) + cerr << "dsp::SpectralKurtosis::transformation setting next_start_sample=" + << output_ndat << endl; + get_buffering_policy()->set_next_start (output_ndat); + } + + prepare_output (); + + // ensure output containers are sized correctly + reserve (); + + if ((ndat == 0) || (npart == 0)) + return; + + // perform SK functions + compute (); + detect (); + mask (); + //insertsk(); +} + +void dsp::SpectralKurtosis::compute () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::compute" << endl; + + if (engine) + { + engine->compute (input, estimates, estimates_tscr, M); + } + else + { + // initialise tscr + if (!detection_flags[1]) + { + std::fill(S1_tscr.begin(), S1_tscr.end(), 0); + 
std::fill(S2_tscr.begin(), S2_tscr.end(), 0); + } + + float S1_sum, S2_sum; + const float M_fac = (M+1) / (M-1); + float * outdat = estimates->get_dattfp(); + + switch (input->get_order()) + { + case dsp::TimeSeries::OrderTFP: + { + const unsigned int chan_stride = nchan * npol * ndim; + float * indat; + + for (unsigned ipart=0; ipart < npart; ipart++) + { + indat = (float *) input->get_dattfp() + (M * ipart * chan_stride); + + for (unsigned ichan=0; ichanget_datptr (ichan, ipol) + ipart * nfloat; + + S1_sum = 0; + S2_sum = 0; + + // Square Law Detect for S1 + S2 + for (unsigned i=0; iget_dattfp(); + if (verbose || debugd < 1) + cerr << "dsp::SpectralKurtosis::compute tscr M=" << M_t <<" M_fac=" << M_fac << endl; + for (unsigned ichan=0; ichanget_dattfp(); + unsigned char * outdat = 0; + unsigned zap_chan; + float V; + + if (npart && (M_tscr != M * npart)) + { + M_tscr = (float) (M * npart); + + if (verbose) + cerr << "dsp::SpectralKurtosis::detect_tscr SKlimits(" << M_tscr << ", " << std_devs << ")" << endl; + + dsp::SKLimits limits(M_tscr, std_devs); + limits.calc_limits(); + + thresholds_tscr[0] = (float) limits.get_lower_threshold(); + thresholds_tscr[1] = (float) limits.get_upper_threshold(); + + if (verbose) + cerr << "dsp::SpectralKurtosis::detect_tscr M=" << M_tscr << " std_devs=" + << std_devs << " [" << thresholds_tscr[0] << " - " << thresholds_tscr[1] + << "]" << endl; + } + + if (engine) + { + engine->detect_tscr (estimates, estimates_tscr, zapmask, thresholds_tscr[1], thresholds_tscr[0]); + return; + } + + for (uint64_t ichan=channels[0]; ichan < channels[1]; ichan++) + { + zap_chan = 0; + for (unsigned ipol=0; ipol < npol; ipol++) + { + V = indat[ichan*npol + ipol]; + if (V > thresholds_tscr[1] || V < thresholds_tscr[0]) + zap_chan = 1; + } + + if (zap_chan) + { + if (verbose) + cerr << "dsp::SpectralKurtosis::detect_tscr zap V=" << V << ", " + << "ichan=" << ichan << endl; + outdat = zapmask->get_datptr(); + for (unsigned ipart=0; ipart < npart; 
ipart++) + { + outdat[ichan] = 1; + zap_counts[ZAP_TSCR]++; + outdat += nchan; + } + } + } +} + +void dsp::SpectralKurtosis::detect_skfb () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::detect_skfb(" << npart << ")" << endl; + + if (engine) + { + engine->detect_ft (estimates, zapmask, thresholds[1], thresholds[0]); + return; + } + + const float * indat = estimates->get_dattfp(); + unsigned char * outdat = zapmask->get_datptr(); + float V = 0; + char zap; + + // compare SK estimator for each pol to expected values + for (uint64_t ipart=0; ipart < npart; ipart++) + { + // for each channel and pol in the SKFB + for (unsigned ichan=0; ichan < nchan; ichan++) + { + zap = 0; + for (unsigned ipol=0; ipol < npol; ipol++) + { + V = indat[npol*ichan + ipol]; + if (V > thresholds[1] || V < thresholds[0]) + { + zap = 1; + } + } + if (zap) + { + outdat[ichan] = 1; + + // only count skfb zapped channels in the in-band region [channels[0], channels[1]) + if (ichan >= channels[0] && ichan < channels[1]) + zap_counts[ZAP_SKFB]++; + } + } + + indat += nchan * npol; + outdat += nchan; + } +} + +void dsp::SpectralKurtosis::reset_mask () +{ + if (engine) + { + engine->reset_mask (zapmask); + return; + } + + unsigned char * outdat = zapmask->get_datptr(); + + for (unsigned ichan=0; ichan < nchan; ichan++) + { + for (uint64_t ipart=0; ipart < npart; ipart++) + { + outdat[(ipart*nchan) + ichan] = 0; + } + } +} + +void dsp::SpectralKurtosis::count_zapped () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::count_zapped hits=" << unfiltered_hits << endl; + + int zapped = 0; + + const float * indat; + unsigned char * outdat; + + if (engine) + { + int zapped = engine->count_mask (zapmask); + indat = engine->get_estimates (estimates); + outdat = engine->get_zapmask(zapmask); + zap_counts[ZAP_ALL] += zapped; + } + else + { + indat = estimates->get_dattfp(); + outdat = zapmask->get_datptr(); + } + + assert (npart == estimates->get_ndat()); + if (unfiltered_hits == 0) + { + filtered_sum.resize (npol * nchan); +
std::fill (filtered_sum.begin(), filtered_sum.end(), 0); + + filtered_hits.resize (nchan); + std::fill (filtered_hits.begin(), filtered_hits.end(), 0); + + unfiltered_sum.resize (npol * nchan); + std::fill (unfiltered_sum.begin(), unfiltered_sum.end(), 0); + } + + for (uint64_t ipart=0; ipart < npart; ipart++) + { + unfiltered_hits ++; + + for (unsigned ichan=channels[0]; ichan < channels[1]; ichan++) + { + uint64_t index = (ipart*nchan + ichan) * npol; + unsigned outdex = ichan * npol; + + unfiltered_sum[outdex] += indat[index]; + if (npol == 2) + unfiltered_sum[outdex+1] += indat[index+1]; + + if (outdat[(ipart*nchan) + ichan] == 1) + { + zap_counts[ZAP_ALL] ++; + continue; + } + + filtered_sum[outdex] += indat[index]; + if (npol == 2) + filtered_sum[outdex+1] += indat[index+1]; + + filtered_hits[ichan] ++; + } + } +} + +void dsp::SpectralKurtosis::detect_fscr () +{ + if (verbose) + cerr << "dsp::SpectralKurtosis::detect_fscr()" << endl; + + float _M = (float) M; + float mu2 = (4 * _M * _M) / ((_M-1) * (_M + 2) * (_M + 3)); + + if (engine) + { + float one_sigma_idat = sqrt(mu2 / (float) nchan); + const float upper = 1 + ((1+std_devs) * one_sigma_idat); + const float lower = 1 - ((1+std_devs) * one_sigma_idat); + engine->detect_fscr (estimates, zapmask, lower, upper, channels[0], channels[1]); + return; + } + + const uint64_t ndat = estimates->get_ndat(); + + const float * indat = estimates->get_dattfp(); + unsigned char * outdat = zapmask->get_datptr(); + + float sk_avg; + unsigned sk_avg_cnt = 0; + + unsigned zap_ipart; + uint64_t nzap = 0; + + // foreach SK integration + for (uint64_t ipart=0; ipart < npart; ipart++) + { + zap_ipart = 0; + for (unsigned ipol=0; ipol < npol; ipol++) + { + sk_avg = 0; + sk_avg_cnt = 0; + + for (unsigned ichan=channels[0]; ichan < channels[1]; ichan++) + { + if (outdat[ichan] == 0) + { + sk_avg += indat[ichan*npol + ipol]; + sk_avg_cnt++; + } + } + + if (sk_avg_cnt > 0) + { + sk_avg /= (float) sk_avg_cnt; + + float one_sigma_idat 
= sqrt(mu2 / (float) sk_avg_cnt); + float avg_upper_thresh = 1 + ((1+std_devs) * one_sigma_idat); + float avg_lower_thresh = 1 - ((1+std_devs) * one_sigma_idat); + if ((sk_avg > avg_upper_thresh) || (sk_avg < avg_lower_thresh)) + { + if (verbose) + cerr << "Zapping ipart=" << ipart << " ipol=" << ipol << " sk_avg=" << sk_avg + << " [" << avg_lower_thresh << " - " << avg_upper_thresh + << "] cnt=" << sk_avg_cnt << endl; + zap_ipart = 1; + } + } + } + + if (zap_ipart) + { + for (unsigned ichan=0; ichanresize(" << output->get_ndat() << ")" << endl; + output->resize (output->get_ndat()); + } + + // get base pointer to mask bitseries + unsigned char * mask = zapmask->get_datptr (); + + if (engine) + { + if (verbose) + cerr << "dsp::SpectralKurtosis::transformation engine->setup(" << nchan << ")" << endl; + engine->mask (zapmask, input, output, M); + } + else + { + // mask is a TFP ordered bit series, output is FTP order Timeseries + const unsigned nfloat = M * ndim; + for (unsigned ichan=0; ichan < nchan; ichan++) + { + for (unsigned ipol=0; ipol < npol; ipol++) + { + const float * indat = input->get_datptr(ichan, ipol); + float * outdat = output->get_datptr(ichan, ipol); + for (uint64_t ipart=0; ipart < npart; ipart++) + { + if (mask[ipart*nchan+ichan]) + { + for (unsigned j=0; jinsertsk (estimates, output, M); +} + diff -Nru bl-dspsr-0+git20160405/Signal/General/SpectralKurtosisCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosisCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/SpectralKurtosisCUDA.cu 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/SpectralKurtosisCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,104 @@ +//-*-C++-*- + +/*************************************************************************** + * + * Copyright (C) 2016 by Andre Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include 
"dsp/SpectralKurtosisCUDA.h" + +using namespace std; + +CUDA::SpectralKurtosisEngine::SpectralKurtosisEngine (dsp::Memory * memory) +{ + work_buffer_size = 0; + work_buffer = 0; + + device_memory = dynamic_cast(memory); + stream = device_memory->get_stream (); + + // sub-engines + computer = new CUDA::SKComputerEngine (memory); + detector = new CUDA::SKDetectorEngine (memory); + masker = new CUDA::SKMaskerEngine (memory); +} + +void CUDA::SpectralKurtosisEngine::setup () +{ + if (dsp::Operation::verbose) + cerr << "CUDA::SpectralKurtosisEngine::setup ()" << endl; + + // determine GPU capabilities + int device = 0; + cudaGetDevice(&device); + struct cudaDeviceProp device_properties; + cudaGetDeviceProperties (&device_properties, device); + max_threads_per_block = device_properties.maxThreadsPerBlock; + + computer->setup (); + detector->setup (); + masker->setup (); +} + +void CUDA::SpectralKurtosisEngine::compute ( const dsp::TimeSeries* input, + dsp::TimeSeries* output, dsp::TimeSeries *output_tscr, unsigned tscrunch) +{ + computer->compute (input, output, output_tscr, tscrunch); +} + +void CUDA::SpectralKurtosisEngine::detect_ft (const dsp::TimeSeries* input, + dsp::BitSeries* output, float upper_thresh, float lower_thresh) +{ + detector->detect_ft (input, output, upper_thresh, lower_thresh); +} + +void CUDA::SpectralKurtosisEngine::detect_fscr (const dsp::TimeSeries* input, + dsp::BitSeries* output, + const float lower, const float upper, + unsigned schan, unsigned echan) + +{ + detector->detect_fscr(input, output, lower, upper, schan, echan); // SKDetectorEngine::detect_fscr takes (lower, upper) in that order +} + +void CUDA::SpectralKurtosisEngine::detect_tscr (const dsp::TimeSeries* input, + const dsp::TimeSeries* input_tscr, dsp::BitSeries* output, + float upper_thresh, float lower_thresh) +{ + detector->detect_tscr( input, input_tscr, output, upper_thresh, lower_thresh); +} + +void CUDA::SpectralKurtosisEngine::reset_mask (dsp::BitSeries* output) +{ + detector->reset_mask(output); +} + +int
CUDA::SpectralKurtosisEngine::count_mask (const dsp::BitSeries* output) +{ + int nzapped = detector->count_mask (output); + return nzapped; +} + +float * CUDA::SpectralKurtosisEngine::get_estimates (const dsp::TimeSeries* estimates_device) +{ + return detector->get_estimates (estimates_device); +} + +unsigned char * CUDA::SpectralKurtosisEngine::get_zapmask (const dsp::BitSeries* zapmask_device) +{ + return detector->get_zapmask (zapmask_device); +} + +void CUDA::SpectralKurtosisEngine::mask (dsp::BitSeries* mask, const dsp::TimeSeries * input, + dsp::TimeSeries * output, unsigned M) +{ + masker->perform (mask, input, output, M); +} + +void CUDA::SpectralKurtosisEngine::insertsk (const dsp::TimeSeries* input, dsp::TimeSeries* out, unsigned M) +{ + computer->insertsk (input, out, M); +} + diff -Nru bl-dspsr-0+git20160405/Signal/General/stokes_detect.h bl-dspsr-0.0~git20180312.50ea209/Signal/General/stokes_detect.h --- bl-dspsr-0+git20160405/Signal/General/stokes_detect.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/stokes_detect.h 2018-03-12 23:02:35.000000000 +0000 @@ -4,10 +4,7 @@ * Licensed under the Academic Free License version 2.1 * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/General/stokes_detect.h,v $ - $Revision: 1.1 $ - $Date: 2006/10/15 19:09:24 $ - $Author: straten $ */ +// dspsr/Signal/General/stokes_detect.h #ifndef __stokes_detect_h #define __stokes_detect_h diff -Nru bl-dspsr-0+git20160405/Signal/General/TransferCUDA.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/TransferCUDA.C --- bl-dspsr-0+git20160405/Signal/General/TransferCUDA.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/TransferCUDA.C 2018-03-12 23:02:35.000000000 +0000 @@ -34,9 +34,14 @@ { cerr << "dsp::TransferCUDA::transformation input ndat=" << input->get_ndat() << " ndim=" << input->get_ndim(); - if (input->get_npol() > 1) - 
cerr << " span=" << input->get_datptr (0,1) - input->get_datptr(0,0); - cerr << " offset=" << input->get_datptr(0,0) - (float*)input->internal_get_buffer() << endl; + if (input->get_order() == TimeSeries::OrderFPT) + { + if (input->get_npol() > 1) + cerr << " span=" << input->get_datptr (0,1) - input->get_datptr(0,0); + cerr << " offset=" << input->get_datptr(0,0) - (float*)input->internal_get_buffer() << endl; + } + else + cerr << endl; } cudaError error; @@ -58,10 +63,14 @@ { cerr << "dsp::TransferCUDA::transformation output ndat=" << output->get_ndat() << " ndim=" << output->get_ndim(); - if (output->get_npol() > 1) - cerr << " span=" << output->get_datptr (0, 1) - output->get_datptr(0,0); - - cerr << " offset=" << output->get_datptr(0,0) - (float*)output->internal_get_buffer() << endl; + if (output->get_order() == TimeSeries::OrderFPT) + { + if (output->get_npol() > 1) + cerr << " span=" << output->get_datptr (0, 1) - output->get_datptr(0,0); + cerr << " offset=" << output->get_datptr(0,0) - (float*)output->internal_get_buffer() << endl; + } + else + cerr << endl; } } diff -Nru bl-dspsr-0+git20160405/Signal/General/TScrunchCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/General/TScrunchCUDA.cu --- bl-dspsr-0+git20160405/Signal/General/TScrunchCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/TScrunchCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -9,6 +9,7 @@ #include "dsp/TScrunchCUDA.h" +#include #include "Error.h" #include "debug.h" @@ -50,6 +51,70 @@ *out_base = result; } +__global__ void fpt_ndim2_ndim2_shm (float2* in_base, float2* out_base, + unsigned in_Fstride, unsigned in_Pstride, + unsigned out_Fstride, unsigned out_Pstride, + unsigned ndat_out, unsigned sfactor) +{ + // shared memory for coalesced reads + extern __shared__ cuFloatComplex shm[]; + + // blockIdx.y == channel index + // threadIdx.y == polarization index + unsigned ndat_in = ndat_out * sfactor; + + const unsigned block_offset = blockIdx.x * 
blockDim.x * sfactor; + + // X dimension is indexed on output samples. This is the input sample each thread will start to read + unsigned isamp_thr = block_offset + threadIdx.x; + + // offset into buffer = the index the first read sample for this block + in_base += (blockIdx.y*in_Fstride) + (threadIdx.y*in_Pstride) + block_offset; + + cuFloatComplex result = make_cuComplex(0,0); + unsigned isamp = threadIdx.x * sfactor; + unsigned esamp = isamp + sfactor; + unsigned shm_start = 0; + unsigned shm_end = blockDim.x; + + // ensure we don't overshoot the number of ndat + for (unsigned j=0; j= shm_start && isamp < shm_end && isamp < esamp) + { + //if (blockIdx.y == 0 && blockIdx.z == 0) + // printf ("[%d][%d] isamp=%u esamp=%u start=%u end=%u\n", blockIdx.x, threadIdx.x, isamp, esamp, shm_start, shm_end); + result = cuCaddf (result, shm[isamp-shm_start]); + isamp++; + } + + isamp_thr += blockDim.x; + shm_start += blockDim.x; + shm_end += blockDim.x; + } + + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= ndat_out) + return; + + //if (blockIdx.y == 0 && blockIdx.z == 0) + // printf ("[%d][%d] i=%u\n", blockIdx.x, threadIdx.x, i); + + out_base += (blockIdx.y*out_Fstride) + (threadIdx.y*out_Pstride) + i; + *out_base = result; +} + + void CUDA::TScrunchEngine::fpt_tscrunch(const dsp::TimeSeries *in, dsp::TimeSeries* out, unsigned sfactor) { @@ -85,21 +150,41 @@ throw Error (InvalidParam, "CUDA::TScrunchEngine::fpt_scrunch", "only out-of-place transformation implemented"); + if (in->get_ndat() == 0) + return; + uint64_t in_Fstride = (in->get_datptr(1)-in->get_datptr(0)) / 2; uint64_t in_Pstride = (in->get_datptr(0,1)-in->get_datptr(0,0)) / 2; uint64_t out_Fstride = (out->get_datptr(1)-out->get_datptr(0)) / 2; uint64_t out_Pstride = (out->get_datptr(0,1)-out->get_datptr(0,0)) / 2; // use a 2-dimensional thread block to eliminate 3rd grid dimension - dim3 threads (128, in->get_npol()); - dim3 blocks (out->get_ndat()/threads.x, in->get_nchan() ); +#define 
USE_SHARED +#ifdef USE_SHARED + // set number of threads to be number of output samples, cap at 512 + dim3 threads (512); + if (out->get_ndat() < 512) + threads.x = out->get_ndat(); + dim3 blocks (out->get_ndat()/threads.x, in->get_nchan(), in->get_npol()); if (out->get_ndat() % threads.x) blocks.x ++; + size_t shm_bytes = threads.x * sizeof(float2); + fpt_ndim2_ndim2_shm<<>> ( + (float2*)(in->get_datptr(0)), (float2*)(out->get_datptr(0)), + in_Fstride, in_Pstride, out_Fstride, out_Pstride, + out->get_ndat(), sfactor); +#else + dim3 threads (128, in->get_npol()); + dim3 blocks (out->get_ndat()/threads.x, in->get_nchan(), in->get_npol()); + if (out->get_ndat() % threads.x) + blocks.x ++; fpt_ndim2_ndim2<<>> ( (float2*)(in->get_datptr(0)), (float2*)(out->get_datptr(0)), in_Fstride, in_Pstride, out_Fstride, out_Pstride, out->get_ndat(), sfactor); +#endif + if (dsp::Operation::record_time || dsp::Operation::verbose) check_error ("CUDA::TScrunchEngine::fpt_scrunch"); diff -Nru bl-dspsr-0+git20160405/Signal/General/UnderSamplingBench.C bl-dspsr-0.0~git20180312.50ea209/Signal/General/UnderSamplingBench.C --- bl-dspsr-0+git20160405/Signal/General/UnderSamplingBench.C 1970-01-01 00:00:00.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/General/UnderSamplingBench.C 2018-03-12 23:02:35.000000000 +0000 @@ -0,0 +1,79 @@ +/*************************************************************************** + * + * Copyright (C) 2015 by Andrew Jameson + * Licensed under the Academic Free License version 2.1 + * + ***************************************************************************/ + +#include "dsp/UnderSamplingBench.h" +#include "debug.h" + +#include +#include + +using namespace std; + +bool dsp::UnderSamplingBench::verbose = false; + +dsp::UnderSamplingBench::UnderSamplingBench (const std::string& name) +{ + library = name; + nchan = 0; +} + +//! 
Set the number of channels +void dsp::UnderSamplingBench::set_nchan (unsigned _chan) +{ + if (_chan != nchan) + reset (); + + nchan = _chan; +} + +void dsp::UnderSamplingBench::load () const +{ + max_nfft = 0; + + string filename = path + "/filterbank_bench_" + library + ".dat"; + + if (verbose) + cerr << "dsp::UnderSamplingBench::load filename=" << filename << endl; + + load (library, filename); + loaded = true; +} + +void dsp::UnderSamplingBench::load (const std::string& library, + const std::string& filename) const +{ + ifstream in (filename.c_str()); + if (!in) + throw Error (FailedSys, "dsp::UnderSamplingBench::load", + "std::ifstream (" + filename + ")"); + + while (!in.eof()) + { + Entry entry; + double log2nchan, log2nfft, mflops; + unsigned _chan; + + in >> _chan >> entry.nfft >> entry.cost >> log2nchan >> log2nfft >> mflops; + + if (in.eof()) + continue; + + entry.library = library; + + DEBUG(library << " " << _chan << " " << entry.nfft << " " << entry.cost); + + if (_chan != nchan) + continue; + + DEBUG("ADD nchan=" << nchan << " nfft=" << entry.nfft); + entries.push_back (entry); + + if (entry.nfft > max_nfft) + max_nfft = entry.nfft; + } +} + diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/ArchiverExtensions.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/ArchiverExtensions.C --- bl-dspsr-0+git20160405/Signal/Pulsar/ArchiverExtensions.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/ArchiverExtensions.C 2018-03-12 23:02:35.000000000 +0000 @@ -16,7 +16,7 @@ #include "dsp/Convolution.h" #include "dsp/Dedispersion.h" #include "dsp/TScrunch.h" -#include "dsp/SKDetector.h" +#include "dsp/SpectralKurtosis.h" #include "dsp/OperationThread.h" #include "Pulsar/dspReduction.h" @@ -195,10 +195,10 @@ set_coherent_dedispersion (input->get_state(), response); if (input->get_state() == Signal::Nyquist) - { - nsamp_fft *= 2; - nsamp_overlap_pos *= 2; - nsamp_overlap_neg *= 2; + { + nsamp_fft *= 2; + nsamp_overlap_pos *= 2; + 
nsamp_overlap_neg *= 2; } dspR->set_nsamp_fft ( nsamp_fft ); @@ -208,7 +208,7 @@ // save it for the Passband Extension if ( convolution->has_passband() ) - passband = convolution->get_passband(); + passband = convolution->get_passband(); } // //////////////////////////////////////////////////////////////////// @@ -224,52 +224,52 @@ // // Spectral Kurtosis RFI mitigation extension // - SKDetector* skdetect = dynamic_cast( operation ); + SpectralKurtosis* skestimator = dynamic_cast( operation ); - if (skdetect) + if (skestimator) { if (verbose > 2) - cerr << "dsp::Archiver::set SKDetector in use" << endl; + cerr << "dsp::Archiver::set SpectralKurtosis in use" << endl; unsigned nsubint = archive->get_nsubint(); Integration* subint = archive->get_Integration(nsubint - 1); - SpectralKurtosis* ext = subint -> getadd(); + Pulsar::SpectralKurtosis* ext = subint -> getadd(); - unsigned nchan = skdetect->get_input()->get_nchan(); + unsigned nchan = skestimator->get_input()->get_nchan(); ext->set_nchan( nchan ); - unsigned npol = skdetect->get_input()->get_npol(); + unsigned npol = skestimator->get_input()->get_npol(); ext->set_npol( npol ); - ext->set_M( skdetect->get_M() ); - ext->set_excision_threshold( skdetect->get_excision_threshold() ); + ext->set_M( skestimator->get_M() ); + ext->set_excision_threshold( skestimator->get_excision_threshold() ); vector data; - skdetect->get_filtered_sum (data); + skestimator->get_filtered_sum (data); for (unsigned ichan = 0; ichan < nchan; ichan++) - for (unsigned ipol = 0; ipol < npol; ipol++) - ext->set_filtered_sum (ichan, ipol, data[ichan*npol + ipol]); + for (unsigned ipol = 0; ipol < npol; ipol++) + ext->set_filtered_sum (ichan, ipol, data[ichan*npol + ipol]); vector hits; - skdetect->get_filtered_hits (hits); + skestimator->get_filtered_hits (hits); for (unsigned ichan = 0; ichan < nchan; ichan++) - ext->set_filtered_hits (ichan, hits[ichan]); + ext->set_filtered_hits (ichan, hits[ichan]); - skdetect->get_unfiltered_sum (data); + 
skestimator->get_unfiltered_sum (data); for (unsigned ichan = 0; ichan < nchan; ichan++) - for (unsigned ipol = 0; ipol < npol; ipol++) - ext->set_unfiltered_sum (ichan, ipol, data[ichan*npol + ipol]); + for (unsigned ipol = 0; ipol < npol; ipol++) + ext->set_unfiltered_sum (ichan, ipol, data[ichan*npol + ipol]); - ext->set_unfiltered_hits( skdetect->get_unfiltered_hits() ); + ext->set_unfiltered_hits( skestimator->get_unfiltered_hits() ); - skdetect->reset_count(); + skestimator->reset_count(); } } void dsp::Archiver::set_coherent_dedispersion (Signal::State state, - const Response* response) + const Response* response) { if (verbose > 2) cerr << "dsp::Archiver::set_coherent_dedispersion" << endl; @@ -320,7 +320,7 @@ for (unsigned ichan_output=0; ichan_outputfrequency_output[ichan_total] ); output.set_bandwidth( dedisp->bandwidth_output[ichan_total] ); @@ -343,7 +343,7 @@ { if (verbose > 2) cerr << "dsp::Archiver::set Pulsar::TwoBitStats no ExcisionUnpacker" - << endl; + << endl; return; } @@ -460,7 +460,7 @@ if (passband->get_ndim() != 1) throw Error (InvalidState, "dsp::Archiver::set_passband", - "Passband Response ndim != 1"); + "Passband Response ndim != 1"); for (unsigned ipol=0; ipol passband; //! Optional SK filterbank - Reference::To skfilterbank; + // Reference::To skfilterbank; + + //! Optional Spectral Kurtosis (for convolution) + Reference::To skestimator; //! 
Optional SK Resizer Reference::To skresize; diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldConfig.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldConfig.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldConfig.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldConfig.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/LoadToFoldConfig.h,v $ - $Revision: 1.43 $ - $Date: 2011/09/20 21:25:28 $ - $Author: straten $ */ +// dspsr/Signal/Pulsar/dsp/LoadToFoldConfig.h #ifndef __baseband_dsp_LoadToFoldConfig_h #define __baseband_dsp_LoadToFoldConfig_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldN.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldN.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/LoadToFoldN.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/LoadToFoldN.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/LoadToFoldN.h,v $ - $Revision: 1.14 $ - $Date: 2011/09/09 02:38:14 $ - $Author: straten $ */ +// dspsr/Signal/Pulsar/dsp/LoadToFoldN.h #ifndef __baseband_dsp_LoadToFoldN_h #define __baseband_dsp_LoadToFoldN_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseLockedFilterbank.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseLockedFilterbank.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseLockedFilterbank.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseLockedFilterbank.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: 
/cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseLockedFilterbank.h,v $ - $Revision: 1.5 $ - $Date: 2011/04/28 23:30:12 $ - $Author: demorest $ */ +// dspsr/Signal/Pulsar/dsp/PhaseLockedFilterbank.h #ifndef __baseband_dsp_PhaseLockedFilterbank_h #define __baseband_dsp_PhaseLockedFilterbank_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeries.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeries.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeries.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeries.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseSeries.h,v $ - $Revision: 1.41 $ - $Date: 2011/08/04 21:07:02 $ - $Author: straten $ */ +// dspsr/Signal/Pulsar/dsp/PhaseSeries.h #ifndef __PhaseSeries_h #define __PhaseSeries_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeriesUnloader.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeriesUnloader.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/PhaseSeriesUnloader.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/PhaseSeriesUnloader.h 2018-03-12 23:02:35.000000000 +0000 @@ -7,10 +7,7 @@ ***************************************************************************/ //-*-C++-*- -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/PhaseSeriesUnloader.h,v $ - $Revision: 1.24 $ - $Date: 2011/08/31 20:46:04 $ - $Author: demorest $ */ +// dspsr/Signal/Pulsar/dsp/PhaseSeriesUnloader.h #ifndef __PhaseSeriesUnloader_h #define __PhaseSeriesUnloader_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/SubFold.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/SubFold.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/SubFold.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/SubFold.h 2018-03-12 23:02:35.000000000 +0000 
@@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/SubFold.h,v $ - $Revision: 1.20 $ - $Date: 2010/11/13 01:42:50 $ - $Author: demorest $ */ +// dspsr/Signal/Pulsar/dsp/SubFold.h #ifndef __SubFold_h #define __SubFold_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/TimeDivide.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/TimeDivide.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/TimeDivide.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/TimeDivide.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/TimeDivide.h,v $ - $Revision: 1.17 $ - $Date: 2009/06/17 10:32:32 $ - $Author: straten $ */ +// dspsr/Signal/Pulsar/dsp/TimeDivide.h #ifndef __baseband_dsp_TimeDivide_h #define __baseband_dsp_TimeDivide_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dsp/UnloaderShare.h bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/UnloaderShare.h --- bl-dspsr-0+git20160405/Signal/Pulsar/dsp/UnloaderShare.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dsp/UnloaderShare.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Pulsar/dsp/UnloaderShare.h,v $ - $Revision: 1.24 $ - $Date: 2010/11/16 01:43:21 $ - $Author: demorest $ */ +// dspsr/Signal/Pulsar/dsp/UnloaderShare.h #ifndef __UnloaderShare_h #define __UnloaderShare_h diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/dspsr.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dspsr.C --- bl-dspsr-0+git20160405/Signal/Pulsar/dspsr.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/dspsr.C 2018-03-12 23:02:35.000000000 +0000 @@ -25,12 +25,14 @@ #include 
"load_factory.h" #include "dirutil.h" +#include "strutil.h" #include - +#include #include #include #include +#include using namespace std; @@ -73,10 +75,20 @@ Reference::To engine; - if (config->get_total_nthread() > 1) + if (config->get_total_nthread() > 1){ + + if(dsp::Observation::verbose) + cerr << "using dsp::LoadToFoldN" << endl; + engine = new dsp::LoadToFoldN (config); - else + } + else{ + + if(dsp::Observation::verbose) + cerr << "using dsp::LoadToFold" << endl; + engine = new dsp::LoadToFold (config); + } bool time_prep = dsp::Operation::record_time || config->get_cuda_ndevice(); @@ -412,6 +424,10 @@ arg = menu.add (predictor, 'P', "file"); arg->set_help ("phase predictor used for folding"); + string predictors_file; + arg = menu.add (predictors_file, 'w', "file"); + arg->set_help ("phase predictors used for folding."); + arg = menu.add (config->additional_pulsars, 'X', "name"); arg->set_help ("additional pulsar to be folded"); @@ -577,6 +593,118 @@ ( factory (predictor[i]) ); } + if(!predictors_file.empty()) { + + cerr << "dspsr: Loading phase models from " << predictors_file << endl; + + vector buffer (10240); + char* buf = &buffer[0]; + + FILE* fptr = fopen (predictors_file.c_str(), "r"); + if (!fptr) + throw Error (FailedSys, "parse_options", + "fopen (%s)", predictors_file.c_str()); + + string key_string; + // choose first non commented and non empty line and attempt to parse header. 
+ while( fgets (buf, buffer.size(), fptr) ==buf ){ + + string temp = buf; + temp = stringtok ( temp, "#\n", false); // get rid of comments and empty lines + + if(temp.empty()) + continue; + + key_string = temp; + break; + + } + if(key_string.empty()) + throw Error(InvalidState,"parse_options","Bad input file to -w flag."); + + string delim = " \t\n"; + + vector keys; + string key_next; + string key_rest; + + cerr << " read header string: " << key_string << endl; + + do { + + string_split_on_any( key_string, key_next, key_rest, delim ); + + if(key_next.empty() && !key_rest.empty()) + throw Error (InvalidState,"dspsr", "Key in candiate file was empty."); + + if(key_next.empty() && key_rest.empty()) + key_next = key_string; + + cerr<< "Considering Key = '" << key_next << "'"< values(nkeys); + stringstream lines; + string value_next; + string value_rest; + + for(int i=0; i< nkeys; i++ ) { + + string_split_on_any( value_string, value_next, value_rest, delim ); + + + + + if(value_next.empty() && !value_rest.empty()){ + stringstream err; + cerr << "Value in candiate file was empty on line " << nline << endl; + throw Error (InvalidState,"dspsr", err.str().c_str()); + } + + if(value_next.empty() && value_rest.empty()) + value_next = value_string; + + if(dsp::Observation::verbose) + cerr<< "Considering Key = '" << keys.at(i) << "' value='" << value_next << "'" <predictors.push_back ( factory ( virtual_ptr )); +#endif + } + } + + for (unsigned i=0; ijobs, ","); diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/Fold.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Fold.C --- bl-dspsr-0+git20160405/Signal/Pulsar/Fold.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Fold.C 2018-03-12 23:02:35.000000000 +0000 @@ -821,7 +821,7 @@ { if (verbose) { cerr << "Fold::fold finishing fold w/ engine. 
zeroed_samples was true so correcting integration length from:" << result->integration_length - << " by:" << (engine->get_ndat_folded() / get_input()->get_rate()) <get_ndat_folded() / get_input()->get_rate()) << endl; } result->integration_length += engine->get_ndat_folded() / get_input()->get_rate(); } @@ -867,7 +867,7 @@ } // for each idat } // for each pol - if (zeroed_samples && ichan < nchan-1) + if (zeroed_samples && ichan < nchan-1 && output->get_hits_nchan() > 1) hits += folding_nbin; } // for each chan } diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/FoldCUDA.cu bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/FoldCUDA.cu --- bl-dspsr-0+git20160405/Signal/Pulsar/FoldCUDA.cu 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/FoldCUDA.cu 2018-03-12 23:02:35.000000000 +0000 @@ -7,21 +7,30 @@ * ***************************************************************************/ -// #define _DEBUG 1 - #include "dsp/FoldCUDA.h" #include "dsp/MemoryCUDA.h" +//#define _DEBUG + #include "Error.h" #include "debug.h" +#include #include +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + #define HAVE_SHFL + #else + #define NO_SHFL + #endif +#endif + using namespace std; CUDA::FoldEngine::FoldEngine (cudaStream_t _stream, bool _hits_on_gpu) { - use_set_bins = false; + use_set_bins = false; d_bin = 0; d_bin_size = 0; @@ -104,11 +113,11 @@ } uint64_t CUDA::FoldEngine::get_bin_hits (int ibin){ - return 0; // Fix this + return 0; // Fix this } uint64_t CUDA::FoldEngine::set_bins (double phi, double phase_per_sample, uint64_t _ndat, uint64_t idat_start) { - return 0; + return 0; } dsp::PhaseSeries* CUDA::FoldEngine::get_profiles () { @@ -178,7 +187,7 @@ if (stream) error = cudaMemcpyAsync (d_bin, binplan, mem_size, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream); else error = cudaMemcpy (d_bin, binplan, mem_size, cudaMemcpyHostToDevice); @@ -186,93 +195,384 @@ throw Error (InvalidState, "CUDA::FoldEngine::set_binplan", 
"cudaMemcpy%s %s", stream?"Async":"", cudaGetErrorString (error)); - -// cudaThreadSynchronize(); } - -/* - * CUDA Folding Kernels - * ipol = threadIdx.y - * npol = blockDim.y +/* All CUDA folding kernels utilise the dimensionality: + * ipol = blockIdx.z + * npol = gridDim.z * ichan = blockIdx.y * nchan = gridDim.y - * idim = threadIdx.z */ -__global__ void fold1bin (const float* in_base, +// 2dim data +__global__ void fold1bin2dim (const cuFloatComplex * in_base, unsigned in_span, - float* out_base, + cuFloatComplex * out_base, + unsigned out_span, + unsigned nbin, + unsigned binplan_size, + const CUDA::bin* binplan) +{ + in_base += in_span * (blockIdx.y * gridDim.z + blockIdx.z); + out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z); + + for (unsigned ibin=threadIdx.x; ibin= binplan_size) - return; + cuFloatComplex total = make_cuComplex (0,0); - unsigned output_ibin = binplan[ibin].ibin; + // only add up bins that we have + if (ibin < binplan_size) + { + in_base += in_span * (blockIdx.y * gridDim.z + blockIdx.z); - in_base += in_span * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z; - out_base += out_span * (blockIdx.y * blockDim.y + threadIdx.y) + threadIdx.z; + // start/end sample for this input bin + const int sbin = binplan[ibin].offset; + const int ebin = sbin + binplan[ibin].hits; - float total = 0; + // each thread of a warp will load samples for this ibin + for (int i=sbin+warp_idx; i= binplan_size) + return; + int output_ibin = binplan[ibin].ibin; + out_base[ output_ibin ] = cuCaddf (out_base[ output_ibin ], warp_fold[warp_idx]); + } +#endif +#ifdef NO_SHFL + int last_offset = 16; + warp_fold[threadIdx.x] = total; + __syncthreads(); + for (int offset = last_offset; offset > 0; offset >>= 1) + { + if (warp_idx < offset) + warp_fold[threadIdx.x] = cuCaddf(warp_fold[threadIdx.x], warp_fold[threadIdx.x + offset]); + __syncthreads(); + } - for (; ibin < binplan_size; ibin += nbin) + if (warp_idx == 0) { - const float* input = in_base + 
binplan[ibin].offset * ndim; + if (ibin < binplan_size) + { + out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z); + int output_ibin = binplan[ibin].ibin; + out_base[ output_ibin ] = cuCaddf (out_base[ output_ibin ], warp_fold[threadIdx.x]); + } + } +#endif +} + +// 1dim kernels +__global__ void fold1bin1dim_shared (const float* in_base, unsigned in_span, + float* out_base, unsigned out_span, + unsigned nbin, unsigned binplan_size, + CUDA::bin* binplan) +{ + // one shared memory bin for each output phase bin for this chanpol + extern __shared__ float f1b1d_shared[]; + + // pointers for the current channel and polarisation + in_base += in_span * (blockIdx.y * gridDim.z + blockIdx.z); + out_base += out_span * (blockIdx.y * gridDim.z + blockIdx.z); + + // coalesced read the existing phase bin values + for (unsigned ibin=threadIdx.x; ibin= binplan_size) - return; + // coalesced read the existing phase bin values + for (unsigned ibin=threadIdx.x; ibin bin_dim) - bin_threads = 32; - - unsigned bin_blocks = bin_dim / bin_threads; - if (bin_dim % bin_threads) - bin_blocks ++; + // number of threads in the block (capped a max TPB) + unsigned bin_threads = bin_dim; + if (bin_threads > 1024); + bin_threads = 1024; + + // to ensure block coherrency + unsigned bin_blocks = 1; - dim3 blockDim (bin_threads, npol, ndim); - dim3 gridDim (bin_blocks, nchan, 1); + dim3 blockDim (bin_threads, 1, 1); + dim3 gridDim (bin_blocks, nchan, npol); #if 0 + cerr << "bin_dim=" << bin_dim << endl; cerr << "blockDim=" << blockDim << endl; cerr << "gridDim=" << gridDim << endl; #endif + DEBUG("bin_dim=" << bin_dim); + DEBUG("bin_threads=" << bin_threads << " bin_blocks=" << bin_blocks); + DEBUG("input=" << (void *) input << " output=" << (void *) output); DEBUG("input span=" << input_span << " output span=" << output_span); DEBUG("ndim=" << ndim << " nbin=" << folding_nbin << " binplan_nbin=" << binplan_nbin); + DEBUG("hits_on_gpu=" << hits_on_gpu << " zeroed_samples=" << zeroed_samples 
<< " hits_nchan=" << hits_nchan); - //cudaThreadSynchronize(); - + size_t shared_max = 32768; + size_t shared_bytes = folding_nbin * sizeof(float) * ndim; if (hits_on_gpu && zeroed_samples && hits_nchan == nchan) { - fold1binhits<<>> (input, input_span, - output, output_span, hits, - ndim, folding_nbin, - binplan_nbin, d_bin); - + shared_bytes += folding_nbin * sizeof(unsigned); + if (ndim == 2) + { + if (shared_bytes <= shared_max) + fold1bin2dimhits_shared<<>> ((float2*)input, input_span/2, + (float2*) output, output_span/2, hits, + folding_nbin, binplan_nbin, d_bin); + else + fold1bin2dimhits<<>> ((float2*)input, input_span/2, + (float2*) output, output_span/2, hits, + folding_nbin, binplan_nbin, d_bin); + } + else + { + if (shared_bytes <= shared_max) + fold1bin1dimhits_shared<<>> (input, input_span, + output, output_span, hits, + folding_nbin, binplan_nbin, d_bin); + else + fold1bin1dimhits<<>> (input, input_span, + output, output_span, hits, + folding_nbin, binplan_nbin, d_bin); + } } else { - fold1bin<<>> (input, input_span, - output, output_span, - ndim, folding_nbin, - binplan_nbin, d_bin); - + if (ndim == 2) + { + if (shared_bytes <= shared_max) + { + fold1bin2dim_shared<<>> ((cuFloatComplex *) input, input_span/2, + (cuFloatComplex *) output, output_span/2, + folding_nbin, binplan_nbin, d_bin); + } + else + { + fold1bin2dim<<>> ((cuFloatComplex *) input, input_span/2, + (cuFloatComplex *) output, output_span/2, + folding_nbin, binplan_nbin, d_bin); + } +/* + dim3 threads(1024, 1, 1); + unsigned nwarps = threads.x / 32; + dim3 blocks (binplan_nbin/nwarps, nchan, npol); + if (binplan_nbin % nwarps) + blocks.x++; + size_t sbytes = threads.x * sizeof(float2); + fold1bin2dim_warp<<>> ((cuFloatComplex *) input, input_span/2, + (cuFloatComplex *) output, output_span/2, + folding_nbin, binplan_nbin, d_bin); +*/ + } + else + { + if (shared_bytes <= shared_max) + fold1bin1dim_shared<<>> (input, input_span, + output, output_span, + folding_nbin, binplan_nbin, 
d_bin); + else + fold1bin1dim<<>> (input, input_span, + output, output_span, + folding_nbin, binplan_nbin, d_bin); + } } // profile on the device is no longer synchronized with the one on the host diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/LoadToFold1.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/LoadToFold1.C --- bl-dspsr-0+git20160405/Signal/Pulsar/LoadToFold1.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/LoadToFold1.C 2018-03-12 23:02:35.000000000 +0000 @@ -27,26 +27,30 @@ #include "dsp/Filterbank.h" #include "dsp/FilterbankEngine.h" -#include "dsp/SKFilterbank.h" -#include "dsp/SKDetector.h" -#include "dsp/SKMasker.h" +#include "dsp/SpectralKurtosis.h" #include "dsp/OptimalFFT.h" #include "dsp/Resize.h" #if HAVE_CFITSIO +#if HAVE_fits #include "dsp/FITSFile.h" +#include "dsp/MultiFile.h" #include "dsp/FITSUnpacker.h" #endif +#endif #if HAVE_CUDA +#include "dsp/ConvolutionCUDA.h" +#include "dsp/ConvolutionCUDASpectral.h" #include "dsp/FilterbankCUDA.h" #include "dsp/OptimalFilterbank.h" #include "dsp/TransferCUDA.h" +#include "dsp/TimeSeriesCUDA.h" #include "dsp/TransferBitSeriesCUDA.h" #include "dsp/DetectionCUDA.h" #include "dsp/FoldCUDA.h" #include "dsp/MemoryCUDA.h" -#include "dsp/SKMaskerCUDA.h" +#include "dsp/SpectralKurtosisCUDA.h" #include "dsp/CyclicFoldEngineCUDA.h" #endif @@ -131,18 +135,43 @@ } #if HAVE_CFITSIO +#if HAVE_fits // Use callback to handle scales/offsets for read-in if (manager->get_info()->get_machine() == "FITS") { if (Operation::verbose) - std::cout << "Using callback to read PSRFITS file." << std::endl; + cerr << "Using callback to read PSRFITS file." 
<< endl; // connect a callback - FITSFile* tmp = dynamic_cast (manager->get_input()); - tmp->update.connect ( - dynamic_cast ( manager->get_unpacker() ), - &FITSUnpacker::set_parameters); + bool success = false; + FITSUnpacker* funp = dynamic_cast ( + manager->get_unpacker()); + FITSFile* ffile = dynamic_cast (manager->get_input()); + if (funp && ffile) + { + ffile->update.connect ( funp, &FITSUnpacker::set_parameters ); + success = true; + } + else + { + MultiFile* mfile = dynamic_cast (manager->get_input()); + if (mfile) + { + for (unsigned i=0; i < mfile->nfiles(); ++i) + { + ffile = dynamic_cast (mfile->get_files()[i].get()); + if (funp && ffile) { + ffile->update.connect ( + funp, &FITSUnpacker::set_parameters ); + success = true; + } + } + } + } + if (not success) + cerr << "dspsr: WARNING: FITS input input but unable to apply scales and offsets." << endl; } #endif +#endif config->coherent_dedispersion = false; prepare_interchan (unpacked); @@ -259,103 +288,18 @@ } } - // only the Filterbank must be out-of-place - TimeSeries* convolved = unpacked; - - TimeSeries* skoutput = 0; - BitSeries * skzapmask = 0; - Reference::To skthread; - - if (config->sk_zap) - { - // put the SK signal path into a separate thread - skthread = new OperationThread(); - - TimeSeries* skfilterbank_input = unpacked; - -#if HAVE_CUDA - if (run_on_gpu) - { - Unpacker* unpack_on_cpu = 0; - unpack_on_cpu = manager->get_unpacker()->clone(); - unpack_on_cpu->set_device (Memory::get_manager()); - - unpack_on_cpu->set_input( manager->get_unpacker()->get_input() ); - unpack_on_cpu->set_output( skfilterbank_input = new_time_series() ); - - skthread->append_operation( unpack_on_cpu ); - manager->set_post_load_operation( skthread.get() ); - } -#endif - - skoutput = new_time_series (); - - // Spectral Kurtosis filterbank constructor - if (!skfilterbank) - skfilterbank = new SKFilterbank (config->sk_nthreads); - - if (!config->input_buffering) - skfilterbank->set_buffering_policy (NULL); - - 
skfilterbank->set_input ( skfilterbank_input ); - - skfilterbank->set_output ( skoutput ); - skfilterbank->set_nchan ( config->filterbank.get_nchan() ); - skfilterbank->set_M ( config->sk_m ); - - // SKFB also maintains trscunched SK stats - TimeSeries* skoutput_tscr = new_time_series(); - - skfilterbank->set_output_tscr (skoutput_tscr); - - skthread->append_operation (skfilterbank.get()); - - // SK Mask Generator - skzapmask = new BitSeries; - skzapmask->set_nbit (8); - skzapmask->set_npol (1); - skzapmask->set_nchan (config->filterbank.get_nchan()); - - SKDetector * skdetector = new SKDetector; - skdetector->set_input (skoutput); - skdetector->set_input_tscr (skoutput_tscr); - skdetector->set_output (skzapmask); - - skdetector->set_thresholds (config->sk_m, config->sk_std_devs); - if (config->sk_chan_start > 0 && config->sk_chan_end < config->filterbank.get_nchan()) - skdetector->set_channel_range (config->sk_chan_start, config->sk_chan_end); - skdetector->set_options (config->sk_no_fscr, config->sk_no_tscr, config->sk_no_ft); - - skthread->append_operation (skdetector); - -#if HAVE_CUDA - if (!run_on_gpu) -#endif - { - operations.push_back (skthread.get()); - OperationThread::Wait * skthread_wait = skthread->get_wait(); - operations.push_back (skthread_wait); - } - - // since the blocksize is artificially increased for the SKFB, - // we must return it to the required size for the SKFB - if (!skresize) - skresize = new Resize; - - skresize->set_input(unpacked); - skresize->set_output(unpacked); - operations.push_back (skresize.get()); - - } + // convolved and filterbank are out of place + TimeSeries* filterbanked = unpacked; + // filterbank is performing channelisation if (config->filterbank.get_nchan() > 1) { // new storage for filterbank output (must be out-of-place) - convolved = new_time_series (); + filterbanked = new_time_series (); #if HAVE_CUDA if (run_on_gpu) - convolved->set_memory (device_memory); + filterbanked->set_memory (device_memory); #endif 
config->filterbank.set_device( device_memory.ptr() ); @@ -369,7 +313,7 @@ filterbank->set_buffering_policy (NULL); filterbank->set_input (unpacked); - filterbank->set_output (convolved); + filterbank->set_output (filterbanked); if (config->filterbank.get_convolve_when() == Filterbank::Config::During) { @@ -383,6 +327,9 @@ operations.push_back (filterbank.get()); } + // output of convolved will be filterbanked|unpacked + TimeSeries* convolved = filterbanked; + bool filterbank_after_dedisp = config->filterbank.get_convolve_when() == Filterbank::Config::Before; @@ -392,29 +339,44 @@ if (!convolution) convolution = new Convolution; + if (!config->input_buffering) + convolution->set_buffering_policy (NULL); + convolution->set_response (response); if (!config->integration_turns) convolution->set_passband (passband); + convolved = new_time_series(); + if (filterbank_after_dedisp) { - convolution->set_input (unpacked); - convolution->set_output (unpacked); // inplace + convolution->set_input (filterbanked); + convolution->set_output (convolved); // out of place } else { - convolution->set_input (convolved); - convolution->set_output (convolved); // inplace + convolution->set_input (filterbanked); + convolution->set_output (convolved); // out of place } - if (!config->input_buffering) - convolution->set_buffering_policy (NULL); - +#if HAVE_CUDA + if (run_on_gpu) + { + convolved->set_memory (device_memory); + convolution->set_device (device_memory.ptr()); + unsigned nchan = manager->get_info()->get_nchan() * config->filterbank.get_nchan(); + if (nchan >= 16) + convolution->set_engine (new CUDA::ConvolutionEngineSpectral (stream)); + else + convolution->set_engine (new CUDA::ConvolutionEngine (stream)); + } +#endif + operations.push_back (convolution.get()); } if (filterbank_after_dedisp) - prepare_interchan (unpacked); + prepare_interchan (convolved); else prepare_interchan (convolved); @@ -491,15 +453,16 @@ return; // the phase-locked filterbank does its own detection and 
folding - } Reference::To presk_fold; Reference::To presk_unload; + TimeSeries * cleaned = convolved; + // peform zapping based on the results of the SKFilterbank if (config->sk_zap) - { + { if (config->nosk_too) { Detection* presk_detect = new Detection; @@ -538,64 +501,49 @@ operations.push_back (presk_fold.get()); } -#if HAVE_CUDA - if (run_on_gpu) - { - OperationThread::Wait * skthread_wait = skthread->get_wait(); - operations.push_back (skthread_wait); - } -#endif + cleaned = new_time_series(); + + if (!skestimator) + skestimator = new SpectralKurtosis(); - SKMasker * skmasker = new SKMasker; if (!config->input_buffering) - skmasker->set_buffering_policy (NULL); + skestimator->set_buffering_policy (NULL); + + skestimator->set_input (convolved); + skestimator->set_output (cleaned); + skestimator->set_M (config->sk_m); #if HAVE_CUDA if (run_on_gpu) { - // transfer the zap mask to the GPU - BitSeries * skzapmask_on_gpu = new BitSeries(); - skzapmask_on_gpu->set_nbit (8); - skzapmask_on_gpu->set_npol (1); - skzapmask_on_gpu->set_nchan (config->filterbank.get_nchan()); - skzapmask_on_gpu->set_memory (device_memory); - - TransferBitSeriesCUDA* transfer = new TransferBitSeriesCUDA(stream); - transfer->set_kind( cudaMemcpyHostToDevice ); - transfer->set_input( skzapmask ); - transfer->set_output( skzapmask_on_gpu ); - operations.push_back (transfer); - - skmasker->set_mask_input (skzapmask_on_gpu); - skmasker->set_engine (new CUDA::SKMaskerEngine (stream)); + // for input buffering + convolved->set_engine (new CUDA::TimeSeriesEngine (device_memory)); + cleaned->set_memory (device_memory); + skestimator->set_engine (new CUDA::SpectralKurtosisEngine (device_memory)); } - else - skmasker->set_mask_input (skzapmask); -#else - skmasker->set_mask_input (skzapmask); #endif - skmasker->set_input (convolved); - skmasker->set_output (convolved); - skmasker->set_M (config->sk_m); - - operations.push_back (skmasker); + skestimator->set_thresholds (config->sk_m, 
config->sk_std_devs); + if (config->sk_chan_start > 0 && config->sk_chan_end < config->filterbank.get_nchan()) + skestimator->set_channel_range (config->sk_chan_start, config->sk_chan_end); + skestimator->set_options (config->sk_no_fscr, config->sk_no_tscr, config->sk_no_ft); + operations.push_back (skestimator.get()); } // Cyclic spectrum also detects and folds if (config->cyclic_nchan) { - build_fold(convolved); + build_fold(cleaned); return; } if (!detect) detect = new Detection; - TimeSeries* detected = convolved; - detect->set_input (convolved); - detect->set_output (convolved); + TimeSeries* detected = cleaned; + detect->set_input (cleaned); + detect->set_output (cleaned); configure_detection (detect, noperations); @@ -636,7 +584,7 @@ Reference::To skfold; build_fold (skfold, unload); - skfold->set_input( skoutput ); + skfold->set_input( cleaned); skfold->prepare( manager->get_info() ); skfold->reset(); @@ -856,7 +804,7 @@ minimum_samples = convolution->get_minimum_samples () * fb_factor; if (report_vitals) - cerr << "dspsr: convolution requires at least " + cerr << "dspsr: convolution requires at least " << minimum_samples << " samples" << endl; if (!config->input_buffering) @@ -882,41 +830,41 @@ uint64_t ram = manager->set_block_size( block_size ); #if HAVE_CFITSIO +#if HAVE_fits // if PSRFITS input, set block to exact size of FITS row // this is needed to keep in sync with the callback if (manager->get_info()->get_machine() == "FITS") { FITSFile* tmp = dynamic_cast (manager->get_input()); - unsigned samples_per_row = tmp->get_samples_in_row(); - uint64_t current_bytes = manager->set_block_size (samples_per_row); - uint64_t new_max_ram = current_bytes / tmp->get_block_size() * samples_per_row; - if (new_max_ram > config->get_maximum_RAM ()) - throw Error (InvalidState, "prepare", "Maximum RAM smaller than PSRFITS row."); - manager->set_maximum_RAM (new_max_ram); - manager->set_block_size (samples_per_row); - } -#endif - - // add the increased block size if 
the SKFB is being used - if (skfilterbank) - { - block_size = manager->get_input()->get_block_size(); - int64_t skfb_increment = (int64_t) skfilterbank->get_skfb_inc (block_size); - - block_size += skfb_increment; - block_overlap += skfb_increment; - - if (block_overlap) - manager->set_overlap( block_overlap ); - ram = manager->set_block_size( block_size ); - - skfb_increment *= -1; - skresize->set_resize_samples (skfb_increment); + uint64_t block_size; - if (Operation::verbose) - cerr << "dsp::LoadToFold::prepare block_size will be adjusted by " - << skfb_increment << " samples for SKFB" << endl; + if (!tmp) + { + MultiFile* mfile = dynamic_cast (manager->get_input()); + if (mfile) + { + block_size = mfile->get_block_size(); + tmp = dynamic_cast ( mfile->get_loader() ); + } + } + else + block_size = tmp->get_block_size(); + if (tmp) + { + unsigned samples_per_row = tmp->get_samples_in_row(); + uint64_t current_bytes = manager->set_block_size (samples_per_row); + uint64_t new_max_ram = current_bytes / block_size * samples_per_row; + if (new_max_ram > config->get_maximum_RAM ()) + throw Error (InvalidState, "LoadToFold::prepare", + "Maximum RAM smaller than PSRFITS row."); + manager->set_maximum_RAM (new_max_ram); + manager->set_block_size (samples_per_row); + } + else + cerr << "dspsr: WARNING have FITS input but cannot set block size properly." 
<< endl; } +#endif +#endif if (report_vitals) { diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Makefile.am --- bl-dspsr-0+git20160405/Signal/Pulsar/Makefile.am 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/Makefile.am 2018-03-12 23:02:35.000000000 +0000 @@ -43,7 +43,13 @@ $(top_builddir)/Signal/General/libdspdsp.la \ $(top_builddir)/Kernel/libdspbase.la \ $(top_builddir)/Signal/Statistics/libdspstats.la \ - @CUFFT_LIBS@ @CUDA_LIBS@ + @CUDA_LIBS@ + +if HAVE_CUFFT_CALLBACKS +LDADD += $(top_builddir)/Signal/General/ConvolutionCUDACallbacks_DC.o -lcufft_static -lculibos +else +LDADD += @CUFFT_LIBS@ +endif AM_CPPFLAGS += @CUFFT_CFLAGS@ if HAVE_CFITSIO diff -Nru bl-dspsr-0+git20160405/Signal/Pulsar/TransferPhaseSeriesCUDA.C bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/TransferPhaseSeriesCUDA.C --- bl-dspsr-0+git20160405/Signal/Pulsar/TransferPhaseSeriesCUDA.C 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Pulsar/TransferPhaseSeriesCUDA.C 2018-03-12 23:02:35.000000000 +0000 @@ -26,11 +26,6 @@ { prepare (); - if (stream) - cudaStreamSynchronize(stream); - else - cudaThreadSynchronize(); - if (verbose) cerr << "dsp::TransferPhaseSeriesCUDA::transformation input ndat=" << input->get_ndat() << " ndim=" << input->get_ndim() @@ -75,6 +70,12 @@ throw Error (InvalidState, "dsp::TransferPhaseSeriesCUDA::transformation hits", cudaGetErrorString (error)); } + + if (stream) + cudaStreamSynchronize(stream); + else + cudaThreadSynchronize(); + } void dsp::TransferPhaseSeriesCUDA::prepare () diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/MidPoint.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/MidPoint.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/MidPoint.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/MidPoint.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * 
***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/MidPoint.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/MidPoint.h #ifndef __MidPointMethod #define __MidPointMethod diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Neville.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Neville.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Neville.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Neville.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Neville.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/Neville.h #ifndef __NevilleMethod #define __NevilleMethod diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/NewtonRaphson.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/NewtonRaphson.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/NewtonRaphson.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/NewtonRaphson.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/NewtonRaphson.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/NewtonRaphson.h #ifndef __NewtonRaphsonMethod #define __NewtonRaphsonMethod diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Romberg.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Romberg.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Romberg.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Romberg.h 2018-03-12 
23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Romberg.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/Romberg.h #ifndef __RombergMethod #define __RombergMethod diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/Trapezoid.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Trapezoid.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/Trapezoid.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/Trapezoid.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/Trapezoid.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/Trapezoid.h #ifndef __TrapezoidMethod #define __TrapezoidMethod diff -Nru bl-dspsr-0+git20160405/Signal/Statistics/dsp/VolumeIntegral.h bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/VolumeIntegral.h --- bl-dspsr-0+git20160405/Signal/Statistics/dsp/VolumeIntegral.h 2018-03-12 08:31:57.000000000 +0000 +++ bl-dspsr-0.0~git20180312.50ea209/Signal/Statistics/dsp/VolumeIntegral.h 2018-03-12 23:02:35.000000000 +0000 @@ -6,10 +6,7 @@ * ***************************************************************************/ -/* $Source: /cvsroot/dspsr/dspsr/Signal/Statistics/dsp/VolumeIntegral.h,v $ - $Revision: 1.2 $ - $Date: 2011/08/04 21:03:22 $ - $Author: straten $ */ +// dspsr/Signal/Statistics/dsp/VolumeIntegral.h #ifndef __Volume_Integral #define __Volume_Integral