diff -Nru diamond-aligner-2.0.6/CMakeLists.txt diamond-aligner-2.0.7/CMakeLists.txt --- diamond-aligner-2.0.6/CMakeLists.txt 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/CMakeLists.txt 2021-02-12 11:50:56.000000000 +0000 @@ -10,6 +10,8 @@ option(LEFTMOST_SEED_FILTER "LEFTMOST_SEED_FILTER" ON) option(SEQ_MASK "SEQ_MASK" ON) option(DP_STAT "DP_STAT" OFF) +option(SINGLE_THREADED "SINGLE_THREADED" OFF) +option(EIGEN_BLAS "EIGEN_BLAS" OFF) set(MAX_SHAPE_LEN 19) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") @@ -32,6 +34,10 @@ add_definitions(-DSTRICT_BAND) endif() +if(SINGLE_THREADED) + add_definitions(-DSINGLE_THREADED) +endif() + if(SEQ_MASK) add_definitions(-DSEQ_MASK) endif() @@ -89,15 +95,10 @@ message("Setting -fabi-version for GCC 4.x") endif() -include_directories( - "${ZLIB_INCLUDE_DIR}" - "${CMAKE_SOURCE_DIR}/src/lib") - set(DISPATCH_OBJECTS "src/dp/swipe/banded_3frame_swipe.cpp" "src/dp/swipe/swipe.cpp" "src/dp/swipe/banded_swipe.cpp" -"src/search/collision.cpp" "src/search/stage1.cpp" "src/search/stage2.cpp" "src/tools/benchmark.cpp" @@ -109,9 +110,12 @@ add_library(arch_generic OBJECT ${DISPATCH_OBJECTS}) target_compile_options(arch_generic PUBLIC -DDISPATCH_ARCH=ARCH_GENERIC -DARCH_ID=0 -DEigen=Eigen_GENERIC) +target_include_directories(arch_generic PRIVATE "${CMAKE_SOURCE_DIR}/src/lib") if(X86) add_library(arch_sse4_1 OBJECT ${DISPATCH_OBJECTS}) +target_include_directories(arch_sse4_1 PRIVATE "${CMAKE_SOURCE_DIR}/src/lib") add_library(arch_avx2 OBJECT ${DISPATCH_OBJECTS}) +target_include_directories(arch_avx2 PRIVATE "${CMAKE_SOURCE_DIR}/src/lib") if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) target_compile_options(arch_sse4_1 PUBLIC -DDISPATCH_ARCH=ARCH_SSE4_1 -DARCH_ID=1 -D__SSSE3__ -D__SSE4_1__ -D__POPCNT__ -DEigen=Eigen_SSE4_1) target_compile_options(arch_avx2 PUBLIC -DDISPATCH_ARCH=ARCH_AVX2 -DARCH_ID=2 /arch:AVX2 -D__SSSE3__ -D__SSE4_1__ -D__POPCNT__ -DEigen=Eigen_AVX2) @@ -250,6 +254,7 @@ src/stats/comp_based_stats.cpp src/stats/hauser_correction.cpp src/stats/matrix_adjust.cpp + src/stats/matrix_adjust_eigen.cpp ) if(X86) @@ -258,6 +263,19 @@ add_executable(diamond $ ${OBJECTS}) endif() +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}") + +target_include_directories(diamond PRIVATE + "${ZLIB_INCLUDE_DIR}" + "${CMAKE_SOURCE_DIR}/src/lib") + +if(EIGEN_BLAS) + add_definitions(-DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE) + find_package(LAPACKE) + target_include_directories(diamond PRIVATE "${LAPACKE_INCLUDE_DIR}") + target_link_libraries(diamond "${LAPACKE_LIBRARIES_DEP}") +endif() + target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) install(TARGETS diamond DESTINATION bin) diff -Nru diamond-aligner-2.0.6/debian/changelog diamond-aligner-2.0.7/debian/changelog --- diamond-aligner-2.0.6/debian/changelog 2021-01-03 13:45:31.000000000 +0000 +++ diamond-aligner-2.0.7/debian/changelog 2021-02-15 08:29:10.000000000 +0000 @@ -1,3 +1,10 @@ +diamond-aligner (2.0.7-1) unstable; urgency=medium + + * Team upload. + * New upstream version + + -- Michael R. Crusoe Mon, 15 Feb 2021 09:29:10 +0100 + diamond-aligner (2.0.6-1) unstable; urgency=medium * Team upload. diff -Nru diamond-aligner-2.0.6/FindLAPACKE.cmake diamond-aligner-2.0.7/FindLAPACKE.cmake --- diamond-aligner-2.0.6/FindLAPACKE.cmake 1970-01-01 00:00:00.000000000 +0000 +++ diamond-aligner-2.0.7/FindLAPACKE.cmake 2021-02-12 11:50:56.000000000 +0000 @@ -0,0 +1,421 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find LAPACKE include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(LAPACKE +# [REQUIRED] # Fail with error if lapacke is not found +# [COMPONENTS ...] # dependencies +# ) +# +# LAPACKE depends on the following libraries: +# - LAPACK +# +# This module finds headers and lapacke library. +# Results are reported in variables: +# LAPACKE_FOUND - True if headers and requested libraries were found +# LAPACKE_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# LAPACKE_INCLUDE_DIRS - lapacke include directories +# LAPACKE_LIBRARY_DIRS - Link directories for lapacke libraries +# LAPACKE_LIBRARIES - lapacke component libraries to be linked +# LAPACKE_INCLUDE_DIRS_DEP - lapacke + dependencies include directories +# LAPACKE_LIBRARY_DIRS_DEP - lapacke + dependencies link directories +# LAPACKE_LIBRARIES_DEP - lapacke libraries + dependencies +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DLAPACKE_DIR=path/to/lapacke): +# LAPACKE_DIR - Where to find the base directory of lapacke +# LAPACKE_INCDIR - Where to find the header files +# LAPACKE_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: LAPACKE_DIR, LAPACKE_INCDIR, LAPACKE_LIBDIR +# +# If the static version of the LAPACKE libraries is required, please add the +# following in your CMakeLists.txt before calling find_package(LAPACKE): +# set(LAPACKE_STATIC TRUE) +# +# LAPACKE could be directly embedded in LAPACK library (ex: Intel MKL) so that +# we test a lapacke function with the lapack libraries found and set LAPACKE +# variables to LAPACK ones if test is successful. To skip this feature and +# look for a stand alone lapacke, please add the following in your +# CMakeLists.txt before to call find_package(LAPACKE): +# set(LAPACKE_STANDALONE TRUE) + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# This file is part of a computer program whose purpose is to process +# Matrices Over Runtime Systems @ Exascale (MORSE). More information +# can be found on the following website: http://www.inria.fr/en/teams/morse. +# +# This software is governed by the CeCILL-C license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL-C +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL-C license and that you accept its terms. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= + +if (NOT LAPACKE_FOUND) + set(LAPACKE_DIR "" CACHE PATH "Installation directory of LAPACKE library") + if (NOT LAPACKE_FIND_QUIETLY) + message(STATUS "A cache variable, namely LAPACKE_DIR, has been set to specify the install directory of LAPACKE") + endif() +endif() + +# LAPACKE depends on LAPACK anyway, try to find it +if (NOT LAPACK_FOUND) + if(LAPACKE_FIND_REQUIRED) + find_package(LAPACKEXT REQUIRED) + else() + find_package(LAPACKEXT) + endif() +endif() + +# LAPACKE depends on LAPACK +if (LAPACK_FOUND) + + if (NOT LAPACKE_STANDALONE) + # check if a lapacke function exists in the LAPACK lib + include(CheckFunctionExists) + set(CMAKE_REQUIRED_LIBRARIES "${LAPACK_LINKER_FLAGS};${LAPACK_LIBRARIES}") + unset(LAPACKE_WORKS CACHE) + check_function_exists(LAPACKE_dgeqrf LAPACKE_WORKS) + mark_as_advanced(LAPACKE_WORKS) + set(CMAKE_REQUIRED_LIBRARIES) + + if(LAPACKE_WORKS) + if(NOT LAPACKE_FIND_QUIETLY) + message(STATUS "Looking for lapacke: test with lapack succeeds") + endif() + # test succeeds: LAPACKE is in LAPACK + set(LAPACKE_LIBRARIES "${LAPACK_LIBRARIES}") + set(LAPACKE_LIBRARIES_DEP "${LAPACK_LIBRARIES}") + if (LAPACK_LIBRARY_DIRS) + set(LAPACKE_LIBRARY_DIRS "${LAPACK_LIBRARY_DIRS}") + endif() + if(LAPACK_INCLUDE_DIRS) + set(LAPACKE_INCLUDE_DIRS "${LAPACK_INCLUDE_DIRS}") + set(LAPACKE_INCLUDE_DIRS_DEP "${LAPACK_INCLUDE_DIRS}") + endif() + if (LAPACK_LINKER_FLAGS) + set(LAPACKE_LINKER_FLAGS "${LAPACK_LINKER_FLAGS}") + endif() + endif() + endif (NOT LAPACKE_STANDALONE) + + if (LAPACKE_STANDALONE OR NOT LAPACKE_WORKS) + + if(NOT LAPACKE_WORKS AND NOT LAPACKE_FIND_QUIETLY) + message(STATUS "Looking for lapacke : test with lapack fails") + endif() + # test fails: try to find LAPACKE lib exterior to LAPACK + + # Try to find LAPACKE lib + ####################### + + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + set(ENV_LAPACKE_DIR "$ENV{LAPACKE_DIR}") + set(ENV_LAPACKE_INCDIR "$ENV{LAPACKE_INCDIR}") + if(ENV_LAPACKE_INCDIR) + list(APPEND _inc_env "${ENV_LAPACKE_INCDIR}") + elseif(ENV_LAPACKE_DIR) + list(APPEND _inc_env "${ENV_LAPACKE_DIR}") + list(APPEND _inc_env "${ENV_LAPACKE_DIR}/include") + list(APPEND _inc_env "${ENV_LAPACKE_DIR}/include/lapacke") + else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + + # Try to find the lapacke header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(LAPACKE_INCDIR) + set(LAPACKE_lapacke.h_DIRS "LAPACKE_lapacke.h_DIRS-NOTFOUND") + find_path(LAPACKE_lapacke.h_DIRS + NAMES lapacke.h + HINTS ${LAPACKE_INCDIR}) + else() + if(LAPACKE_DIR) + set(LAPACKE_lapacke.h_DIRS "LAPACKE_lapacke.h_DIRS-NOTFOUND") + find_path(LAPACKE_lapacke.h_DIRS + NAMES lapacke.h + HINTS ${LAPACKE_DIR} + PATH_SUFFIXES "include" "include/lapacke") + else() + set(LAPACKE_lapacke.h_DIRS "LAPACKE_lapacke.h_DIRS-NOTFOUND") + find_path(LAPACKE_lapacke.h_DIRS + NAMES lapacke.h + HINTS ${_inc_env}) + endif() + endif() + mark_as_advanced(LAPACKE_lapacke.h_DIRS) + + # If found, add path to cmake variable + # ------------------------------------ + if (LAPACKE_lapacke.h_DIRS) + set(LAPACKE_INCLUDE_DIRS "${LAPACKE_lapacke.h_DIRS}") + else () + set(LAPACKE_INCLUDE_DIRS "LAPACKE_INCLUDE_DIRS-NOTFOUND") + if(NOT LAPACKE_FIND_QUIETLY) + message(STATUS "Looking for lapacke -- lapacke.h not found") + endif() + endif() + + + # Looking for lib + # --------------- + + # Add system library paths to search lib + # -------------------------------------- + unset(_lib_env) + set(ENV_LAPACKE_LIBDIR "$ENV{LAPACKE_LIBDIR}") + if(ENV_LAPACKE_LIBDIR) + list(APPEND _lib_env "${ENV_LAPACKE_LIBDIR}") + elseif(ENV_LAPACKE_DIR) + list(APPEND _lib_env "${ENV_LAPACKE_DIR}") + list(APPEND _lib_env "${ENV_LAPACKE_DIR}/lib") + else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif() + list(REMOVE_DUPLICATES _lib_env) + + # Try to find the lapacke lib in the given paths + # ---------------------------------------------- + + # name of the lapacke library + set(LAPACKE_lapacke_NAMES "lapacke") + if(LAPACKE_STATIC) + if(WIN32) + set(LAPACKE_lapacke_NAMES "liblapacke.lib") + endif() + + if(UNIX) + set(LAPACKE_lapacke_NAMES "liblapacke.a") + endif() + endif() + + # call cmake macro to find the lib path + if(LAPACKE_LIBDIR) + set(LAPACKE_lapacke_LIBRARY "LAPACKE_lapacke_LIBRARY-NOTFOUND") + find_library(LAPACKE_lapacke_LIBRARY + NAMES ${LAPACKE_lapacke_NAMES} + HINTS ${LAPACKE_LIBDIR}) + else() + if(LAPACKE_DIR) + set(LAPACKE_lapacke_LIBRARY "LAPACKE_lapacke_LIBRARY-NOTFOUND") + find_library(LAPACKE_lapacke_LIBRARY + NAMES ${LAPACKE_lapacke_NAMES} + HINTS ${LAPACKE_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(LAPACKE_lapacke_LIBRARY "LAPACKE_lapacke_LIBRARY-NOTFOUND") + find_library(LAPACKE_lapacke_LIBRARY + NAMES ${LAPACKE_lapacke_NAMES} + HINTS ${_lib_env}) + endif() + endif() + mark_as_advanced(LAPACKE_lapacke_LIBRARY) + + # If found, add path to cmake variable + # ------------------------------------ + if (LAPACKE_lapacke_LIBRARY) + get_filename_component(lapacke_lib_path "${LAPACKE_lapacke_LIBRARY}" PATH) + # set cmake variables + set(LAPACKE_LIBRARIES "${LAPACKE_lapacke_LIBRARY}") + set(LAPACKE_LIBRARY_DIRS "${lapacke_lib_path}") + else () + set(LAPACKE_LIBRARIES "LAPACKE_LIBRARIES-NOTFOUND") + set(LAPACKE_LIBRARY_DIRS "LAPACKE_LIBRARY_DIRS-NOTFOUND") + if (NOT LAPACKE_FIND_QUIETLY) + message(STATUS "Looking for lapacke -- lib lapacke not found") + endif() + endif () + + # check a function to validate the find + if(LAPACKE_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # LAPACKE + if (LAPACKE_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${LAPACKE_INCLUDE_DIRS}") + endif() + if (LAPACKE_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${LAPACKE_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${LAPACKE_LIBRARIES}") + # LAPACK + if (LAPACK_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${LAPACK_INCLUDE_DIRS}") + endif() + if (LAPACK_LIBRARY_DIRS) + list(APPEND REQUIRED_LIBDIRS "${LAPACK_LIBRARY_DIRS}") + endif() + list(APPEND REQUIRED_LIBS "${LAPACK_LIBRARIES}") + if (LAPACK_LINKER_FLAGS) + list(APPEND REQUIRED_LDFLAGS "${LAPACK_LINKER_FLAGS}") + endif() + # Fortran + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_ifcore_LIBRARY}") + endif() + endif() + # m + find_library(M_LIBRARY NAMES m HINTS ${_lib_env}) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(LAPACKE_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(LAPACKE_dgeqrf LAPACKE_WORKS) + mark_as_advanced(LAPACKE_WORKS) + + if(LAPACKE_WORKS) + # save link with dependencies + set(LAPACKE_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(LAPACKE_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(LAPACKE_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(LAPACKE_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES LAPACKE_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES LAPACKE_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES LAPACKE_LINKER_FLAGS) + else() + if(NOT LAPACKE_FIND_QUIETLY) + message(STATUS "Looking for lapacke: test of LAPACKE_dgeqrf with lapacke and lapack libraries fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) + endif(LAPACKE_LIBRARIES) + + endif (LAPACKE_STANDALONE OR NOT LAPACKE_WORKS) + +else(LAPACK_FOUND) + + if (NOT LAPACKE_FIND_QUIETLY) + message(STATUS "LAPACKE requires LAPACK but LAPACK has not been found." + "Please look for LAPACK first.") + endif() + +endif(LAPACK_FOUND) + +if (LAPACKE_LIBRARIES) + list(GET LAPACKE_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") + string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") + set(LAPACKE_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of LAPACKE library" FORCE) + else() + set(LAPACKE_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of LAPACKE library" FORCE) + endif() +endif() +mark_as_advanced(LAPACKE_DIR) +mark_as_advanced(LAPACKE_DIR_FOUND) + +# check that LAPACKE has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LAPACKE DEFAULT_MSG + LAPACKE_LIBRARIES + LAPACKE_WORKS) diff -Nru diamond-aligner-2.0.6/FindLAPACKEXT.cmake diamond-aligner-2.0.7/FindLAPACKEXT.cmake --- diamond-aligner-2.0.6/FindLAPACKEXT.cmake 1970-01-01 00:00:00.000000000 +0000 +++ diamond-aligner-2.0.7/FindLAPACKEXT.cmake 2021-02-12 11:50:56.000000000 +0000 @@ -0,0 +1,338 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2017 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find LAPACK EXTENDED for MORSE projects: find include dirs and libraries +# +# This module allows to find LAPACK libraries by calling the official FindLAPACK module +# and handles the creation of different library lists whether the user wishes to link +# with a sequential LAPACK or a multihreaded (LAPACK_SEQ_LIBRARIES and LAPACK_PAR_LIBRARIES). +# LAPACK is detected with a FindLAPACK call and if the BLAS vendor is in the following list, +# Intel mkl, ACML then the module tries find the corresponding multithreaded libraries +# +# The following variables have been added to manage links with sequential or multithreaded +# versions: +# LAPACK_INCLUDE_DIRS - LAPACK include directories +# LAPACK_LIBRARY_DIRS - Link directories for LAPACK libraries +# LAPACK_SEQ_LIBRARIES - LAPACK component libraries to be linked (sequential) +# LAPACK_PAR_LIBRARIES - LAPACK component libraries to be linked (multithreaded) +# LAPACKEXT_FOUND - if a LAPACK has been found +# LAPACKEXT_LIBRARIES - Idem LAPACK_LIBRARIES +# LAPACKEXT_INCLUDE_DIRS - Idem LAPACK_INCLUDE_DIRS +# LAPACKEXT_LIBRARY_DIRS - Idem LAPACK_LIBRARY_DIRS + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2017 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +# Macro to factorize this call. required arguments allows to decide if +# the REQUIRED option must be given to find_package calls +macro(find_package_lapack required) + if(LAPACKEXT_FIND_REQUIRED AND required) + if(LAPACKEXT_FIND_QUIETLY) + find_package(LAPACK REQUIRED QUIET) + else() + find_package(LAPACK REQUIRED) + endif() + else() + if(LAPACKEXT_FIND_QUIETLY) + find_package(LAPACK QUIET) + else() + find_package(LAPACK) + endif() + endif() +endmacro() + +# LAPACKEXT depends on BLAS +if (NOT BLAS_FOUND) + if(LAPACKEXT_FIND_QUIETLY) + find_package(BLAS QUIET) + else() + find_package(BLAS) + endif() +endif () + +if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "In FindLAPACKEXT") +endif() + +if(BLA_VENDOR MATCHES "Intel*") + + ### + # look for include path if the LAPACK vendor is Intel + ### + + # gather system include paths + unset(_inc_env) + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + set(ENV_MKLROOT "$ENV{MKLROOT}") + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + list(REMOVE_DUPLICATES _inc_env) + + if (BLAS_DIR) + set(LAPACK_DIR ${BLAS_DIR}) + endif () + if (BLAS_INCDIR) + set(LAPACK_INCDIR ${BLAS_INCDIR}) + endif () + # find mkl.h inside known include paths + set(LAPACK_mkl_lapack.h_INCLUDE_DIRS "LAPACK_mkl_lapack.h_INCLUDE_DIRS-NOTFOUND") + if(LAPACK_INCDIR) + find_path(LAPACK_mkl_lapack.h_INCLUDE_DIRS + NAMES mkl_lapack.h + HINTS ${LAPACK_INCDIR}) + else() + if(LAPACK_DIR) + find_path(LAPACK_mkl_lapack.h_INCLUDE_DIRS + NAMES mkl_lapack.h + HINTS ${LAPACK_DIR} + PATH_SUFFIXES include) + else() + find_path(LAPACK_mkl_lapack.h_INCLUDE_DIRS + NAMES mkl_lapack.h + HINTS ${_inc_env}) + endif() + endif() + mark_as_advanced(LAPACK_mkl_lapack.h_INCLUDE_DIRS) + ## Print status if not found + ## ------------------------- + #if (NOT LAPACK_mkl_lapack.h_INCLUDE_DIRS) + # Print_Find_Header_Status(lapack mkl_lapack.h) + #endif () + set(LAPACK_INCLUDE_DIRS "") + if(LAPACK_mkl_lapack.h_INCLUDE_DIRS) + list(APPEND LAPACK_INCLUDE_DIRS "${LAPACK_mkl_lapack.h_INCLUDE_DIRS}" ) + endif() + + ### + # look for libs + ### + + if (BLA_VENDOR MATCHES "Intel10_64lp*") + ## look for the sequential version + set(BLA_VENDOR "Intel10_64lp_seq") + endif() + find_package_lapack(0) + + if (LAPACK_FOUND) + if(BLAS_SEQ_LIBRARIES) + set(LAPACK_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES}") + else() + set(LAPACK_SEQ_LIBRARIES "${LAPACK_SEQ_LIBRARIES-NOTFOUND}") + endif() + # if BLAS Intel 10 64 bit -> save sequential and multithreaded versions + if(BLA_VENDOR MATCHES "Intel10_64lp*") + if(BLAS_PAR_LIBRARIES) + set(LAPACK_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES}") + else() + set(LAPACK_PAR_LIBRARIES "${LAPACK_PAR_LIBRARIES-NOTFOUND}") + endif() + endif() + endif() + +elseif(BLA_VENDOR MATCHES "IBMESSL*") + + ## look for the sequential version + set(BLA_VENDOR "IBMESSL") + find_package_lapack(0) + + if (LAPACK_FOUND) + if(LAPACK_LIBRARIES) + set(LAPACK_SEQ_LIBRARIES "${LAPACK_LIBRARIES}") + else() + set(LAPACK_SEQ_LIBRARIES "${LAPACK_SEQ_LIBRARIES-NOTFOUND}") + endif() + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "IBMESSLMT") + find_package_lapack(0) + + if (LAPACK_FOUND) + if(LAPACK_LIBRARIES) + set(LAPACK_PAR_LIBRARIES "${LAPACK_LIBRARIES}") + else() + set(LAPACK_PAR_LIBRARIES "${LAPACK_PAR_LIBRARIES-NOTFOUND}") + endif() + endif() + +elseif(BLA_VENDOR MATCHES "ACML*") + + ### + # look for libs + ### + find_package_lapack(0) + + if (LAPACK_FOUND) + if(BLAS_SEQ_LIBRARIES) + set(LAPACK_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES}") + else() + set(LAPACK_SEQ_LIBRARIES "${LAPACK_SEQ_LIBRARIES-NOTFOUND}") + endif() + if(BLAS_PAR_LIBRARIES) + set(LAPACK_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES}") + else() + set(LAPACK_PAR_LIBRARIES "${LAPACK_PAR_LIBRARIES-NOTFOUND}") + endif() + endif() + +else() + + ## look for a sequential version + # call to the cmake official FindLAPACK module + # This module sets the following variables: + # LAPACK_FOUND - set to true if a library implementing the LAPACK interface + # is found + # LAPACK_LINKER_FLAGS - uncached list of required linker flags (excluding -l + # and -L). + # LAPACK_LIBRARIES - uncached list of libraries (using full path name) to + # link against to use LAPACK + # LAPACK95_LIBRARIES - uncached list of libraries (using full path name) + # to link against to use LAPACK95 interface + # LAPACK95_FOUND - set to true if a library implementing the LAPACK f95 interface + # is found + # BLA_STATIC if set on this determines what kind of linkage we do (static) + # BLA_VENDOR if set checks only the specified vendor, if not set checks + # all the possibilities + # BLA_F95 if set on tries to find the f95 interfaces for LAPACK/LAPACK + # Remark: it looks only into paths contained in the system environment variables + find_package_lapack(0) + + if(LAPACK_FOUND) + set(LAPACK_SEQ_LIBRARIES "${LAPACK_LIBRARIES}") + else() + set(LAPACK_SEQ_LIBRARIES "${LAPACK_SEQ_LIBRARIES-NOTFOUND}") + endif() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + +endif() + +if (LAPACK_SEQ_LIBRARIES) + set(LAPACK_LIBRARIES "${LAPACK_SEQ_LIBRARIES}") +endif() + +# extract libs paths +# remark: because it is not given by find_package(LAPACK) +set(LAPACK_LIBRARY_DIRS "") +string(REPLACE " " ";" LAPACK_LIBRARIES "${LAPACK_LIBRARIES}") +foreach(lapack_lib ${LAPACK_LIBRARIES}) + if (EXISTS "${lapack_lib}") + get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH) + list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" ) + else() + string(REPLACE "-L" "" lapack_lib "${lapack_lib}") + if (EXISTS "${lapack_lib}") + list(APPEND LAPACK_LIBRARY_DIRS "${lapack_lib}" ) + else() + get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH) + if (EXISTS "${a_lapack_lib_dir}") + list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" ) + endif() + endif() + endif() +endforeach() +if (LAPACK_LIBRARY_DIRS) + list(REMOVE_DUPLICATES LAPACK_LIBRARY_DIRS) +endif () + +# check that LAPACKEXT has been found +# ----------------------------------- +include(FindPackageHandleStandardArgs) +if(BLA_VENDOR MATCHES "Intel*") + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK found is Intel MKL") + message(STATUS "LAPACK sequential libraries stored in LAPACK_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_SEQ_LIBRARIES + LAPACK_LIBRARY_DIRS + LAPACK_INCLUDE_DIRS) + if(BLA_VENDOR MATCHES "Intel10_64lp*" AND LAPACK_PAR_LIBRARIES) + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK parallel libraries stored in LAPACK_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_PAR_LIBRARIES) + endif() +elseif(BLA_VENDOR MATCHES "ACML*") + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK found is ACML") + message(STATUS "LAPACK sequential libraries stored in LAPACK_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_SEQ_LIBRARIES + LAPACK_LIBRARY_DIRS) + if(LAPACK_PAR_LIBRARIES) + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK parallel libraries stored in LAPACK_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_PAR_LIBRARIES) + endif() +elseif(BLA_VENDOR MATCHES "IBMESSL*") + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK found is IBMESSL") + message(STATUS "LAPACK sequential libraries stored in LAPACK_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_SEQ_LIBRARIES + LAPACK_LIBRARY_DIRS) + if(LAPACK_PAR_LIBRARIES) + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK parallel libraries stored in LAPACK_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_PAR_LIBRARIES) + endif() +else() + if(NOT LAPACKEXT_FIND_QUIETLY) + message(STATUS "LAPACK sequential libraries stored in LAPACK_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(LAPACKEXT DEFAULT_MSG + LAPACK_SEQ_LIBRARIES + LAPACK_LIBRARY_DIRS) +endif() + +if (LAPACK_LIBRARIES) + set(LAPACKEXT_LIBRARIES ${LAPACK_LIBRARIES}) +endif() +if (LAPACK_INCLUDE_DIRS) + set(LAPACKEXT_INCLUDE_DIRS ${LAPACK_INCLUDE_DIRS}) +endif() +if (LAPACK_LIBRARY_DIRS) + set(LAPACKEXT_LIBRARY_DIRS ${LAPACK_LIBRARY_DIRS}) +endif() diff -Nru diamond-aligner-2.0.6/README.md diamond-aligner-2.0.7/README.md --- diamond-aligner-2.0.6/README.md 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/README.md 2021-02-12 11:50:56.000000000 +0000 @@ -1,3 +1,50 @@ ![diamond](http://www.diamondsearch.org/diamond_white_95px.png) -The DIAMOND protein aligner - [http://www.diamondsearch.org](http://www.diamondsearch.org) +Introduction +============ + +DIAMOND is a sequence aligner for protein and translated DNA searches, +designed for high performance analysis of big sequence data. The key +features are: + +- Pairwise alignment of proteins and translated DNA at 100x-10,000x + speed of BLAST. +- Frameshift alignments for long read analysis. +- Low resource requirements and suitable for running on standard + desktops or laptops. +- Various output formats, including BLAST pairwise, tabular and XML, + as well as taxonomic classification. + +Keep posted about new developments by following me on +[Twitter](https://twitter.com/bbuchfink). + +[![Build Status](https://travis-ci.org/bbuchfink/diamond.svg?branch=master)](https://travis-ci.org/bbuchfink/diamond) +[![image](https://img.shields.io/github/downloads/bbuchfink/diamond/total)](https://github.com/bbuchfink/diamond/releases) +[![image](https://anaconda.org/bioconda/diamond/badges/version.svg)](https://anaconda.org/bioconda/diamond) +[![image](https://anaconda.org/bioconda/diamond/badges/downloads.svg)](https://anaconda.org/bioconda/diamond) +[![image](http://diamondsearch.org/cit.svg)](https://scholar.google.com/citations?user=kjPIF1cAAAAJ) + +Documentation +============= +The online documentation is located at the [GitHub Wiki](https://github.com/bbuchfink/diamond/wiki). + +Support +======= +The [issue tracker](https://github.com/bbuchfink/diamond/issues) as well as the newly introduced [GitHub discussions](https://github.com/bbuchfink/diamond/discussions) are open for support requests of any kind. + +About +===== +DIAMOND is developed by Benjamin Buchfink at the Drost lab, Max +Planck Institute for Developmental Biology, Tübingen, Germany. + +\[[Email](mailto:buchfink@gmail.com)\] +\[[Twitter](https://twitter.com/bbuchfink)\] \[[Google +Scholar](https://scholar.google.de/citations?user=kjPIF1cAAAAJ)\] +\[[Drost lab](https://drostlab.com/)\] +\[[MPI-EBIO](http://eb.tuebingen.mpg.de/)\] + +Publication: + +- Buchfink B, Xie C, Huson DH, \"Fast and sensitive protein alignment + using DIAMOND\", *Nature Methods* **12**, 59-60 (2015). + [doi:10.1038/nmeth.3176](https://doi.org/10.1038/nmeth.3176) diff -Nru diamond-aligner-2.0.6/src/align/extend.cpp diamond-aligner-2.0.7/src/align/extend.cpp --- diamond-aligner-2.0.6/src/align/extend.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/extend.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -39,6 +39,7 @@ using std::array; using std::pair; using std::endl; +using std::move; namespace Extension { @@ -65,16 +66,17 @@ const sequence *query_seq, int source_query_len, const Bias_correction *query_cb, - const double* query_comp, + const Stats::Composition& query_comp, FlatArray &seed_hits, vector &target_block_ids, const Metadata& metadata, Statistics& stat, int flags) { + static const size_t GAPPED_FILTER_MIN_QLEN = 85; stat.inc(Statistics::TARGET_HITS2, target_block_ids.size()); task_timer timer(flags & DP::PARALLEL ? config.target_parallel_verbosity : UINT_MAX); - if (config.gapped_filter_evalue > 0.0 && config.global_ranking_targets == 0) { + if (config.gapped_filter_evalue > 0.0 && config.global_ranking_targets == 0 && (!align_mode.query_translated || query_seq[0].length() >= GAPPED_FILTER_MIN_QLEN)) { timer.go("Computing gapped filter"); gapped_filter(query_seq, query_cb, seed_hits, target_block_ids, stat, flags, params); if ((flags & DP::PARALLEL) == 0) @@ -92,14 +94,15 @@ vector extend( size_t query_id, - const Parameters ¶ms, - const Metadata &metadata, - Statistics &stat, + const Parameters& params, + const Metadata& metadata, + Statistics& stat, int flags, const FlatArray& seed_hits, const vector& target_block_ids, const vector& target_scores) { + const unsigned UNIFIED_TARGET_LEN = 50; const unsigned contexts = align_mode.query_contexts; vector query_seq; vector query_cb; @@ -109,7 +112,8 @@ log_stream << "Query=" << query_title << " Hits=" << seed_hits.data_size() << endl; for (unsigned i = 0; i < contexts; ++i) - query_seq.push_back(query_seqs::get()[query_id*contexts + i]); + query_seq.push_back(query_seqs::get()[query_id * contexts + i]); + const unsigned query_len = (unsigned)query_seq.front().length(); task_timer timer(flags & DP::PARALLEL ? config.target_parallel_verbosity : UINT_MAX); if (Stats::CBS::hauser(config.comp_based_stats)) { @@ -118,19 +122,22 @@ query_cb.emplace_back(query_seq[i]); timer.finish(); } - vector query_comp; + Stats::Composition query_comp; if (Stats::CBS::matrix_adjust(config.comp_based_stats)) query_comp = Stats::composition(query_seq[0]); const int source_query_len = align_mode.query_translated ? (int)query_source_seqs::get()[query_id].length() : (int)query_seqs::get()[query_id].length(); - const int relaxed_cutoff = score_matrix.rawscore(config.min_bit_score == 0.0 - ? score_matrix.bitscore(config.max_evalue * config.relaxed_evalue_factor, (unsigned)query_seq[0].length()) - : config.min_bit_score); const size_t target_count = target_block_ids.size(); const size_t chunk_size = ranking_chunk_size(target_count); vector::const_iterator i0 = target_scores.cbegin(), i1 = std::min(i0 + chunk_size, target_scores.cend()); - if (config.toppercent == 100.0) - while (i1 < target_scores.cend() && i1->score >= relaxed_cutoff && size_t(i1 - i0) < config.max_alignments) ++i1; + + if (config.toppercent == 100.0 && config.min_bit_score == 0.0) +#ifdef EVAL_TARGET + while (i1 < target_scores.cend() && i1->evalue <= config.max_evalue && size_t(i1 - i0) < config.max_alignments) ++i1; +#else + while (i1 < target_scores.cend() && score_matrix.evalue(i1->score, query_len, UNIFIED_TARGET_LEN) <= config.max_evalue) i1 += 16; +#endif + const int low_score = config.query_memory ? memory->low_score(query_id) : 0; const size_t previous_count = config.query_memory ? memory->count(query_id) : 0; bool first_round_traceback = config.min_id > 0 || config.query_cover > 0 || config.subject_cover > 0; @@ -138,7 +145,7 @@ int tail_score = 0; if (first_round_traceback) flags |= DP::TRACEBACK; - TLS_FIX_S390X FlatArray seed_hits_chunk; + thread_local FlatArray seed_hits_chunk; thread_local vector target_block_ids_chunk; vector aligned_targets; @@ -157,20 +164,20 @@ } } else { - target_block_ids_chunk = TLS_FIX_S390X_MOVE(target_block_ids); - seed_hits_chunk = TLS_FIX_S390X_MOVE(seed_hits); + target_block_ids_chunk = move(target_block_ids); + seed_hits_chunk = move(seed_hits); } //multiplier = std::max(multiplier, chunk_size_multiplier(seed_hits_chunk, (int)query_seq.front().length())); - vector v = extend(params, query_id, query_seq.data(), source_query_len, query_cb.data(), query_comp.data(), seed_hits_chunk, target_block_ids_chunk, metadata, stat, flags); + vector v = extend(params, query_id, query_seq.data(), source_query_len, query_cb.data(), query_comp, seed_hits_chunk, target_block_ids_chunk, metadata, stat, flags); const size_t n = v.size(); stat.inc(Statistics::TARGET_HITS4, v.size()); bool new_hits = false; if (multi_chunk) new_hits = append_hits(aligned_targets, v.begin(), v.end(), chunk_size, source_query_len, query_title, query_seq.front()); else - aligned_targets = TLS_FIX_S390X_MOVE(v); + aligned_targets = move(v); if (n == 0 || !new_hits) { if (config.query_memory && current_chunk_size >= chunk_size) @@ -207,10 +214,10 @@ vector extend(const Parameters ¶ms, size_t query_id, hit* begin, hit* end, const Metadata &metadata, Statistics &stat, int flags) { task_timer timer(flags & DP::PARALLEL ? config.target_parallel_verbosity : UINT_MAX); timer.go("Loading seed hits"); - TLS_FIX_S390X FlatArray seed_hits; + thread_local FlatArray seed_hits; thread_local vector target_block_ids; thread_local vector target_scores; - load_hits(begin, end, seed_hits, target_block_ids, target_scores); + load_hits(begin, end, seed_hits, target_block_ids, target_scores, (unsigned)query_seqs::get()[query_id * align_mode.query_contexts].length()); stat.inc(Statistics::TARGET_HITS0, target_block_ids.size()); stat.inc(Statistics::TIME_LOAD_HIT_TARGETS, timer.microseconds()); timer.finish(); diff -Nru diamond-aligner-2.0.6/src/align/extend.h diamond-aligner-2.0.7/src/align/extend.h --- diamond-aligner-2.0.6/src/align/extend.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/extend.h 2021-02-12 11:50:56.000000000 +0000 @@ -62,8 +62,8 @@ TextBuffer* generate_output(vector &targets, size_t query_block_id, Statistics &stat, const Metadata &metadata, const Parameters ¶meters); TextBuffer* generate_intermediate_output(const vector &targets, size_t query_block_id); -inline int raw_score_cutoff(size_t query_len) { +/*inline int raw_score_cutoff(size_t query_len) { return score_matrix.rawscore(config.min_bit_score == 0 ? score_matrix.bitscore(config.max_evalue, (unsigned)query_len) : config.min_bit_score); -} +}*/ } diff -Nru diamond-aligner-2.0.6/src/align/gapped.cpp diamond-aligner-2.0.7/src/align/gapped.cpp --- diamond-aligner-2.0.6/src/align/gapped.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/gapped.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -83,16 +83,23 @@ max_hsp_culling(); } -void add_dp_targets(const WorkTarget &target, int target_idx, const sequence *query_seq, array, 2>, MAX_CONTEXT> &dp_targets, int flags) { +void add_dp_targets(const WorkTarget& target, int target_idx, const sequence* query_seq, array, 3>, MAX_CONTEXT>& dp_targets, int flags) { const int band = Extension::band((int)query_seq->length()), slen = (int)target.seq.length(); + const size_t qlen = query_seq[0].length(); const Stats::TargetMatrix* matrix = target.matrix.scores.empty() ? nullptr : &target.matrix; for (unsigned frame = 0; frame < align_mode.query_contexts; ++frame) { if (config.ext == "full") { - int b = target.ungapped_score <= config.cutoff_score_8bit ? 0 : 1; - if ((flags & DP::TRACEBACK) && query_seq[0].length() >= 256) - b = 1; + if (target.ungapped_score[frame] == 0) + continue; + int b = target.ungapped_score[frame] <= config.cutoff_score_8bit ? 0 : 1; + if (flags & DP::TRACEBACK) { + if (query_seq[0].length() >= 256) + b = 1; + if (query_seq[0].length() * target.seq.length() > config.max_swipe_dp) + b = 2; + } dp_targets[frame][b].emplace_back(target.seq, 0, 0, 0, 0, target_idx, (int)query_seq->length()); continue; } @@ -111,6 +118,8 @@ j0 = std::min(j0, hsp.subject_range.begin_); j1 = std::max(j1, hsp.subject_range.end_); bits = std::max(bits, (hsp.score <= config.cutoff_score_8bit && (d1 - d0) < 256) ? 0 : 1); + if (flags & DP::TRACEBACK && size_t(d1 - d0) * qlen > config.max_swipe_dp) + bits = 2; } else { if (d0 != INT_MAX) @@ -120,6 +129,8 @@ j0 = hsp.subject_range.begin_; j1 = hsp.subject_range.end_; bits = (hsp.score <= config.cutoff_score_8bit && (d1 - d0) < 256) ? 0 : 1; + if (flags & DP::TRACEBACK && size_t(d1 - d0) * qlen > config.max_swipe_dp) + bits = 2; } } @@ -128,7 +139,7 @@ } vector align(const vector &targets, const sequence *query_seq, const Bias_correction *query_cb, int source_query_len, int flags, Statistics &stat) { - array, 2>, MAX_CONTEXT> dp_targets; + array, 3>, MAX_CONTEXT> dp_targets; vector r; if (targets.empty()) return r; @@ -138,7 +149,7 @@ add_dp_targets(targets[i], i, query_seq, dp_targets, flags); if (targets[i].adjusted_matrix()) ++cbs_targets; - r.emplace_back(targets[i].block_id, targets[i].seq, targets[i].ungapped_score, targets[i].matrix); + r.emplace_back(targets[i].block_id, targets[i].seq, targets[i].ungapped_score.front(), targets[i].matrix); } stat.inc(Statistics::TARGET_HITS3_CBS, cbs_targets); @@ -152,6 +163,7 @@ query_seq[frame], dp_targets[frame][0], dp_targets[frame][1], + dp_targets[frame][2], nullptr, Frame(frame), Stats::CBS::hauser(config.comp_based_stats) ? &query_cb[frame] : nullptr, @@ -183,6 +195,7 @@ query_seq[0], v, v, + v, &target_it, Frame(0), Stats::CBS::hauser(config.comp_based_stats) ? &query_cb[0] : nullptr, @@ -202,7 +215,7 @@ return r; } -void add_dp_targets(const Target &target, int target_idx, const sequence *query_seq, array, 2>, MAX_CONTEXT> &dp_targets) { +void add_dp_targets(const Target &target, int target_idx, const sequence *query_seq, array, 3>, MAX_CONTEXT> &dp_targets) { const int band = Extension::band((int)query_seq->length()), slen = (int)target.seq.length(); const Stats::TargetMatrix* matrix = target.adjusted_matrix() ? &target.matrix : nullptr; @@ -210,14 +223,17 @@ const int qlen = (int)query_seq[frame].length(); for (const Hsp &hsp : target.hsp[frame]) { const bool byte_row_counter = config.ext == "full" ? qlen < 256 : (hsp.d_end - hsp.d_begin < 256); - vector& v = (hsp.score < 255 && byte_row_counter) ? dp_targets[frame][0] : dp_targets[frame][1]; - v.emplace_back(target.seq, hsp.d_begin, hsp.d_end, hsp.seed_hit_range.begin_, hsp.seed_hit_range.end_, target_idx, qlen, matrix); + int b = (hsp.score < 255 && byte_row_counter) ? 0 : 1; + const size_t dp_size = config.ext == "full" ? query_seq[0].length() * target.seq.length() : query_seq[0].length() * size_t(hsp.d_end - hsp.d_begin); + if (dp_size > config.max_swipe_dp) + b = 2; + dp_targets[frame][b].emplace_back(target.seq, hsp.d_begin, hsp.d_end, hsp.seed_hit_range.begin_, hsp.seed_hit_range.end_, target_idx, qlen, matrix); } } } vector align(vector &targets, const sequence *query_seq, const Bias_correction *query_cb, int source_query_len, int flags, Statistics &stat, bool first_round_traceback) { - array, 2>, MAX_CONTEXT> dp_targets; + array, 3>, MAX_CONTEXT> dp_targets; vector r; if (targets.empty()) return r; @@ -246,6 +262,7 @@ query_seq[frame], dp_targets[frame][0], dp_targets[frame][1], + dp_targets[frame][2], nullptr, Frame(frame), Stats::CBS::hauser(config.comp_based_stats) ? &query_cb[frame] : nullptr, diff -Nru diamond-aligner-2.0.6/src/align/gapped_filter.cpp diamond-aligner-2.0.7/src/align/gapped_filter.cpp --- diamond-aligner-2.0.6/src/align/gapped_filter.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/gapped_filter.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -32,44 +32,6 @@ namespace Extension { -/*bool gapped_filter(const LongScoreProfile *query_profile, const WorkTarget& target, Statistics &stat) { - const int slen = (int)target.seq.length(), qlen = (int)query_profile[0].length(); - int scores[128]; - stat.inc(Statistics::GAPPED_FILTER_TARGETS); - for (unsigned frame = 0; frame < align_mode.query_contexts; ++frame) - for (const Hsp_traits& hsp : target.hsp[frame]) { - stat.inc(Statistics::GAPPED_FILTER_HITS1); - const int d = std::max((hsp.d_max + hsp.d_min) / 2 - 128 / 2, -(slen - 1)), - j0 = std::max(hsp.subject_range.begin_ - config.gapped_filter_window, 0), - j1 = std::min(hsp.subject_range.end_ + config.gapped_filter_window, slen); - DP::scan_diags128(query_profile[frame], target.seq, d, j0, j1, scores); - const int score = DP::diag_alignment(scores, 128); - if (score_cutoff(score, qlen)) - return true; - } - return false; -} - -vector gapped_filter(const sequence* query, const Bias_correction* query_cbs, std::vector& targets, Statistics &stat) { - vector out; - vector query_profile; - query_profile.reserve(align_mode.query_contexts); - for (unsigned i = 0; i < align_mode.query_contexts; ++i) - query_profile.emplace_back(query[i], query_cbs[i]); - const int qlen = (int)query[0].length(); - - for (WorkTarget& target : targets) { - if (score_cutoff(target.filter_score, qlen)) { - out.push_back(std::move(target)); - continue; - } - if(gapped_filter(query_profile.data(), target, stat)) - out.push_back(std::move(target)); - } - - return out; -}*/ - int gapped_filter(const SeedHit &hit, const LongScoreProfile *query_profile, const sequence &target, int band, int window, std::function f) { const int slen = (int)target.length(); const int d = std::max(hit.diag() - band / 2, -(slen - 1)), @@ -81,17 +43,20 @@ } bool gapped_filter(const SeedHit *begin, const SeedHit *end, const LongScoreProfile *query_profile, uint32_t target_block_id, Statistics &stat, const Parameters ¶ms) { - constexpr int window1 = 100; + constexpr int window1 = 100, MIN_STAGE2_QLEN = 100; const int qlen = (int)query_profile->length(); const sequence target = ref_seqs::get()[target_block_id]; + const int slen = (int)target.length(); for (const SeedHit* hit = begin; hit < end; ++hit) { stat.inc(Statistics::GAPPED_FILTER_HITS1); const int f1 = gapped_filter(*hit, query_profile, target, 64, window1, DP::scan_diags64); - if(f1 > params.cutoff_gapped1(qlen)) { + if (f1 > params.cutoff_gapped1_new(qlen, slen)) { stat.inc(Statistics::GAPPED_FILTER_HITS2); + if (qlen < MIN_STAGE2_QLEN && align_mode.query_translated) + return true; const int f2 = gapped_filter(*hit, query_profile, target, 128, config.gapped_filter_window, DP::scan_diags128); - if(f2 > params.cutoff_gapped2(qlen)) + if (f2 > params.cutoff_gapped2_new(qlen, slen)) return true; } } diff -Nru diamond-aligner-2.0.6/src/align/global_ranking/extend.cpp diamond-aligner-2.0.7/src/align/global_ranking/extend.cpp --- diamond-aligner-2.0.6/src/align/global_ranking/extend.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/global_ranking/extend.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -94,6 +94,7 @@ vector block2db_id; db.rewind(); db.load_seqs(&block2db_id, SIZE_MAX, &ref_seqs::data_, &ref_ids::data_, true, &ranking_db_filter, true); + ReferenceDictionary::get().set_block2db(&block2db_id); TargetMap db2block_id; db2block_id.reserve(block2db_id.size()); for (size_t i = 0; i < block2db_id.size(); ++i) @@ -112,7 +113,7 @@ OutputSink::instance.reset(new OutputSink(0, &master_out)); uint32_t next_query = 0; vector threads; - for (size_t i = 0; i < config.threads_; ++i) + for (size_t i = 0; i < (config.threads_align ? config.threads_align : config.threads_); ++i) threads.emplace_back(align_worker, &query_list, &db2block_id, ¶ms, &metadata, &next_query); for (auto& i : threads) i.join(); diff -Nru diamond-aligner-2.0.6/src/align/load_hits.cpp diamond-aligner-2.0.7/src/align/load_hits.cpp --- diamond-aligner-2.0.6/src/align/load_hits.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/load_hits.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -22,7 +22,7 @@ namespace Extension { -void load_hits(hit* begin, hit* end, FlatArray &hits, vector &target_block_ids, vector &target_scores) { +void load_hits(hit* begin, hit* end, FlatArray &hits, vector &target_block_ids, vector &target_scores, unsigned query_len) { hits.clear(); hits.reserve(end - begin); target_block_ids.clear(); @@ -31,6 +31,7 @@ return; std::sort(begin, end, hit::CmpSubject()); const size_t total_subjects = ref_seqs::get().get_length(); + unsigned target_len; uint32_t target = UINT32_MAX; uint16_t score = 0; if (std::log2(total_subjects) * (end - begin) < total_subjects / 10) { @@ -39,11 +40,16 @@ const uint32_t t = (uint32_t)l.first; if (t != target) { if (target != UINT32_MAX) { +#ifdef EVAL_TARGET + target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score, score_matrix.evalue(score, query_len, target_len) }); +#else target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score }); +#endif score = 0; } hits.next(); target = t; + target_len = (unsigned)ref_seqs::get()[target].length(); target_block_ids.push_back(target); } hits.push_back({ (int)i->seed_offset_, (int)l.second, i->score_, i->query_ % align_mode.query_contexts }); @@ -58,19 +64,28 @@ uint32_t t = (uint32_t)(it - limit_begin) - 1; if (t != target) { if (target != UINT32_MAX) { +#ifdef EVAL_TARGET + target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score, score_matrix.evalue(score, query_len, target_len) }); +#else target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score }); +#endif score = 0; } hits.next(); target_block_ids.push_back(t); target = t; + target_len = (unsigned)ref_seqs::get()[target].length(); } hits.push_back({ (int)i->seed_offset_, (int)(subject_offset - *(it - 1)), i->score_, i->query_ % align_mode.query_contexts }); score = std::max(score, i->score_); } } if (target != UINT32_MAX) +#ifdef EVAL_TARGET + target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score, score_matrix.evalue(score, query_len, target_len) }); +#else target_scores.push_back({ uint32_t(target_block_ids.size() - 1), score }); +#endif } } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/align/target.h diamond-aligner-2.0.7/src/align/target.h --- diamond-aligner-2.0.6/src/align/target.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/target.h 2021-02-12 11:50:56.000000000 +0000 @@ -55,23 +55,23 @@ }; struct WorkTarget { - WorkTarget(size_t block_id, const sequence& seq, int query_len, const double* query_comp, const int16_t** query_matrix) : + WorkTarget(size_t block_id, const sequence& seq, int query_len, const Stats::Composition& query_comp, const int16_t** query_matrix) : block_id(block_id), - seq(seq), - ungapped_score(0) + seq(seq) { + ungapped_score.fill(0); if (config.comp_based_stats == Stats::CBS::HAUSER_AND_AVG_MATRIX_ADJUST) { const int l = (int)seq.length(); const auto c = Stats::composition(seq); - auto r = Stats::s_TestToApplyREAdjustmentConditional(query_len, l, query_comp, c.data(), score_matrix.background_freqs()); + auto r = Stats::s_TestToApplyREAdjustmentConditional(query_len, l, query_comp.data(), c.data(), score_matrix.background_freqs()); if (r == Stats::eCompoScaleOldMatrix) return; if (*query_matrix == nullptr) { - *query_matrix = Stats::make_16bit_matrix(Stats::CompositionMatrixAdjust(query_len, query_len, query_comp, query_comp, Stats::CBS::AVG_MATRIX_SCALE, score_matrix.ungapped_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs())); + *query_matrix = Stats::make_16bit_matrix(Stats::CompositionMatrixAdjust(query_len, query_len, query_comp.data(), query_comp.data(), Stats::CBS::AVG_MATRIX_SCALE, score_matrix.ideal_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs())); ++target_matrix_count; } if (target_matrices[block_id] == nullptr) { - int16_t* target_matrix = Stats::make_16bit_matrix(Stats::CompositionMatrixAdjust(l, l, c.data(), c.data(), Stats::CBS::AVG_MATRIX_SCALE, score_matrix.ungapped_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs())); + int16_t* target_matrix = Stats::make_16bit_matrix(Stats::CompositionMatrixAdjust(l, l, c.data(), c.data(), Stats::CBS::AVG_MATRIX_SCALE, score_matrix.ideal_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs())); bool del = false; { std::lock_guard lock(target_matrices_lock); @@ -93,12 +93,12 @@ } size_t block_id; sequence seq; - int ungapped_score; + std::array ungapped_score; std::array, MAX_CONTEXT> hsp; Stats::TargetMatrix matrix; }; -std::vector ungapped_stage(const sequence* query_seq, const Bias_correction* query_cb, const double* query_comp, FlatArray& seed_hits, const std::vector& target_block_ids, int flags, Statistics& stat); +std::vector ungapped_stage(const sequence* query_seq, const Bias_correction* query_cb, const Stats::Composition& query_comp, FlatArray& seed_hits, const std::vector& target_block_ids, int flags, Statistics& stat); struct Target { @@ -146,12 +146,19 @@ struct TargetScore { uint32_t target; uint16_t score; +#ifdef EVAL_TARGET + double evalue; +#endif bool operator<(const TargetScore& x) const { +#ifdef EVAL_TARGET + return evalue < x.evalue || (evalue == x.evalue && target < x.target); +#else return score > x.score || (score == x.score && target < x.target); +#endif } }; -void load_hits(hit* begin, hit* end, FlatArray &hits, std::vector &target_block_ids, std::vector &target_scores); +void load_hits(hit* begin, hit* end, FlatArray &hits, std::vector &target_block_ids, std::vector &target_scores, unsigned query_len); void culling(std::vector& targets, int source_query_len, const char* query_title, const sequence& query_seq, size_t min_keep); bool append_hits(std::vector& targets, std::vector::const_iterator begin, std::vector::const_iterator end, size_t chunk_size, int source_query_len, const char* query_title, const sequence& query_seq); std::vector gapped_filter(const sequence *query, const Bias_correction* query_cbs, std::vector& targets, Statistics &stat); diff -Nru diamond-aligner-2.0.6/src/align/ungapped.cpp diamond-aligner-2.0.7/src/align/ungapped.cpp --- diamond-aligner-2.0.6/src/align/ungapped.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/align/ungapped.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -48,25 +48,31 @@ std::mutex target_matrices_lock; atomic target_matrix_count(0); -WorkTarget ungapped_stage(SeedHit *begin, SeedHit *end, const sequence *query_seq, const Bias_correction *query_cb, const double* query_comp, const int16_t** query_matrix, uint32_t block_id, Statistics& stat) { +WorkTarget ungapped_stage(SeedHit *begin, SeedHit *end, const sequence *query_seq, const Bias_correction *query_cb, const Stats::Composition& query_comp, const int16_t** query_matrix, uint32_t block_id, Statistics& stat) { array, MAX_CONTEXT> diagonal_segments; task_timer timer; - WorkTarget target(block_id, ref_seqs::get()[block_id], (int)query_seq[0].length(), query_comp, query_matrix); + const bool masking = config.comp_based_stats == Stats::CBS::COMP_BASED_STATS_AND_MATRIX_ADJUST ? Stats::use_seg_masking(query_seq[0], ref_seqs_unmasked::get()[block_id]) : true; + WorkTarget target(block_id, masking ? ref_seqs::get()[block_id] : ref_seqs_unmasked::get()[block_id], Stats::count_true_aa(query_seq[0]), query_comp, query_matrix); stat.inc(Statistics::TIME_MATRIX_ADJUST, timer.microseconds()); if (!Stats::CBS::avg_matrix(config.comp_based_stats) && target.adjusted_matrix()) stat.inc(Statistics::MATRIX_ADJUST_COUNT); if (config.ext == "full") { - for (const SeedHit *hit = begin; hit < end; ++hit) - target.ungapped_score = std::max(target.ungapped_score, hit->score); + for (const SeedHit* hit = begin; hit < end; ++hit) + target.ungapped_score[hit->frame] = std::max(target.ungapped_score[hit->frame], hit->score); + return target; + } + if (end - begin == 1 && align_mode.query_translated) { + target.ungapped_score[begin->frame] = begin->score; + target.hsp[begin->frame].emplace_back(begin->diag(), begin->diag(), begin->score, begin->frame, interval(), interval()); return target; } std::sort(begin, end); for (const SeedHit *hit = begin; hit < end; ++hit) { - target.ungapped_score = std::max(target.ungapped_score, hit->score); + target.ungapped_score[hit->frame] = std::max(target.ungapped_score[hit->frame], hit->score); if (!diagonal_segments[hit->frame].empty() && diagonal_segments[hit->frame].back().diag() == hit->diag() && diagonal_segments[hit->frame].back().subject_end() >= hit->j) continue; - const auto d = xdrop_ungapped(query_seq[hit->frame], target.seq, hit->i, hit->j); + const Diagonal_segment d = xdrop_ungapped(query_seq[hit->frame], target.seq, hit->i, hit->j); if (d.score > 0) { diagonal_segments[hit->frame].push_back(d); } @@ -82,10 +88,10 @@ return target; } -void ungapped_stage_worker(size_t i, size_t thread_id, const sequence *query_seq, const Bias_correction *query_cb, const double* query_comp, FlatArray *seed_hits, const uint32_t*target_block_ids, vector *out, mutex *mtx, Statistics* stat) { +void ungapped_stage_worker(size_t i, size_t thread_id, const sequence *query_seq, const Bias_correction *query_cb, const Stats::Composition* query_comp, FlatArray *seed_hits, const uint32_t*target_block_ids, vector *out, mutex *mtx, Statistics* stat) { Statistics stats; const int16_t* query_matrix = nullptr; - WorkTarget target = ungapped_stage(seed_hits->begin(i), seed_hits->end(i), query_seq, query_cb, query_comp, &query_matrix, target_block_ids[i], stats); + WorkTarget target = ungapped_stage(seed_hits->begin(i), seed_hits->end(i), query_seq, query_cb, *query_comp, &query_matrix, target_block_ids[i], stats); { std::lock_guard guard(*mtx); out->push_back(std::move(target)); @@ -94,14 +100,15 @@ delete[] query_matrix; } -vector ungapped_stage(const sequence *query_seq, const Bias_correction *query_cb, const double* query_comp, FlatArray &seed_hits, const vector& target_block_ids, int flags, Statistics& stat) { +vector ungapped_stage(const sequence *query_seq, const Bias_correction *query_cb, const Stats::Composition& query_comp, FlatArray &seed_hits, const vector& target_block_ids, int flags, Statistics& stat) { vector targets; if (target_block_ids.size() == 0) return targets; + targets.reserve(target_block_ids.size()); const int16_t* query_matrix = nullptr; if (flags & DP::PARALLEL) { mutex mtx; - Util::Parallel::scheduled_thread_pool_auto(config.threads_, seed_hits.size(), ungapped_stage_worker, query_seq, query_cb, query_comp, &seed_hits, target_block_ids.data(), &targets, &mtx, &stat); + Util::Parallel::scheduled_thread_pool_auto(config.threads_, seed_hits.size(), ungapped_stage_worker, query_seq, query_cb, &query_comp, &seed_hits, target_block_ids.data(), &targets, &mtx, &stat); } else { for (size_t i = 0; i < target_block_ids.size(); ++i) diff -Nru diamond-aligner-2.0.6/src/basic/basic.cpp diamond-aligner-2.0.7/src/basic/basic.cpp --- diamond-aligner-2.0.6/src/basic/basic.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/basic/basic.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -28,7 +28,7 @@ #include "sequence.h" #include "masking.h" -const char* Const::version_string = "2.0.6"; +const char* Const::version_string = "2.0.7"; const char* Const::program_name = "diamond"; const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1"; @@ -328,7 +328,8 @@ "11010001010010111", "111010100001001011", "1101001001000100111" -} // 15 8x9 +}, // 15 8x9 +{ "111011001011010111" } // 16 1x12 }; diff -Nru diamond-aligner-2.0.6/src/basic/config.cpp diamond-aligner-2.0.7/src/basic/config.cpp --- diamond-aligner-2.0.6/src/basic/config.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/basic/config.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -105,11 +105,15 @@ } void Config::set_sens(Sensitivity sens) { - if (sensitivity != Sensitivity::FAST) + if (sensitivity != Sensitivity::DEFAULT) throw std::runtime_error("Sensitivity switches are mutually exclusive."); sensitivity = sens; } +std::string Config::single_query_file() const { + return query_file.empty() ? string() : query_file.front(); +} + Config::Config(int argc, const char **argv, bool check_io) { Command_line_parser parser; @@ -222,7 +226,8 @@ cluster.add() ("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo) ("cluster-steps", 0, "Clustering steps (\"fast\",\"mid-sensitive\",\"sensitive\", \"more-sensitive\", \"very-sensitive\", \"ultra-sensitive\")", cluster_steps, { "sensitive" }) - ("max-size-set", 0, "Maximum size of a set", max_size_set, (size_t) 100) + ("max-size-set", 0, "Maximum size of a set", max_size_set, (size_t) 1000) + ("external", 0, "save set of edges external", external, false) ("cluster-similarity", 0, "Clustering similarity measure", cluster_similarity) ("mcl-expansion", 0, "MCL expansion coefficient (default=2)", cluster_mcl_expansion, (uint32_t) 2) ("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0) @@ -283,6 +288,7 @@ ("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd, 0.0) ("id2", 0, "minimum number of identities for stage 1 hit", min_identities) ("xdrop", 'x', "xdrop for ungapped alignment", ungapped_xdrop, 12.3) + ("gapped-filter-evalue", 0, "E-value threshold for gapped filter (auto)", gapped_filter_evalue, -1.0) ("band", 0, "band for dynamic programming computation", padding) ("shapes", 's', "number of seed shapes (default=all available)", shapes) ("shape-mask", 0, "seed shapes", shape_mask) @@ -307,7 +313,9 @@ ("roc-file", 0, "", roc_file) ("family-map", 0, "", family_map) ("family-map-query", 0, "", family_map_query) - ("query-parallel-limit", 0, "", query_parallel_limit, 3000000u); + ("query-parallel-limit", 0, "", query_parallel_limit, 3000000u) + ("log-evalue-scale", 0, "", log_evalue_scale, 1.0 / std::log(2.0)) + ("bootstrap", 0, "", bootstrap); Options_group view_options("View options"); view_options.add() @@ -332,6 +340,10 @@ ("lambda", 0, "lambda parameter for custom matrix", lambda) ("K", 0, "K parameter for custom matrix", K); + double query_match_distance_threshold; + double length_ratio_threshold; + double cbs_angle; + #ifdef EXTRA Options_group hidden_options(""); #else @@ -408,17 +420,16 @@ ("chaining-range-cover", 0, "", chaining_range_cover, (size_t)8) ("index-mode", 0, "index mode (0=4x12, 1=16x9)", index_mode) ("no-swipe-realign", 0, "", no_swipe_realign) - ("bootstrap", 0, "", bootstrap) ("chaining-maxnodes", 0, "", chaining_maxnodes) ("cutoff-score-8bit", 0, "", cutoff_score_8bit, 240) ("min-band-overlap", 0, "", min_band_overlap, 0.2) ("min-realign-overhang", 0, "", min_realign_overhang, 30) ("ungapped-window", 0, "", ungapped_window, 48) ("gapped-filter-diag-score", 0, "", gapped_filter_diag_bit_score, 12.0) - ("gapped-filter-evalue", 0, "", gapped_filter_evalue, -1.0) ("gapped-filter-window", 0, "", gapped_filter_window, 200) ("output-hits", 0, "", output_hits) ("ungapped-evalue", 0, "", ungapped_evalue, -1.0) + ("ungapped-evalue-short", 0, "", ungapped_evalue_short, -1.0) ("no-logfile", 0, "", no_logfile) ("no-heartbeat", 0, "", no_heartbeat) ("band-bin", 0, "", band_bin, 24) @@ -428,7 +439,7 @@ ("tile-size", 0, "", tile_size, (uint32_t)1024) ("short-query-ungapped-bitscore", 0, "", short_query_ungapped_bitscore, 25.0) ("short-query-max-len", 0, "", short_query_max_len, 60) - ("gapped-filter-evalue1", 0, "", gapped_filter_evalue1, 1.0e+04) + ("gapped-filter-evalue1", 0, "", gapped_filter_evalue1, 2000.0) ("ext-yield", 0, "", ext_min_yield) ("full-sw-len", 0, "", full_sw_len) ("relaxed-evalue-factor", 0, "", relaxed_evalue_factor, 1.0) @@ -454,12 +465,19 @@ ("family-cap", 0, "", family_cap) ("cbs-matrix-scale", 0, "", cbs_matrix_scale, 1) ("query-count", 0, "", query_count, (size_t)1) - ("cbs-angle", 0, "", cbs_angle, 50.0) + ("cbs-angle", 0, "", cbs_angle, -1.0) ("target-seg", 0, "", target_seg, -1) ("cbs-err-tolerance", 0, "", cbs_err_tolerance, 0.00000001) ("cbs-it-limit", 0, "", cbs_it_limit, 2000) - ("query-match-distance-threshold", 0, "", query_match_distance_threshold, 0.0) - ("length-ratio-threshold", 0, "", length_ratio_threshold, 0.0); + ("hash_join_swap", 0, "", hash_join_swap) + ("deque_bucket_size", 0, "", deque_bucket_size, (size_t)524288) + ("query-match-distance-threshold", 0, "", query_match_distance_threshold, -1.0) + ("length-ratio-threshold", 0, "", length_ratio_threshold, -1.0) + ("fast", 0, "", mode_fast) + ("target-indexed", 0, "", target_indexed) + ("mmap-target-index", 0, "", mmap_target_index) + ("save-target-index", 0, "", save_target_index) + ("max-swipe-dp", 0, "", max_swipe_dp, (size_t)4000000); parser.add(general).add(makedb).add(cluster).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options).add(deprecated_options); parser.store(argc, argv, command); @@ -476,8 +494,8 @@ if (command == blastx && no_self_hits) throw std::runtime_error("--no-self-hits option is not supported in blastx mode."); - if (command == blastx && (ext == "full" || swipe_all)) - throw std::runtime_error("Full matrix extension is not supported in blastx mode."); + if (command == blastx && swipe_all) + throw std::runtime_error("Full db alignment is not supported in blastx mode."); if (long_reads) { query_range_culling = true; @@ -496,7 +514,11 @@ ext = "full"; } +#ifdef EXTRA if (comp_based_stats >= Stats::CBS::COUNT) +#else + if (comp_based_stats >= 5) +#endif throw std::runtime_error("Invalid value for --comp-based-stats. Permitted values: 0, 1, 2, 3, 4."); if (masking == -1) @@ -505,6 +527,8 @@ if (target_seg == -1) target_seg = Stats::CBS::target_seg(comp_based_stats); + Stats::comp_based_stats = Stats::CBS(comp_based_stats, query_match_distance_threshold, length_ratio_threshold, cbs_angle); + if (command == blastx && !Stats::CBS::support_translated(comp_based_stats)) throw std::runtime_error("This mode of composition based stats is not supported for translated searches."); @@ -523,6 +547,9 @@ if (masking < 0 || masking > 1) throw std::runtime_error("Permitted values for --masking: 0, 1"); + if (target_indexed) + hashed_seeds = true; + if (check_io) { switch (command) { case Config::makedb: @@ -670,7 +697,7 @@ } } message_stream << "Scoring parameters: " << score_matrix << endl; - if (masking == 1 || target_seg) + //if (masking == 1 || target_seg) Masking::instance = unique_ptr(new Masking(score_matrix)); } @@ -689,7 +716,8 @@ #endif } - sensitivity = Sensitivity::FAST; + sensitivity = Sensitivity::DEFAULT; + if (mode_fast) set_sens(Sensitivity::FAST); if (mode_mid_sensitive) set_sens(Sensitivity::MID_SENSITIVE); if (mode_sensitive) set_sens(Sensitivity::SENSITIVE); if (mode_more_sensitive) set_sens(Sensitivity::MORE_SENSITIVE); @@ -699,12 +727,9 @@ if (algo == Config::query_indexed && (sensitivity == Sensitivity::MID_SENSITIVE || sensitivity >= Sensitivity::VERY_SENSITIVE)) throw std::runtime_error("Query-indexed mode is not supported for this sensitivity setting."); - set ext_modes = { "", "banded-fast", "banded-slow" }; -#ifdef EXTRA - ext_modes.insert("full"); -#endif + const set ext_modes = { "", "banded-fast", "banded-slow", "full" }; if (ext_modes.find(ext) == ext_modes.end()) - throw std::runtime_error("Possible values for --ext are: banded-fast, banded-slow"); + throw std::runtime_error("Possible values for --ext are: banded-fast, banded-slow, full"); Translator::init(query_gencode); @@ -722,6 +747,22 @@ if (!aligned_file.empty()) log_stream << "Aligned file format: " << alfmt << endl; + if (command == blastx) { + if (query_file.size() > 2) + throw std::runtime_error("A maximum of 2 query files is supported in blastx mode."); + } + else if (query_file.size() > 1) + throw std::runtime_error("--query/-q has more than one argument."); + + if ((mmap_target_index || save_target_index) && !target_indexed) + throw std::runtime_error("Option requires --target-indexed."); + + if (mmap_target_index && save_target_index) + throw std::runtime_error("Options are exclusive."); + + if (target_indexed && lowmem != 1) + throw std::runtime_error("--target-indexed requires -c1."); + /*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t) << " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/ diff -Nru diamond-aligner-2.0.6/src/basic/config.h diamond-aligner-2.0.7/src/basic/config.h --- diamond-aligner-2.0.6/src/basic/config.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/basic/config.h 2021-02-12 11:50:56.000000000 +0000 @@ -26,7 +26,7 @@ #include #include -enum class Sensitivity { FAST = 0, MID_SENSITIVE = 1, SENSITIVE = 2, MORE_SENSITIVE = 3, VERY_SENSITIVE = 4, ULTRA_SENSITIVE = 5 }; +enum class Sensitivity { FAST = 0, DEFAULT = 1, MID_SENSITIVE = 2, SENSITIVE = 3, MORE_SENSITIVE = 4, VERY_SENSITIVE = 5, ULTRA_SENSITIVE = 6 }; enum class TracebackMode { NONE = 0, SCORE_ONLY = 1, STAT = 2, VECTOR = 3, SCORE_BUFFER = 4 }; struct Config @@ -38,7 +38,7 @@ string_vector input_ref_file; unsigned threads_; string database; - string query_file; + string_vector query_file; unsigned merge_seq_treshold; unsigned hit_cap; unsigned shapes; @@ -246,12 +246,20 @@ int family_cap; int cbs_matrix_scale; size_t query_count; - double cbs_angle; int target_seg; double cbs_err_tolerance; int cbs_it_limit; double query_match_distance_threshold; double length_ratio_threshold; + bool hash_join_swap; + bool target_indexed; + size_t deque_bucket_size; + bool mmap_target_index; + bool save_target_index; + bool mode_fast; + double log_evalue_scale; + double ungapped_evalue_short; + size_t max_swipe_dp; Sensitivity sensitivity; TracebackMode traceback_mode; @@ -274,6 +282,7 @@ string cluster_similarity; size_t max_size_set; + bool external; string_vector cluster_steps; double cluster_mcl_inflation; uint32_t cluster_mcl_expansion; @@ -285,7 +294,8 @@ std::map sens_map{ - {"fast", Sensitivity::FAST}, + {"fast", Sensitivity::FAST }, + {"default", Sensitivity::DEFAULT}, {"sensitive", Sensitivity::SENSITIVE}, {"mid-sensitive", Sensitivity::MID_SENSITIVE}, {"more-sensitive", Sensitivity::MORE_SENSITIVE}, @@ -322,6 +332,7 @@ } void set_sens(Sensitivity sens); + std::string single_query_file() const; bool mem_buffered() const { return tmpdir == "/dev/shm"; } diff -Nru diamond-aligner-2.0.6/src/basic/const.h diamond-aligner-2.0.7/src/basic/const.h --- diamond-aligner-2.0.6/src/basic/const.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/basic/const.h 2021-02-12 11:50:56.000000000 +0000 @@ -25,8 +25,12 @@ { enum { - build_version = 144, + build_version = 145, +#ifdef SINGLE_THREADED + seedp_bits = 0, +#else seedp_bits = 10, +#endif seedp = 1< +Copyright (C) 2016-2020 Max Planck Society for the Advancement of Science e.V. + Benjamin Buchfink + +Code developed by Benjamin Buchfink This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,9 +19,7 @@ along with this program. If not, see . ****/ -#ifndef SEED_ITERATOR_H_ -#define SEED_ITERATOR_H_ - +#pragma once #include "shape.h" #include "sequence.h" #include "../util/hash_function.h" @@ -68,7 +69,7 @@ #else const Letter l = *(ptr_++); #endif - if (l == value_traits.mask_char) + if (!is_amino_acid(l)) return false; last_ |= Reduction::reduction(l); seed = murmur_hash()(last_ & mask); @@ -121,5 +122,3 @@ const Letter *ptr_, *end_; uint64_t last_; }; - -#endif \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/ChangeLog diamond-aligner-2.0.7/src/ChangeLog --- diamond-aligner-2.0.6/src/ChangeLog 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/ChangeLog 2021-02-12 11:50:56.000000000 +0000 @@ -1,3 +1,21 @@ +[2.0.7] +- Added support for computing full-matrix instead of banded Smith Waterman extensions + (command line option `--ext full`). +- Added support for the new `prot.accession2taxid.FULL.gz` taxonomy mapping file from + NCBI. +- Added the option `--gapped-filter-evalue` to set the e-value threshold of the gapped + filter heuristic. +- Added setting the scores of the mask letter according to BLAST rules when a + compositionally adjusted matrix is used. +- Changed formatting of e-values to print two decimals instead of one. +- Added the output field `qseq_translated` to print the translation of the aligned part + of the query sequence. +- Added support for providing two input files to `--query/-q` when running alignment + in blastx mode. +- Added the output field `full_qseq_mate` to print the sequence of the query's mate + (enabled when using two query files in blastx mode). +- Fixed a bug that could cause a crash in blastx mode for very long queries. + [2.0.6] - Changed the computation of expected values to use the method described in Park, Y., Sheetlin, S., Ma, N. et al. New finite-size correction for local alignment score diff -Nru diamond-aligner-2.0.6/src/cluster/multi_step_cluster.cpp diamond-aligner-2.0.7/src/cluster/multi_step_cluster.cpp --- diamond-aligner-2.0.6/src/cluster/multi_step_cluster.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/cluster/multi_step_cluster.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -58,55 +58,126 @@ opt.db_filter = filter; Workflow::Search::run(opt); + + /* + auto lo = nb.dSet.getListOfSets(); + for (auto& set : lo) { + cout << "set :"; + for (auto item : set) { + cout << " " << item; + } + cout << endl; + } + */ - //message_stream << "Edges = " << nb.edges.size() << endl; - - unordered_map components = find_connected_components(nb.smallest_index, nb.number_edges); + vector> connected = nb.dSet.getListOfSets(); + vector EdgSet(nb.number_edges.size()); + unordered_map components = find_connected_components(connected, EdgSet, nb.number_edges); message_stream << "Number of connected components: " << components.size() << endl; - message_stream << "Average number of nodes per connected component: " << (double)nb.smallest_index.size() / components.size() << endl; + message_stream << "Average number of nodes per connected component: " << (double)nb.number_edges.size() / components.size() << endl; uint32_t large = max_element(components.begin(), components.end(), [](const pair& left, const pair& right) {return left.second.nodes < right.second.nodes; })-> second.nodes; message_stream << "Largest connected component has " << large << " nodes." << endl; - - mapping_comp_set(components); - uint32_t number_sets = max_element(components.begin(), components.end(), [](const pair& left, const pair& right) {return left.second.set < right.second.set; })->second.set; - message_stream << "Number of sets: " << number_sets << endl; - - return Util::Algo::greedy_vortex_cover(nb); + vector tmp_sets = mapping_comp_set(components); + + uint32_t number_sets = max_element(components.begin(), components.end(), [](const pair& left, const pair& right) {return left.second.set < right.second.set; })-> second.set; + message_stream << "Number of sets: " << number_sets + 1 << endl; + + + if (config.external) { + save_edges_external(nb.tempfiles, tmp_sets, components, EdgSet); + return cluster_sets(db.ref_header.sequences, tmp_sets); + } + + else { + return Util::Algo::greedy_vortex_cover(nb); + } } -unordered_map MultiStep::find_connected_components(vector& sindex, const vector & nedges){ - vector seen_nodes; - unordered_map ne; +void MultiStep::save_edges_external(vector& all_edges, vector& sorted_edges, const unordered_map& comp, const vector& s_index){ + + size_t count = 0; + uint32_t query; + uint32_t subject; + uint32_t result_set; + while (count < all_edges.size()) { + InputFile tmp_edges(*all_edges[count]); + while (true) { + try { + tmp_edges.read(query); + tmp_edges.read(subject); + result_set = comp.at(s_index[query]).set; + sorted_edges[result_set]->write(query); + sorted_edges[result_set]->write(subject); + } + catch (EndOfStream&) { + tmp_edges.close_and_delete(); + break; + } + } + count++; + } +} - uint32_t curr = 0; - for (size_t k = 0; k < sindex.size(); k++) { - curr = sindex[k]; - while (sindex[curr] != curr) { - seen_nodes.push_back(sindex[curr]); - curr = sindex[curr]; +vector MultiStep::cluster_sets(const size_t nb_size, vector &sorted_edges){ + vector cluster(nb_size); + vector> tmp_neighbors(nb_size); + vectorcurr; + uint32_t query; + uint32_t subject; + + iota(cluster.begin(), cluster.end(), 0); + + for (size_t i = 0; i < sorted_edges.size(); i++) { + InputFile tmp(*sorted_edges[i]); + delete(sorted_edges[i]); + while (true) { + try { + tmp.read(query); + tmp.read(subject); + tmp_neighbors[query].push_back(subject); + } + catch (EndOfStream&) { + tmp.close_and_delete(); + break; + } } + curr = Util::Algo::greedy_vortex_cover(tmp_neighbors); - sindex[k] = curr; - for (size_t i : seen_nodes) { - sindex[i] = curr; + for (int i = 0; i < (int)curr.size(); i++) { + if (curr[i] != i) { + cluster[i] = curr[i]; + } } - seen_nodes.clear(); + + tmp_neighbors.clear(); + tmp_neighbors.resize(nb_size); } + return cluster; +} - for (size_t i = 0; i < sindex.size(); i++) { - ++ne[sindex[i]].nodes; - ne[sindex[i]].edges += nedges[i]; +unordered_map MultiStep::find_connected_components(const vector> &connected, vector &EdgSet, const vector & nedges){ + + unordered_map ne; + for (size_t i = 0; i < connected.size(); i++) { + for(uint32_t j : connected[i]){ + EdgSet[j] = i; + ++ne[i].nodes; + ne[i].edges += nedges[j]; + } } + return ne; } -void MultiStep::mapping_comp_set(unordered_map& comp) { +vector MultiStep::mapping_comp_set(unordered_map& comp) { vector > set; vector size; + vector temp_set; + bool TooBig; @@ -125,8 +196,12 @@ set.push_back({ it.first }); size.push_back({ it.second.edges }); comp[it.first].set = set.size() - 1; + if (config.external) { + temp_set.push_back(new TempFile()); + } } } + return temp_set; } @@ -194,7 +269,6 @@ string id; db->seek_direct(); Hsp hsp; - size_t n; out->precision(3); for (int i = 0; i < (int)db->ref_header.sequences; ++i) { diff -Nru diamond-aligner-2.0.6/src/cluster/multi_step_cluster.h diamond-aligner-2.0.7/src/cluster/multi_step_cluster.h --- diamond-aligner-2.0.6/src/cluster/multi_step_cluster.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/cluster/multi_step_cluster.h 2021-02-12 11:50:56.000000000 +0000 @@ -37,6 +37,8 @@ #include "cluster.h" #include #include +#include "../util/io/temp_file.h" +#include "disjoint_set.h" using namespace std; @@ -52,8 +54,10 @@ private: BitVector rep_bitset(const vector ¢roid, const BitVector *superset = nullptr); vector cluster(DatabaseFile& db, const BitVector* filter); - unordered_map find_connected_components(vector& sindex, const vector& nedges); - void mapping_comp_set(unordered_map& comp); + void save_edges_external(vector &all_edges,vector &sorted_edges, const unordered_map & comp, const vector& s_index); + vector cluster_sets(const size_t nb_size, vector &sorted_edges); + unordered_map find_connected_components(const vector> &connected, vector& EdgSet, const vector& nedges); + vector mapping_comp_set(unordered_map& comp); void steps(BitVector& current_reps, BitVector& previous_reps, vector& current_centroids, vector& previous_centroids, int count); public: @@ -66,34 +70,45 @@ }; struct Neighbors : public vector>, public Consumer { - Neighbors(size_t n) : vector>(n), smallest_index(n,0), number_edges(n,0){ + Neighbors(size_t n) : vector>(n), smallest_index(n,0), number_edges(n,0), dSet(n){ iota(smallest_index.begin(), smallest_index.end(), 0); } vector smallest_index; vector number_edges; - - + vector tempfiles; + size_t size; + LazyDisjointIntegralSet dSet; + + virtual void consume(const char* ptr, size_t n) override { const char* end = ptr + n; + if (config.external) { + size += n; + if (tempfiles.empty() || size >= UINT32_MAX) { + tempfiles.push_back(new TempFile()); + size = 0; + } + tempfiles.back()->write(ptr, n); + } + while (ptr < end) { const uint32_t query = *(uint32_t*)ptr; ptr += sizeof(uint32_t); const uint32_t subject = *(uint32_t*)ptr; ptr += sizeof(uint32_t); - (*this)[query].push_back(subject); - - if (subject < smallest_index[query]) - smallest_index[query] = subject; - - if (query < smallest_index[subject]) - smallest_index[subject] = query; + if (!config.external) { + (*this)[query].push_back(subject); + } + dSet.merge(query, subject); + ++number_edges[query]; + } } - + }; }} diff -Nru diamond-aligner-2.0.6/src/data/enum_seeds.h diamond-aligner-2.0.7/src/data/enum_seeds.h --- diamond-aligner-2.0.6/src/data/enum_seeds.h 1970-01-01 00:00:00.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/enum_seeds.h 2021-02-12 11:50:56.000000000 +0000 @@ -0,0 +1,142 @@ +#pragma once + +#include "sequence_set.h" + +template +void enum_seeds(const Sequence_set* seqs, _f* f, unsigned begin, unsigned end, std::pair shape_range, const _filter* filter) +{ + vector buf(seqs->max_len(begin, end)); + uint64_t key; + for (unsigned i = begin; i < end; ++i) { + const sequence seq = (*seqs)[i]; + Reduction::reduce_seq(seq, buf); + for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) { + const Shape& sh = shapes[shape_id]; + if (seq.length() < sh.length_) continue; + Seed_iterator it(buf, sh); + size_t j = 0; + while (it.good()) { + if (it.get(key, sh)) + if (filter->contains(key, shape_id)) + (*f)(key, seqs->position(i, j), shape_id); + ++j; + } + } + } + f->finish(); +} + +template +void enum_seeds_hashed(const Sequence_set* seqs, _f* f, unsigned begin, unsigned end, std::pair shape_range, const _filter* filter) +{ + uint64_t key; + for (unsigned i = begin; i < end; ++i) { + const sequence seq = (*seqs)[i]; + for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) { + const Shape& sh = shapes[shape_id]; + if (seq.length() < sh.length_) continue; + const uint64_t shape_mask = sh.long_mask(); + Hashed_seed_iterator<_b> it(seq, sh); + size_t j = 0; + while (it.good()) { + if (it.get(key, shape_mask)) + if (filter->contains(key, shape_id)) + (*f)(key, seqs->position(i, j), shape_id); + ++j; + } + } + } + f->finish(); +} + +template +void enum_seeds_contiguous(const Sequence_set* seqs, _f* f, unsigned begin, unsigned end, const _filter* filter) +{ + uint64_t key; + for (unsigned i = begin; i < end; ++i) { + const sequence seq = (*seqs)[i]; + if (seq.length() < _it::length()) continue; + _it it(seq); + size_t j = 0; + while (it.good()) { + if (it.get(key)) + if (filter->contains(key, 0)) + if ((*f)(key, seqs->position(i, j), 0) == false) + return; + ++j; + } + } + f->finish(); +} + +template +static void enum_seeds_worker(_f* f, const Sequence_set* seqs, unsigned begin, unsigned end, std::pair shape_range, const _filter* filter, bool contig) +{ + static const char* errmsg = "Unsupported contiguous seed."; + if (shape_range.second - shape_range.first == 1 && shapes[shape_range.first].contiguous() && shapes.count() == 1 && (config.algo == Config::query_indexed || contig)) { + const uint64_t b = Reduction::reduction.bit_size(), l = shapes[shape_range.first].length_; + switch (l) { + case 7: + switch (b) { + case 4: + enum_seeds_contiguous<_f, Contiguous_seed_iterator<7, 4>, _filter>(seqs, f, begin, end, filter); + break; + default: + throw std::runtime_error(errmsg); + } + break; + case 6: + switch (b) { + case 4: + enum_seeds_contiguous<_f, Contiguous_seed_iterator<6, 4>, _filter>(seqs, f, begin, end, filter); + break; + default: + throw std::runtime_error(errmsg); + } + break; + case 5: + switch (b) { + case 4: + enum_seeds_contiguous<_f, Contiguous_seed_iterator<5, 4>, _filter>(seqs, f, begin, end, filter); + break; + default: + throw std::runtime_error(errmsg); + } + break; + default: + throw std::runtime_error(errmsg); + } + } + else if (config.hashed_seeds) { + const uint64_t b = Reduction::reduction.bit_size(); + switch (b) { + case 4: + enum_seeds_hashed<_f, 4, _filter>(seqs, f, begin, end, shape_range, filter); + break; + default: + throw std::runtime_error("Unsupported reduction."); + } + } + else + enum_seeds<_f, _filter>(seqs, f, begin, end, shape_range, filter); +} + +struct No_filter +{ + bool contains(uint64_t seed, uint64_t shape) const + { + return true; + } +}; + +extern No_filter no_filter; + +template +void enum_seeds(const Sequence_set* seqs, PtrVector<_f>& f, const std::vector& p, size_t shape_begin, size_t shape_end, const _filter* filter, bool contig = false) +{ + std::vector threads; + for (unsigned i = 0; i < f.size(); ++i) + threads.emplace_back(enum_seeds_worker<_f, _filter>, &f[i], seqs, (unsigned)p[i], (unsigned)p[i + 1], std::make_pair(shape_begin, shape_end), filter, contig); + for (auto& t : threads) + t.join(); +} \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/frequent_seeds.cpp diamond-aligner-2.0.7/src/data/frequent_seeds.cpp --- diamond-aligner-2.0.6/src/data/frequent_seeds.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/frequent_seeds.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -81,13 +81,6 @@ ++it; } - const size_t ht_size = std::max((size_t)(buf.size() * hash_table_factor), buf.size() + 1); - PHash_set hash_set(ht_size); - - for (vector::const_iterator i = buf.begin(); i != buf.end(); ++i) - hash_set.insert(*i); - - frequent_seeds.tables_[sid][seedp] = std::move(hash_set); (*counts)[seedp] = (unsigned)n; } diff -Nru diamond-aligner-2.0.6/src/data/frequent_seeds.h diamond-aligner-2.0.7/src/data/frequent_seeds.h --- diamond-aligner-2.0.6/src/data/frequent_seeds.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/frequent_seeds.h 2021-02-12 11:50:56.000000000 +0000 @@ -31,16 +31,6 @@ { void build(unsigned sid, const SeedPartitionRange &range, DoubleArray *query_seed_hits, DoubleArray *ref_seed_hits); - - bool get(const Letter *pos, unsigned sid) const - { - Packed_seed seed; - const bool t = config.algo == Config::double_indexed ? shapes[sid].set_seed(seed, pos) : shapes[sid].set_seed_shifted(seed, pos); - if (!t) - return true; - return tables_[sid][seed_partition(seed)].contains(seed_partition_offset(seed)); - } - static void clear_masking(Sequence_set& seqs); private: @@ -60,8 +50,6 @@ static void compute_sd(std::atomic *seedp, DoubleArray *query_seed_hits, DoubleArray *ref_seed_hits, vector *ref_out, vector *query_out); - PHash_set tables_[Const::max_shapes][Const::seedp]; - }; extern Frequent_seeds frequent_seeds; diff -Nru diamond-aligner-2.0.6/src/data/load_seqs.h diamond-aligner-2.0.7/src/data/load_seqs.h --- diamond-aligner-2.0.6/src/data/load_seqs.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/load_seqs.h 2021-02-12 11:50:56.000000000 +0000 @@ -21,6 +21,7 @@ ****/ #pragma once +#include #include "sequence_set.h" #include "../basic/translate.h" #include "../util/seq_file_format.h" @@ -52,7 +53,8 @@ } } -inline size_t load_seqs(TextInputFile &file, +inline size_t load_seqs(std::list::iterator file_begin, + std::list::iterator file_end, const Sequence_file_format &format, Sequence_set** seqs, String_set*& ids, @@ -60,7 +62,8 @@ String_set** quals, size_t max_letters, const string &filter, - const Value_traits &value_traits) + const Value_traits &value_traits, + size_t modulo = 1) { *seqs = new Sequence_set(); ids = new String_set(); @@ -80,16 +83,22 @@ else if (config.query_strands == "minus") frame_mask = ((1 << 3) - 1) << 3; - while (letters < max_letters && format.get_seq(id, seq, file, value_traits, quals ? &qual : nullptr)) { + std::list::iterator file_it = file_begin; + bool read_success = true; + + while ((letters < max_letters || (n % modulo != 0)) && (read_success = format.get_seq(id, seq, *file_it, value_traits, quals ? &qual : nullptr))) { if (seq.size() > 0 && (filter.empty() || id2.assign(id.data(), id.data() + id.size()).find(filter, 0) != string::npos)) { ids->push_back(id.begin(), id.end()); letters += push_seq(**seqs, source_seqs, seq, frame_mask, value_traits.seq_type); if (quals) (*quals)->push_back(qual.begin(), qual.end()); ++n; - if ((*seqs)->get_length() >(size_t)std::numeric_limits::max()) + if ((*seqs)->get_length() > (size_t)std::numeric_limits::max()) throw std::runtime_error("Number of sequences in file exceeds supported maximum."); } + ++file_it; + if (file_it == file_end) + file_it = file_begin; } ids->finish_reserve(); if (quals) @@ -105,5 +114,7 @@ if (quals) delete *quals; } + if (file_it != file_begin || (!read_success && ++file_it != file_end && format.get_seq(id, seq, *file_it, value_traits, nullptr))) + throw std::runtime_error("Unequal number of sequences in paired read files."); return n; } diff -Nru diamond-aligner-2.0.6/src/data/ref_dictionary.h diamond-aligner-2.0.7/src/data/ref_dictionary.h --- diamond-aligner-2.0.6/src/data/ref_dictionary.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/ref_dictionary.h 2021-02-12 11:50:56.000000000 +0000 @@ -72,6 +72,10 @@ return (*block_to_database_id_)[block_id]; } + void set_block2db(const vector* block_to_database_id) { + block_to_database_id_ = block_to_database_id; + } + uint32_t check_id(uint32_t i) const { if (i >= next_) diff -Nru diamond-aligner-2.0.6/src/data/reference.cpp diamond-aligner-2.0.7/src/data/reference.cpp --- diamond-aligner-2.0.6/src/data/reference.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/reference.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -47,6 +47,7 @@ Partitioned_histogram ref_hst; unsigned current_ref_block; Sequence_set* ref_seqs::data_ = nullptr; +Sequence_set* ref_seqs_unmasked::data_ = nullptr; bool blocked_processing; std::vector block_to_database_id; @@ -188,7 +189,7 @@ offset += seq.length() + id_len + 3; } -void make_db(TempFile **tmp_out, TextInputFile *input_file) +void make_db(TempFile **tmp_out, list *input_file) { if (config.input_ref_file.size() > 1) throw std::runtime_error("Too many arguments provided for option --in."); @@ -200,7 +201,13 @@ task_timer total; task_timer timer("Opening the database file", true); - TextInputFile *db_file = input_file ? input_file : new TextInputFile(input_file_name); + list* db_file; + if (input_file) + db_file = input_file; + else { + db_file = new list; + db_file->emplace_back(input_file_name); + } OutputFile *out = tmp_out ? new TempFile() : new OutputFile(config.database); ReferenceHeader header; @@ -219,7 +226,7 @@ FileBackedBuffer accessions; try { - while ((timer.go("Loading sequences"), n = load_seqs(*db_file, format, &seqs, ids, 0, nullptr, (size_t)(1e9), string(), amino_acid_traits)) > 0) { + while ((timer.go("Loading sequences"), n = load_seqs(db_file->begin(), db_file->end(), format, &seqs, ids, 0, nullptr, (size_t)(1e9), string(), amino_acid_traits)) > 0) { if (config.masking == 1) { timer.go("Masking sequences"); mask_seqs(*seqs, Masking::get(), false); @@ -228,7 +235,7 @@ for (size_t i = 0; i < n; ++i) { sequence seq = (*seqs)[i]; if (seq.length() == 0) - throw std::runtime_error("File format error: sequence of length 0 at line " + to_string(db_file->line_count)); + throw std::runtime_error("File format error: sequence of length 0 at line " + to_string(db_file->front().line_count)); push_seq(seq, (*ids)[i], ids->length(i), offset, pos_array, *out, letters, n_seqs); } if (!config.prot_accession2taxid.empty()) { @@ -278,7 +285,7 @@ if (!input_file) { timer.go("Closing the input file"); - db_file->close(); + db_file->front().close(); delete db_file; } @@ -416,7 +423,7 @@ (*dst_seq)->print_stats(); } - if (config.multiprocessing) + if (config.multiprocessing || config.global_ranking_targets) blocked_processing = true; else blocked_processing = seqs_processed < ref_header.sequences; @@ -449,7 +456,7 @@ { std::map seq_titles; if (!config.query_file.empty()) { - TextInputFile list(config.query_file); + TextInputFile list(config.single_query_file()); while (list.getline(), !list.eof()) { const vector t(tokenize(list.line.c_str(), "\t")); if (t.size() != 2) diff -Nru diamond-aligner-2.0.6/src/data/reference.h diamond-aligner-2.0.7/src/data/reference.h --- diamond-aligner-2.0.6/src/data/reference.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/reference.h 2021-02-12 11:50:56.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include +#include #include "../util/io/serializer.h" #include "../util/io/input_file.h" #include "../util/io/text_input_file.h" @@ -34,9 +35,6 @@ #include "metadata.h" #include "../util/data_structures/bit_vector.h" -using std::vector; -using std::string; - struct ReferenceHeader { ReferenceHeader() : @@ -146,7 +144,7 @@ }; -void make_db(TempFile **tmp_out = nullptr, TextInputFile *input_file = nullptr); +void make_db(TempFile **tmp_out = nullptr, std::list* input_file = nullptr); struct ref_seqs { @@ -157,6 +155,19 @@ static Sequence_set *data_; }; +struct ref_seqs_unmasked +{ + static const Sequence_set& get() + { + return *data_; + } + static Sequence_set& get_nc() + { + return *data_; + } + static Sequence_set* data_; +}; + struct ref_ids { static const String_set& get() diff -Nru diamond-aligner-2.0.6/src/data/seed_array.cpp diamond-aligner-2.0.7/src/data/seed_array.cpp --- diamond-aligner-2.0.6/src/data/seed_array.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/seed_array.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -19,8 +19,12 @@ #include #include "seed_array.h" #include "seed_set.h" +#include "enum_seeds.h" +#include "../util/data_structures/deque.h" -typedef vector > PtrSet; +using std::array; + +typedef vector> PtrSet; char* SeedArray::alloc_buffer(const Partitioned_histogram &hst) { @@ -108,9 +112,90 @@ PtrVector cb; for (size_t i = 0; i < seq_partition.size() - 1; ++i) cb.push_back(new BuildCallback(range, iterators[i].begin())); - seqs.enum_seeds(cb, seq_partition, shape, shape + 1, filter); + enum_seeds(&seqs, cb, seq_partition, shape, shape + 1, filter); } template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector&, char *buffer, const No_filter *); template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector&, char *buffer, const Seed_set *); -template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector&, char *buffer, const Hashed_seed_set *); \ No newline at end of file +template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector&, char *buffer, const Hashed_seed_set *); + +struct BufferedWriter2 +{ + static const unsigned BUFFER_SIZE = 16; + BufferedWriter2() + { + memset(n, 0, sizeof(n)); + } + void push(Packed_seed key, Loc value, const SeedPartitionRange& range) + { + const unsigned p = seed_partition(key); + if (range.contains(p)) { + assert(n[p] < BUFFER_SIZE); + buf[p][n[p]++] = SeedArray::Entry(seed_partition_offset(key), value); + if (n[p] == BUFFER_SIZE) + flush(p); + } + } + void flush(unsigned p) + { + out[p].push_back(buf[p], n[p]); + n[p] = 0; + } + void flush() + { + for (unsigned p = 0; p < Const::seedp; ++p) + if (n[p] > 0) + flush(p); + } + Deque out[Const::seedp]; + SeedArray::Entry buf[Const::seedp][BUFFER_SIZE]; + uint8_t n[Const::seedp]; +}; + +struct BuildCallback2 +{ + BuildCallback2(const SeedPartitionRange& range) : + range(range), + it(new BufferedWriter2()) + { } + bool operator()(uint64_t seed, uint64_t pos, size_t shape) + { + it->push(seed, pos, range); + return true; + } + void finish() + { + it->flush(); + } + ~BuildCallback2() + { + delete it; + } + SeedPartitionRange range; + BufferedWriter2* it; +}; + +template +SeedArray::SeedArray(const Sequence_set& seqs, size_t shape, const SeedPartitionRange& range, const _filter* filter) : + data_(nullptr) +{ + const auto seq_partition = seqs.partition(config.threads_); + PtrVector cb; + for (size_t i = 0; i < seq_partition.size() - 1; ++i) + cb.push_back(new BuildCallback2(range)); + enum_seeds(&seqs, cb, seq_partition, shape, shape + 1, filter); + + array counts; + counts.fill(0); + for (BuildCallback2* p : cb) + for (size_t i = 0; i < Const::seedp; ++i) + counts[i] += p->it->out[i].size(); + + for (size_t i = 0; i < Const::seedp; ++i) { + entries_[i].reserve(counts[i]); + for (BuildCallback2* p : cb) + p->it->out[i].move(entries_[i]); + } +} + +template SeedArray::SeedArray(const Sequence_set&, size_t, const SeedPartitionRange&, const Hashed_seed_set*); \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/seed_array.h diamond-aligner-2.0.7/src/data/seed_array.h --- diamond-aligner-2.0.6/src/data/seed_array.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/seed_array.h 2021-02-12 11:50:56.000000000 +0000 @@ -16,9 +16,9 @@ along with this program. If not, see . ****/ -#ifndef SEED_ARRAY_H_ -#define SEED_ARRAY_H_ - +#pragma once +#include +#include #include "seed_histogram.h" #include "../basic/packed_loc.h" @@ -53,19 +53,43 @@ template SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector &seq_partition, char *buffer, const _filter *filter); + template + SeedArray(const Sequence_set& seqs, size_t shape, const SeedPartitionRange& range, const _filter* filter); + Entry* begin(unsigned i) { - return &data_[begin_[i]]; + if (data_) + return &data_[begin_[i]]; + else + return entries_[i].data(); } const Entry* begin(unsigned i) const { - return &data_[begin_[i]]; + if (data_) + return &data_[begin_[i]]; + else + return entries_[i].data(); } size_t size(size_t i) const { - return begin_[i + 1] - begin_[i]; + if (data_) + return begin_[i + 1] - begin_[i]; + else { + return entries_[i].size(); + } + } + + size_t size() const { + if(data_) + return begin_[Const::seedp]; + else { + size_t n = 0; + for (const auto& v : entries_) + n += v.size(); + return n; + } } static char *alloc_buffer(const Partitioned_histogram &hst); @@ -74,9 +98,8 @@ Entry *data_; size_t begin_[Const::seedp + 1]; + std::array, Const::seedp> entries_; }; -#pragma pack() - -#endif \ No newline at end of file +#pragma pack() \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/seed_histogram.h diamond-aligner-2.0.7/src/data/seed_histogram.h --- diamond-aligner-2.0.6/src/data/seed_histogram.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/seed_histogram.h 2021-02-12 11:50:56.000000000 +0000 @@ -26,6 +26,7 @@ #include "../basic/seed_iterator.h" #include "seed_set.h" #include "../util/data_structures/array.h" +#include "enum_seeds.h" using std::vector; @@ -98,9 +99,9 @@ cb.push_back(new Callback(i, data_)); if (serial) for (unsigned s = 0; s < shapes.count(); ++s) - seqs.enum_seeds(cb, p_, s, s + 1, filter); + enum_seeds(&seqs, cb, p_, s, s + 1, filter); else - seqs.enum_seeds(cb, p_, 0, shapes.count(), filter); + enum_seeds(&seqs, cb, p_, 0, shapes.count(), filter); } const shape_histogram& get(unsigned sid) const diff -Nru diamond-aligner-2.0.6/src/data/seed_set.cpp diamond-aligner-2.0.7/src/data/seed_set.cpp --- diamond-aligner-2.0.6/src/data/seed_set.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/seed_set.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -16,11 +16,18 @@ along with this program. If not, see . ****/ +#include #include "seed_set.h" #include "../util/ptr_vector.h" #include "../util/math/integer.h" +#include "enum_seeds.h" +#include "../util/system/system.h" +#include "../util/io/output_file.h" + +static const size_t PADDING = 32; using std::endl; +using std::get; No_filter no_filter; @@ -54,13 +61,13 @@ throw std::runtime_error("Contiguous seed required."); PtrVector v; v.push_back(new Seed_set_callback(data_, size_t(max_coverage*pow(Reduction::reduction.size(), shapes[0].length_)))); - seqs.enum_seeds(v, seqs.partition(1), 0, 1, &no_filter, true); + enum_seeds(&seqs, v, seqs.partition(1), 0, 1, &no_filter, true); coverage_ = (double)v.back().coverage / pow(Reduction::reduction.size(), shapes[0].length_); } struct Hashed_seed_set_callback { - Hashed_seed_set_callback(PtrVector > &dst): + Hashed_seed_set_callback(PtrVector> &dst): dst(dst) {} bool operator()(uint64_t seed, uint64_t pos, uint64_t shape) @@ -75,11 +82,59 @@ Hashed_seed_set::Hashed_seed_set(const Sequence_set &seqs) { + if (config.mmap_target_index) { + for (size_t i = 0; i < shapes.count(); ++i) { + auto f = mmap_file((config.database + '.' + std::to_string(i)).c_str()); + if (get<0>(f) == nullptr) { + if (data_.empty()) { + message_stream << "WARNING: Target index file not found." << std::endl; + break; + } + else + throw std::runtime_error("Target index file not found."); + } + data_.push_back(new PHash_set((uint8_t*)get<0>(f), get<1>(f) - PADDING)); + fd_.push_back(get<2>(f)); + log_stream << "MMAPED Shape=" << i << " Hash_table_size=" << data_[i].size() << " load=" << (double)data_[i].load() / data_[i].size() << endl; + } + if(!data_.empty()) return; + } for (size_t i = 0; i < shapes.count(); ++i) data_.push_back(new PHash_set(next_power_of_2(seqs.letters()*1.25))); + //data_.push_back(new PHash_set(seqs.letters() * 1.25)); PtrVector v; v.push_back(new Hashed_seed_set_callback(data_)); - seqs.enum_seeds(v, seqs.partition(1), 0, shapes.count(), &no_filter); + enum_seeds(&seqs, v, seqs.partition(1), 0, shapes.count(), &no_filter); + + vector sizes; + for (size_t i = 0; i < shapes.count(); ++i) + sizes.push_back(data_[i].load()); + data_.clear(); + + for (size_t i = 0; i < shapes.count(); ++i) + data_.push_back(new PHash_set(next_power_of_2(sizes[i] * 1.25))); + //data_.push_back(new PHash_set(seqs.letters() * 1.25)); + enum_seeds(&seqs, v, seqs.partition(1), 0, shapes.count(), &no_filter); + for (size_t i = 0; i < shapes.count(); ++i) - log_stream << "Shape=" << i << " Hash_table_size=" << data_[i].size() << " load=" << data_[i].load() << endl; + log_stream << "Shape=" << i << " Hash_table_size=" << data_[i].size() << " load=" << (double)data_[i].load()/data_[i].size() << endl; + + if (config.save_target_index) { + log_stream << "Saving hashed seed sets to file." << endl; + for (size_t i = 0; i < shapes.count(); ++i) { + OutputFile out(config.database + '.' + std::to_string(i)); + out.write(data_[i].data(), data_[i].size()); + out.write(data_[i].data(), PADDING); + out.close(); + } + } +} + +Hashed_seed_set::~Hashed_seed_set() { + if (fd_.empty()) + return; + for (size_t i = 0; i < shapes.count(); ++i) { + unmap_file((char*)data_[i].table, data_[i].size(), fd_[i]); + data_[i].table = nullptr; + } } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/seed_set.h diamond-aligner-2.0.7/src/data/seed_set.h --- diamond-aligner-2.0.6/src/data/seed_set.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/seed_set.h 2021-02-12 11:50:56.000000000 +0000 @@ -16,9 +16,7 @@ along with this program. If not, see . ****/ -#ifndef SEED_SET_H_ -#define SEED_SET_H_ - +#pragma once #include #include "sequence_set.h" #include "../util/hash_table.h" @@ -43,12 +41,12 @@ struct Hashed_seed_set { Hashed_seed_set(const Sequence_set &seqs); + ~Hashed_seed_set(); bool contains(uint64_t key, uint64_t shape) const { return data_[shape].contains(key); } private: - PtrVector > data_; + PtrVector> data_; + std::vector fd_; }; - -#endif \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/sequence_set.h diamond-aligner-2.0.7/src/data/sequence_set.h --- diamond-aligner-2.0.6/src/data/sequence_set.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/sequence_set.h 2021-02-12 11:50:56.000000000 +0000 @@ -16,9 +16,7 @@ along with this program. If not, see . ****/ -#ifndef SEQUENCE_SET_H_ -#define SEQUENCE_SET_H_ - +#pragma once #include #include #include @@ -105,150 +103,7 @@ return this->letters() / this->get_length(); } - template - void enum_seeds(PtrVector<_f> &f, const std::vector &p, size_t shape_begin, size_t shape_end, const _filter *filter, bool contig = false) const - { - std::vector threads; - for (unsigned i = 0; i < f.size(); ++i) - threads.emplace_back(enum_seeds_worker<_f, _filter>, &f[i], this, (unsigned)p[i], (unsigned)p[i + 1], std::make_pair(shape_begin, shape_end), filter, contig); - for (auto &t : threads) - t.join(); - } - virtual ~Sequence_set() { } -private: - - template - void enum_seeds(_f *f, unsigned begin, unsigned end, std::pair shape_range, const _filter *filter) const - { - vector buf(max_len(begin, end)); - uint64_t key; - for (unsigned i = begin; i < end; ++i) { - const sequence seq = (*this)[i]; - Reduction::reduce_seq(seq, buf); - for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) { - const Shape& sh = shapes[shape_id]; - if (seq.length() < sh.length_) continue; - Seed_iterator it(buf, sh); - size_t j = 0; - while (it.good()) { - if (it.get(key, sh)) - if (filter->contains(key, shape_id)) - (*f)(key, position(i, j), shape_id); - ++j; - } - } - } - f->finish(); - } - - template - void enum_seeds_hashed(_f *f, unsigned begin, unsigned end, std::pair shape_range, const _filter *filter) const - { - uint64_t key; - for (unsigned i = begin; i < end; ++i) { - const sequence seq = (*this)[i]; - for (size_t shape_id = shape_range.first; shape_id < shape_range.second; ++shape_id) { - const Shape& sh = shapes[shape_id]; - if (seq.length() < sh.length_) continue; - const uint64_t shape_mask = sh.long_mask(); - Hashed_seed_iterator<_b> it(seq, sh); - size_t j = 0; - while (it.good()) { - if (it.get(key, shape_mask)) - if (filter->contains(key, shape_id)) - (*f)(key, position(i, j), shape_id); - ++j; - } - } - } - f->finish(); - } - - template - void enum_seeds_contiguous(_f *f, unsigned begin, unsigned end, const _filter *filter) const - { - uint64_t key; - for (unsigned i = begin; i < end; ++i) { - const sequence seq = (*this)[i]; - if (seq.length() < _it::length()) continue; - _it it(seq); - size_t j = 0; - while (it.good()) { - if (it.get(key)) - if (filter->contains(key, 0)) - if ((*f)(key, position(i, j), 0) == false) - return; - ++j; - } - } - f->finish(); - } - - template - static void enum_seeds_worker(_f *f, const Sequence_set *seqs, unsigned begin, unsigned end, std::pair shape_range, const _filter *filter, bool contig) - { - static const char *errmsg = "Unsupported contiguous seed."; - if (shape_range.second - shape_range.first == 1 && shapes[shape_range.first].contiguous() && shapes.count() == 1 && (config.algo == Config::query_indexed || contig)) { - const uint64_t b = Reduction::reduction.bit_size(), l = shapes[shape_range.first].length_; - switch (l) { - case 7: - switch (b) { - case 4: - seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<7, 4>,_filter>(f, begin, end, filter); - break; - default: - throw std::runtime_error(errmsg); - } - break; - case 6: - switch (b) { - case 4: - seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<6, 4>, _filter>(f, begin, end, filter); - break; - default: - throw std::runtime_error(errmsg); - } - break; - case 5: - switch (b) { - case 4: - seqs->enum_seeds_contiguous<_f, Contiguous_seed_iterator<5, 4>, _filter>(f, begin, end, filter); - break; - default: - throw std::runtime_error(errmsg); - } - break; - default: - throw std::runtime_error(errmsg); - } - } - else if (config.hashed_seeds) { - const uint64_t b = Reduction::reduction.bit_size(); - switch (b) { - case 4: - seqs->enum_seeds_hashed<_f, 4, _filter>(f, begin, end, shape_range, filter); - break; - default: - throw std::runtime_error("Unsupported reduction."); - } - } - else - seqs->enum_seeds<_f,_filter>(f, begin, end, shape_range, filter); - } - -}; - -struct No_filter -{ - bool contains(uint64_t seed, uint64_t shape) const - { - return true; - } -}; - -extern No_filter no_filter; - -#endif /* SEQUENCE_SET_H_ */ \ No newline at end of file +}; \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/data/taxonomy.cpp diamond-aligner-2.0.7/src/data/taxonomy.cpp --- diamond-aligner-2.0.6/src/data/taxonomy.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/data/taxonomy.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -78,25 +78,45 @@ return t; } +int mapping_file_format(const string& header) { + string field1, field2; + Util::String::Tokenizer tok(header, "\t"); + tok >> field1 >> field2; + if (field1 == "accession" && field2 == "accession.version") { + tok >> field1 >> field2; + if (field1 == "taxid" && field2 == "gi" && !tok.good()) + return 0; + } + else if (field1 == "accession.version" && field2 == "taxid" && !tok.good()) + return 1; + throw std::runtime_error("Accession mapping file header has to be in one of these formats:\naccession\taccession.version\ttaxid\tgi\naccession.version\ttaxid"); +} + void Taxonomy::load() { - char acc[max_accesion_len + 2]; unsigned taxid; TextInputFile f(config.prot_accession2taxid); f.getline(); + int format = mapping_file_format(f.line); + string accession; while (!f.eof() && (f.getline(), !f.line.empty())) { - if (sscanf(f.line.c_str(), "%*s%15s%u%*u", acc, &taxid) != 2) { - //std::cout << f.line << endl; - throw std::runtime_error("Invalid taxonomy mapping file format."); - } - if (strlen(acc) > max_accesion_len) { - //std::cout << f.line << endl; - throw std::runtime_error("Accession exceeds supported length."); - } - accession2taxid_.push_back(std::make_pair(Accession(acc), taxid)); - /*if (f.line_count % 10000 == 0) - std::cout << f.line_count << endl;*/ + if (format == 0) + Util::String::Tokenizer(f.line, "\t") >> Util::String::Skip() >> accession >> taxid; + else + Util::String::Tokenizer(f.line, "\t") >> accession >> taxid; + + if (accession.empty()) + throw std::runtime_error("Empty accession field in line " + std::to_string(f.line_count)); + + size_t i = accession.find(":PDB="); + if (i != string::npos) + accession.erase(i); + + if (accession.length() > max_accesion_len) + throw std::runtime_error("Accession exceeds supported length in line " + std::to_string(f.line_count)); + + accession2taxid_.push_back(std::make_pair(Accession(accession.c_str()), taxid)); } f.close(); merge_sort(accession2taxid_.begin(), accession2taxid_.end(), config.threads_); diff -Nru diamond-aligner-2.0.6/src/dp/dp.h diamond-aligner-2.0.7/src/dp/dp.h --- diamond-aligner-2.0.6/src/dp/dp.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/dp.h 2021-02-12 11:50:56.000000000 +0000 @@ -248,7 +248,7 @@ namespace BandedSwipe { -DECL_DISPATCH(std::list, swipe, (const sequence &query, std::vector &targets8, std::vector &targets16, DynamicIterator* targets, Frame frame, const Bias_correction *composition_bias, int flags, Statistics &stat)) +DECL_DISPATCH(std::list, swipe, (const sequence &query, std::vector &targets8, const std::vector &targets16, const std::vector& targets32, DynamicIterator* targets, Frame frame, const Bias_correction *composition_bias, int flags, Statistics &stat)) } diff -Nru diamond-aligner-2.0.6/src/dp/swipe/banded_3frame_swipe.cpp diamond-aligner-2.0.7/src/dp/swipe/banded_3frame_swipe.cpp --- diamond-aligner-2.0.6/src/dp/swipe/banded_3frame_swipe.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/banded_3frame_swipe.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -335,7 +335,7 @@ Hsp out; out.swipe_target = target.target_idx; - out.score = ScoreTraits<_sv>::int_score(max_score); + out.score = ScoreTraits<_sv>::int_score(max_score) * config.cbs_matrix_scale; out.evalue = evalue; out.transcript.reserve(size_t(out.score * config.transcript_len_estimate)); @@ -376,7 +376,7 @@ Hsp out; const int j0 = i1 - (target.d_end - 1); out.swipe_target = target.target_idx; - out.score = ScoreTraits<_sv>::int_score(max_score); + out.score = ScoreTraits<_sv>::int_score(max_score) * config.cbs_matrix_scale; out.evalue = evalue; out.query_range.end_ = std::min(i0 + max_col + (int)dp.band() / 3 / 2, (int)query[0].length()); out.query_range.begin_ = std::max(out.query_range.end_ - (j0 + max_col), 0); @@ -491,7 +491,7 @@ list out; for (int i = 0; i < targets.n_targets; ++i) { if (best[i] < ScoreTraits<_sv>::max_score()) { - const int score = ScoreTraits<_sv>::int_score(best[i]); + const int score = ScoreTraits<_sv>::int_score(best[i]) * config.cbs_matrix_scale; const double evalue = score_matrix.evalue(score, qlen, (unsigned)subject_begin[i].seq.length()); if (score_matrix.report_cutoff(score, evalue)) out.push_back(traceback<_sv>(q, strand, (int)query.source().length(), dp, subject_begin[i], d_begin[i], best[i], evalue, max_col[i], i, i0 - j, i1 - j)); diff -Nru diamond-aligner-2.0.6/src/dp/swipe/banded_swipe.cpp diamond-aligner-2.0.7/src/dp/swipe/banded_swipe.cpp --- diamond-aligner-2.0.6/src/dp/swipe/banded_swipe.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/banded_swipe.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -394,7 +394,7 @@ } inline ColumnIterator begin(int offset, int col) { - return ColumnIterator(&hgap_[offset], &score_[offset], &trace_mask_[(col + 1)*band_ + offset]); + return ColumnIterator(&hgap_[offset], &score_[offset], &trace_mask_[size_t(col + 1)*(size_t)band_ + (size_t)offset]); } int band() const { return band_; @@ -579,8 +579,8 @@ Hsp out; out.swipe_target = target.target_idx; out.score = ScoreTraits<_sv>::int_score(max_score); - if (target.adjusted_matrix()) - out.score = std::round((double)out.score / (double)config.cbs_matrix_scale); + if (!target.adjusted_matrix()) + out.score *= config.cbs_matrix_scale; out.evalue = evalue; out.frame = frame.index(); out.d_begin = target.d_begin; @@ -628,8 +628,8 @@ out.subject_range.end_ = it.j + 1; const int end_score = out.score; const bool adjusted_matrix = target.adjusted_matrix(); - if (adjusted_matrix) - out.score = std::round((double)out.score / (double)config.cbs_matrix_scale); + if (!adjusted_matrix) + out.score *= config.cbs_matrix_scale; int score = 0; const int* matrix = adjusted_matrix ? target.matrix->scores32.data() : score_matrix.matrix32(); @@ -815,7 +815,9 @@ task_timer timer; for (int i = 0; i < targets.n_targets; ++i) { if (best[i] < ScoreTraits<_sv>::max_score()) { - const int score = std::round((double)ScoreTraits<_sv>::int_score(best[i]) / (double)subject_begin[i].matrix_scale()); + int score = ScoreTraits<_sv>::int_score(best[i]); + if (!subject_begin[i].adjusted_matrix()) + score *= config.cbs_matrix_scale; const double evalue = score_matrix.evalue(score, qlen, (unsigned)subject_begin[i].seq.length()); if (score_matrix.report_cutoff(score, evalue)) { out.push_back(traceback<_sv>(query, frame, composition_bias, dp, subject_begin[i], d_begin[i], best[i], evalue, max_col[i], i, i0 - j, i1 - j, max_band_row[i])); diff -Nru diamond-aligner-2.0.6/src/dp/swipe/swipe.cpp diamond-aligner-2.0.7/src/dp/swipe/swipe.cpp --- diamond-aligner-2.0.6/src/dp/swipe/swipe.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/swipe.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -284,7 +284,7 @@ { Hsp out; out.swipe_target = target.target_idx; - out.score = ScoreTraits<_sv>::int_score(max_score); + out.score = ScoreTraits<_sv>::int_score(max_score) * config.cbs_matrix_scale; out.evalue = evalue; out.frame = frame.index(); return out; @@ -299,7 +299,7 @@ typename TracebackVectorMatrix<_sv>::TracebackIterator it(dp.traceback(max_col, max_i, max_j, channel)); Hsp out; out.swipe_target = target.target_idx; - out.score = ScoreTraits<_sv>::int_score(max_score); + out.score = ScoreTraits<_sv>::int_score(max_score) * config.cbs_matrix_scale; out.evalue = evalue; out.transcript.reserve(size_t(out.score * config.transcript_len_estimate)); @@ -396,7 +396,7 @@ overflow.push_back(targets.dp_targets[c]); reinit = true; } else if (!targets.inc(c)) { - const int s = ScoreTraits<_sv>::int_score(best[c]); + const int s = ScoreTraits<_sv>::int_score(best[c]) * config.cbs_matrix_scale; const double evalue = score_matrix.evalue(s, qlen, (unsigned)targets.dp_targets[c].seq.length()); if (score_matrix.report_cutoff(s, evalue)) out.push_back(traceback<_sv>(query, frame, composition_bias, dp, targets.dp_targets[c], best[c], evalue, max_col[c], max_i[c], max_j[c], c)); diff -Nru diamond-aligner-2.0.6/src/dp/swipe/swipe.h diamond-aligner-2.0.7/src/dp/swipe/swipe.h --- diamond-aligner-2.0.6/src/dp/swipe/swipe.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/swipe.h 2021-02-12 11:50:56.000000000 +0000 @@ -27,7 +27,7 @@ #include "../score_vector_int8.h" #include "../score_vector_int16.h" #include "../../basic/value.h" -#include "../util/simd/vector.h" +#include "../../util/simd/vector.h" #include "../../util/system.h" #include "../../util/memory/alignment.h" #include "../../util/simd/transpose.h" diff -Nru diamond-aligner-2.0.6/src/dp/swipe/swipe_wrapper.cpp diamond-aligner-2.0.7/src/dp/swipe/swipe_wrapper.cpp --- diamond-aligner-2.0.6/src/dp/swipe/swipe_wrapper.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/swipe_wrapper.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -197,7 +197,7 @@ return swipe_targets<_sv>(query, begin, end, targets ? targets : my_targets.get(), frame, composition_bias, flags, overflow, stat); } -list swipe(const sequence &query, vector &targets8, vector &targets16, DynamicIterator* targets, Frame frame, const Bias_correction *composition_bias, int flags, Statistics &stat) +list swipe(const sequence &query, vector &targets8, const vector &targets16, const vector& targets32, DynamicIterator* targets, Frame frame, const Bias_correction *composition_bias, int flags, Statistics &stat) { vector overflow8, overflow16, overflow32; list out; @@ -218,7 +218,7 @@ overflow8 = std::move(targets8); #endif #ifdef __SSE2__ - if (!overflow8.empty() || !targets16.empty()) { + if (!overflow8.empty() || !targets16.empty() || !targets32.empty()) { overflow8.insert(overflow8.end(), targets16.begin(), targets16.end()); stat.inc(Statistics::EXT16, overflow8.size()); task_timer timer; @@ -227,7 +227,8 @@ timer.go(); out.splice(out.end(), swipe_threads<::DISPATCH_ARCH::score_vector>(query, overflow8.begin(), overflow8.end(), nullptr, frame, composition_bias ? composition_bias->int8.data() : nullptr, flags, overflow16, stat)); if ((flags & PARALLEL) == 0) stat.inc(time_stat, timer.microseconds()); - if (!overflow16.empty()) { + if (!overflow16.empty() || !targets32.empty()) { + overflow16.insert(overflow16.end(), targets32.begin(), targets32.end()); stat.inc(Statistics::EXT32, overflow16.size()); timer.go(); out.splice(out.end(), swipe_threads(query, overflow16.begin(), overflow16.end(), nullptr, frame, composition_bias ? composition_bias->int8.data() : nullptr, flags, overflow32, stat)); diff -Nru diamond-aligner-2.0.6/src/dp/swipe/target_iterator.h diamond-aligner-2.0.7/src/dp/swipe/target_iterator.h --- diamond-aligner-2.0.6/src/dp/swipe/target_iterator.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/dp/swipe/target_iterator.h 2021-02-12 11:50:56.000000000 +0000 @@ -23,7 +23,7 @@ #include #include #include "../dp.h" -#include "../basic/value.h" +#include "../../basic/value.h" #include "../../util/simd/vector.h" #include "../../util/dynamic_iterator.h" #include "../../stats/hauser_correction.h" @@ -281,7 +281,7 @@ return dp_targets[channel].seq[pos[channel]]; } else - return SUPER_HARD_MASK;; + return SUPER_HARD_MASK; } #ifdef __SSSE3__ diff -Nru diamond-aligner-2.0.6/src/lib/blast/nlm_linear_algebra.cpp diamond-aligner-2.0.7/src/lib/blast/nlm_linear_algebra.cpp --- diamond-aligner-2.0.6/src/lib/blast/nlm_linear_algebra.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/lib/blast/nlm_linear_algebra.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -36,10 +36,9 @@ /* Documented in nlm_linear_algebra.h. */ double ** -Nlm_DenseMatrixNew(int nrows, - int ncols) +Nlm_DenseMatrixNew(size_t nrows, + size_t ncols) { - int i; /* iteration index */ double ** mat; /* the new matrix */ mat = (double **) calloc(nrows, sizeof(double *)); @@ -47,7 +46,7 @@ mat[0] = (double *) malloc((size_t) nrows * (size_t) ncols * sizeof(double)); if (mat[0] != NULL) { - for (i = 1; i < nrows; i++) { + for (size_t i = 1; i < nrows; i++) { mat[i] = &mat[0][i * ncols]; } } else { @@ -71,7 +70,7 @@ L = (double**) calloc(n, sizeof(double *)); if (L != NULL) { - L[0] = (double*) malloc(nelts * sizeof(double)); + L[0] = (double*) calloc(nelts, sizeof(double)); // was malloc if (L[0] != NULL) { for (i = 1; i < n; i++) { L[i] = L[i - 1] + i; diff -Nru diamond-aligner-2.0.6/src/lib/blast/nlm_linear_algebra.h diamond-aligner-2.0.7/src/lib/blast/nlm_linear_algebra.h --- diamond-aligner-2.0.6/src/lib/blast/nlm_linear_algebra.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/lib/blast/nlm_linear_algebra.h 2021-02-12 11:50:56.000000000 +0000 @@ -44,7 +44,7 @@ * @param nrows the number of rows for the new matrix. * @param ncols the number of columns for the new matrix. */ -double ** Nlm_DenseMatrixNew(int nrows, int ncols); +double ** Nlm_DenseMatrixNew(size_t nrows, size_t ncols); /** diff -Nru diamond-aligner-2.0.6/src/output/blast_tab_format.cpp diamond-aligner-2.0.7/src/output/blast_tab_format.cpp --- diamond-aligner-2.0.6/src/output/blast_tab_format.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/output/blast_tab_format.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -94,7 +94,9 @@ { "cigar", "CIGAR", true, false }, // 58 { "skingdoms", "Subject kingdoms", false, false }, // 59 { "sphylums", "Subject phylums", false, false }, // 60 -{ "ungapped_score", "Ungapped score", false, false } // 61 +{ "ungapped_score", "Ungapped score", false, false }, // 61 +{ "full_qseq_mate", "Query sequence of the mate", false, false }, // 62 +{ "qseq_translated", "Aligned part of query sequence (translated)", true, false } // 63 needs transcript only in frameshift mode }; Blast_tab_format::Blast_tab_format() : @@ -134,6 +136,10 @@ config.use_lazy_dict = true; if (j == 49 || j == 53) config.store_query_quality = true; + if (j == 62) + needs_paired_end_info = true; + if ((j == 62 || j == 63) && !align_mode.query_translated) + throw std::runtime_error("Output field only supported for translated search."); if (field_def[j].need_transcript) needs_transcript = true; if (field_def[j].need_stats) @@ -377,6 +383,31 @@ case 61: out << r.ungapped_score; break; + case 62: { + if (config.query_file.size() == 2) { + unsigned mate = r.query_id % 2, mate_id = mate == 0 ? r.query_id + 1 : r.query_id - 1; + query_source_seqs::get()[mate_id].print(out, input_value_traits); + break; + } + else { + out << '*'; + break; + } + } + case 63: { + if (config.frame_shift) { + vector seq; + seq.reserve(r.query_range().length()); + for (Hsp_context::Iterator j = r.begin(); j.good(); ++j) + if (j.op() != op_deletion && j.op() != op_frameshift_forward && j.op() != op_frameshift_reverse) + seq.push_back(j.query()); + out << sequence(seq); + } + else { + r.query.index(r.frame()).print(out, r.query_range().begin_, r.query_range().end_, amino_acid_traits); + } + break; + } default: throw std::runtime_error(string("Invalid output field: ") + field_def[*i].key); } @@ -413,6 +444,7 @@ case 57: case 59: case 60: + case 63: out << '*'; break; case 12: diff -Nru diamond-aligner-2.0.6/src/output/output_format.h diamond-aligner-2.0.7/src/output/output_format.h --- diamond-aligner-2.0.6/src/output/output_format.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/output/output_format.h 2021-02-12 11:50:56.000000000 +0000 @@ -42,7 +42,8 @@ needs_taxon_scientific_names(false), needs_taxon_ranks(false), needs_transcript(true), - needs_stats(false) + needs_stats(false), + needs_paired_end_info(false) {} virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const {} @@ -63,7 +64,7 @@ return code; } unsigned code; - bool needs_taxon_id_lists, needs_taxon_nodes, needs_taxon_scientific_names, needs_taxon_ranks, needs_transcript, needs_stats; + bool needs_taxon_id_lists, needs_taxon_nodes, needs_taxon_scientific_names, needs_taxon_ranks, needs_transcript, needs_stats, needs_paired_end_info; enum { daa, blast_tab, blast_xml, sam, blast_pairwise, null, taxon, paf, bin1 }; }; diff -Nru diamond-aligner-2.0.6/src/output/view.cpp diamond-aligner-2.0.7/src/output/view.cpp --- diamond-aligner-2.0.6/src/output/view.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/output/view.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -137,7 +137,7 @@ message_stream << "DB sequences used = " << daa.db_seqs_used() << endl; message_stream << "DB letters = " << daa.db_letters() << endl; - const Parameters params{ daa.db_seqs(), daa.db_letters(), 0, 0.0, 0.0 }; + const Parameters params{ daa.db_seqs(), daa.db_letters(), 0, 0.0, 0.0, 0.0, 0.0 }; Metadata metadata; init_output(false, false, false); diff -Nru diamond-aligner-2.0.6/src/run/double_indexed.cpp diamond-aligner-2.0.7/src/run/double_indexed.cpp --- diamond-aligner-2.0.6/src/run/double_indexed.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/run/double_indexed.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -49,6 +49,7 @@ #include "../util/parallel/parallelizer.h" #include "../util/system/system.h" #include "../align/target.h" +#include "../data/enum_seeds.h" using std::unique_ptr; using std::endl; @@ -89,6 +90,9 @@ { log_rss(); + if (config.comp_based_stats == Stats::CBS::COMP_BASED_STATS_AND_MATRIX_ADJUST) + ref_seqs_unmasked::data_ = new Sequence_set(*ref_seqs::data_); + task_timer timer; if (config.masking == 1 && !config.no_ref_masking) { timer.go("Masking reference"); @@ -117,11 +121,19 @@ char *ref_buffer = SeedArray::alloc_buffer(ref_hst); timer.finish(); + Hashed_seed_set* target_seeds = nullptr; + if (config.target_indexed) { + timer.go("Building database seed set"); + target_seeds = new Hashed_seed_set(*ref_seqs::data_); + timer.finish(); + } + for (unsigned i = 0; i < shapes.count(); ++i) - search_shape(i, query_chunk, query_buffer, ref_buffer, params); + search_shape(i, query_chunk, query_buffer, ref_buffer, params, target_seeds); timer.go("Deallocating buffers"); delete[] ref_buffer; + delete target_seeds; timer.go("Clearing query masking"); Frequent_seeds::clear_masking(*query_seqs::data_); @@ -156,6 +168,8 @@ timer.go("Deallocating reference"); delete ref_seqs::data_; delete ref_ids::data_; + if (config.comp_based_stats == Stats::CBS::COMP_BASED_STATS_AND_MATRIX_ADJUST) + delete ref_seqs_unmasked::data_; timer.finish(); } @@ -180,7 +194,7 @@ if (query_chunk == 0) setup_search_cont(); if (config.algo == -1) { - if (config.sensitivity >= Sensitivity::VERY_SENSITIVE || config.sensitivity == Sensitivity::MID_SENSITIVE) { + if (config.sensitivity >= Sensitivity::VERY_SENSITIVE || config.sensitivity == Sensitivity::MID_SENSITIVE || config.sensitivity == Sensitivity::FAST) { config.algo = Config::double_indexed; } else { @@ -218,13 +232,15 @@ db_file.ref_header.letters, db_file.total_blocks(), config.gapped_filter_evalue1, + config.gapped_filter_evalue, + config.gapped_filter_evalue1, config.gapped_filter_evalue }; char *query_buffer = nullptr; const pair query_len_bounds = query_seqs::data_->len_bounds(shapes[0].length_); - if (!config.swipe_all) { + if (!config.swipe_all && !config.target_indexed) { timer.go("Building query histograms"); query_hst = Partitioned_histogram(*query_seqs::data_, false, &no_filter); @@ -390,13 +406,23 @@ } task_timer timer("Opening the input file", true); - TextInputFile *query_file = nullptr; + list *query_file = nullptr; const Sequence_file_format *format_n = nullptr; + bool paired_mode = false; if (!options.self) { if (config.query_file.empty() && !options.query_file) std::cerr << "Query file parameter (--query/-q) is missing. Input will be read from stdin." << endl; - query_file = options.query_file ? options.query_file : new TextInputFile(config.query_file); - format_n = guess_format(*query_file); + if (options.query_file) + query_file = options.query_file; + else { + query_file = new list; + for(const string& f : config.query_file) + query_file->emplace_back(f); + if (query_file->empty()) + query_file->emplace_back(""); + paired_mode = query_file->size() == 2; + } + format_n = guess_format(query_file->front()); } current_query_chunk = 0; @@ -417,9 +443,9 @@ deallocate_queries(); query_file_offset = db_file->tell_seq(); } else { - if (!load_seqs(*query_file, *format_n, &query_seqs::data_, query_ids::data_, &query_source_seqs::data_, + if (!load_seqs(query_file->begin(), query_file->end(), *format_n, &query_seqs::data_, query_ids::data_, &query_source_seqs::data_, config.store_query_quality ? &query_qual : nullptr, - (size_t)(config.chunk_size * 1e9), config.qfilt, input_value_traits)) + (size_t)(config.chunk_size * 1e9), config.qfilt, input_value_traits, paired_mode ? 2 : 1)) break; deallocate_queries(); } @@ -428,7 +454,7 @@ db_file->rewind(); query_file_offset = 0; } else { - query_file->rewind(); + query_file->front().rewind(); } for (size_t i=0; itell_seq(); } else - if (!load_seqs(*query_file, *format_n, &query_seqs::data_, query_ids::data_, &query_source_seqs::data_, + if (!load_seqs(query_file->begin(), query_file->end(), *format_n, &query_seqs::data_, query_ids::data_, &query_source_seqs::data_, config.store_query_quality ? &query_qual : nullptr, - (size_t)(config.chunk_size * 1e9), config.qfilt, input_value_traits)) + (size_t)(config.chunk_size * 1e9), config.qfilt, input_value_traits, paired_mode ? 2 : 1)) break; timer.finish(); @@ -500,7 +526,7 @@ if (query_file && !options.query_file) { timer.go("Closing the input file"); - query_file->close(); + query_file->front().close(); delete query_file; } diff -Nru diamond-aligner-2.0.6/src/run/tools.cpp diamond-aligner-2.0.7/src/run/tools.cpp --- diamond-aligner-2.0.6/src/run/tools.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/run/tools.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -74,7 +74,7 @@ void sort_file() { - TextInputFile f(config.query_file); + TextInputFile f(config.single_query_file()); vector > data; while (f.getline(), !f.eof()) { unsigned query; @@ -113,7 +113,7 @@ void run_masker() { - TextInputFile f(config.query_file); + TextInputFile f(config.single_query_file()); vector seq, seq2; string id; const FASTA_format format; @@ -148,7 +148,7 @@ void fastq2fasta() { - unique_ptr f(new TextInputFile(config.query_file)); + unique_ptr f(new TextInputFile(config.single_query_file())); vector seq; string id; const FASTQ_format format; @@ -184,7 +184,7 @@ { const double ID = 0.35; srand((unsigned)time(0)); - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); OutputFile out(config.output_file); FASTA_format format; string id; @@ -269,7 +269,7 @@ value_traits = nucleotide_traits; score_matrix = Score_matrix("DNA", 5, 2, 0, 1); - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); std::mutex input_lock, output_lock; vector threads; for (unsigned i = 0; i < config.threads_; ++i) @@ -288,7 +288,7 @@ void translate() { input_value_traits = nucleotide_traits; - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string id; vector seq; vector proteins[6]; @@ -302,7 +302,7 @@ void reverse() { input_value_traits = amino_acid_traits; - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string id; vector seq; TextBuffer buf; @@ -323,7 +323,7 @@ void show_cbs() { score_matrix = Score_matrix("BLOSUM62", config.gap_open, config.gap_extend, config.frame_shift, 1); init_cbs(); - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string id; vector seq; while (FASTA_format().get_seq(id, seq, in, value_traits)) { diff -Nru diamond-aligner-2.0.6/src/run/workflow.h diamond-aligner-2.0.7/src/run/workflow.h --- diamond-aligner-2.0.6/src/run/workflow.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/run/workflow.h 2021-02-12 11:50:56.000000000 +0000 @@ -20,6 +20,7 @@ ****/ #pragma once +#include #include "../basic/config.h" #include "../util/data_structures/bit_vector.h" @@ -41,7 +42,7 @@ bool self; DatabaseFile *db; Consumer *consumer; - TextInputFile *query_file; + std::list *query_file; const BitVector* db_filter; }; diff -Nru diamond-aligner-2.0.6/src/search/collision.cpp diamond-aligner-2.0.7/src/search/collision.cpp --- diamond-aligner-2.0.6/src/search/collision.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/collision.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,120 +0,0 @@ -/**** -DIAMOND protein aligner -Copyright (C) 2013-2017 Benjamin Buchfink - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -****/ - -#include "search.h" -#include "collision.h" -#include "seed_complexity.h" -#include "finger_print.h" - -namespace DISPATCH_ARCH { - -bool verify_hit(const Letter *query, const Letter *subject, unsigned sid) -{ - const Finger_print fq(query), fs(subject); - if (fq.match(fs) < config.min_identities) - return false; - return true; - /*unsigned delta, len; - return stage2_ungapped(query, subject, sid, delta, len) >= config.min_ungapped_raw_score;*/ -} - -inline bool match_shape_mask(const uint64_t mask, const uint64_t shape_mask) -{ - return (mask & shape_mask) == shape_mask; -} - -inline bool is_lower_chunk(const Letter *subject, unsigned sid) -{ - Packed_seed seed; - if (config.algo == Config::double_indexed) - shapes[sid].set_seed(seed, subject); - else - shapes[sid].set_seed_shifted(seed, subject); - return current_range.lower(seed_partition(seed)); -} - -inline bool is_lower_or_equal_chunk(const Letter *subject, unsigned sid) -{ - Packed_seed seed; - if (config.algo == Config::double_indexed) - shapes[sid].set_seed(seed, subject); - else - shapes[sid].set_seed_shifted(seed, subject); - return current_range.lower_or_equal(seed_partition(seed)); -} - -inline bool is_high_frequency(const Letter *subject, unsigned sid, bool previous_shape) -{ - if (config.simple_freq) - return !SeedComplexity::complex(subject, shapes[sid]); - else - return frequent_seeds.get(subject, sid); -} - -inline bool shape_collision_right(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid) -{ - if (!match_shape_mask(mask, shape_mask)) return false; - return is_lower_chunk(subject, sid) - && !is_high_frequency(subject, sid, false); -} - -inline bool shape_collision_left(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid, bool chunked) -{ - if (!match_shape_mask(mask, shape_mask)) return false; - return (!chunked || is_lower_or_equal_chunk(subject, sid)) - && !is_high_frequency(subject, sid, false); -} - -inline bool previous_shape_collision(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid) -{ - if (!match_shape_mask(mask, shape_mask)) return false; - return !is_high_frequency(subject, sid, true); -} - -bool is_primary_hit(const Letter *query, - const Letter *subject, - const unsigned seed_offset, - const unsigned sid, - const unsigned len) -{ - const bool chunked(config.lowmem > 1); - uint64_t mask = reduced_match32(query, subject, len); - unsigned i = 0; - uint64_t current_mask = shapes[sid].mask_; - unsigned shape_len = len - shapes[0].length_ + 1; - while (i < shape_len) { - if (len - i > 32) - mask |= reduced_match32(query + 32, subject + 32, len - i - 32) << 32; - for (unsigned j = 0; j < 32 && i < shape_len; ++j) { - for (unsigned k = 0; k < sid; ++k) - if (previous_shape_collision(mask, shapes[k].mask_, &subject[j], k) && verify_hit(&query[j], &subject[j], k)) - return false; - if (i < seed_offset && shape_collision_left(mask, current_mask, &subject[j], sid, chunked) && verify_hit(&query[j], &subject[j], sid)) - return false; - if (chunked && i > seed_offset && shape_collision_right(mask, current_mask, &subject[j], sid) && verify_hit(&query[j], &subject[j], sid)) - return false; - ++i; - mask >>= 1; - } - query += 32; - subject += 32; - } - return true; -} - -} diff -Nru diamond-aligner-2.0.6/src/search/collision.h diamond-aligner-2.0.7/src/search/collision.h --- diamond-aligner-2.0.6/src/search/collision.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/collision.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -/**** -DIAMOND protein aligner -Copyright (C) 2013-2017 Benjamin Buchfink - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -****/ - -#ifndef COLLISION_H_ -#define COLLISION_H_ - -#include "../data/frequent_seeds.h" -#include "sse_dist.h" -#include "../util/simd.h" - -DECL_DISPATCH(bool, is_primary_hit, (const Letter *query, const Letter *subject, const unsigned seed_offset, const unsigned sid, const unsigned len)) - -#endif /* COLLISION_H_ */ diff -Nru diamond-aligner-2.0.6/src/search/search.h diamond-aligner-2.0.7/src/search/search.h --- diamond-aligner-2.0.6/src/search/search.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/search.h 2021-02-12 11:50:56.000000000 +0000 @@ -30,12 +30,19 @@ #include "../util/data_structures/flat_array.h" #include "../util/scores/cutoff_table.h" #include "../basic/parameters.h" +#include "../data/seed_set.h" + +// #define UNGAPPED_SPOUGE namespace Search { struct Context { const PatternMatcher previous_matcher, current_matcher; - const Util::Scores::CutoffTable cutoff_table; +#ifdef UNGAPPED_SPOUGE + const Util::Scores::CutoffTable2D cutoff_table; +#else + const Util::Scores::CutoffTable cutoff_table, cutoff_table_short; +#endif const int short_query_ungapped_cutoff; }; @@ -61,7 +68,7 @@ unsigned q, s; }; -void search_shape(unsigned sid, unsigned query_block, char *query_buffer, char *ref_buffer, const Parameters ¶ms); +void search_shape(unsigned sid, unsigned query_block, char *query_buffer, char *ref_buffer, const Parameters ¶ms, const Hashed_seed_set* target_seeds); bool use_single_indexed(double coverage, size_t query_letters, size_t ref_letters); void setup_search(); void setup_search_cont(); diff -Nru diamond-aligner-2.0.6/src/search/setup.cpp diamond-aligner-2.0.7/src/search/setup.cpp --- diamond-aligner-2.0.6/src/search/setup.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/setup.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -32,7 +32,7 @@ void setup_search_cont() { - if (config.sensitivity >= Sensitivity::VERY_SENSITIVE || config.sensitivity == Sensitivity::MID_SENSITIVE) + if (config.sensitivity >= Sensitivity::VERY_SENSITIVE || config.sensitivity == Sensitivity::MID_SENSITIVE || config.sensitivity == Sensitivity::FAST) return; unsigned index_mode; Reduction::reduction = Reduction("A KR EDNQ C G H ILVM FYW P ST"); @@ -62,21 +62,24 @@ Config::set_option(config.freq_sd, 20.0); Config::set_option(config.min_identities, 9u); Config::set_option(config.ungapped_evalue, 300000.0, -1.0); - Config::set_option(config.gapped_filter_evalue, 10.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 30000.0, -1.0); + Config::set_option(config.gapped_filter_evalue, 1.0, -1.0); Config::set_option(config.lowmem, 1u); Config::set_option(config.query_bins, 64u); } else if (config.sensitivity == Sensitivity::VERY_SENSITIVE) { Config::set_option(config.freq_sd, 15.0); Config::set_option(config.min_identities, 9u); Config::set_option(config.ungapped_evalue, 100000.0, -1.0); - Config::set_option(config.gapped_filter_evalue, 10.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 30000.0, -1.0); + Config::set_option(config.gapped_filter_evalue, 1.0, -1.0); Config::set_option(config.lowmem, 1u); Config::set_option(config.query_bins, 16u); } else if (config.sensitivity == Sensitivity::MORE_SENSITIVE) { Config::set_option(config.freq_sd, 200.0); Config::set_option(config.min_identities, 11u); Config::set_option(config.ungapped_evalue, 10000.0, -1.0); - Config::set_option(config.gapped_filter_evalue, 10.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 10000.0, -1.0); + Config::set_option(config.gapped_filter_evalue, 1.0, -1.0); Config::set_option(config.lowmem, 4u); Config::set_option(config.query_bins, 16u); } @@ -84,7 +87,8 @@ Config::set_option(config.freq_sd, 20.0); Config::set_option(config.min_identities, 11u); Config::set_option(config.ungapped_evalue, 10000.0, -1.0); - Config::set_option(config.gapped_filter_evalue, 10.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 10000.0, -1.0); + Config::set_option(config.gapped_filter_evalue, 1.0, -1.0); Config::set_option(config.lowmem, 4u); Config::set_option(config.query_bins, 16u); } @@ -92,6 +96,7 @@ Config::set_option(config.freq_sd, 20.0); Config::set_option(config.min_identities, 11u); Config::set_option(config.ungapped_evalue, 10000.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 10000.0, -1.0); Config::set_option(config.gapped_filter_evalue, 0.0, -1.0); Config::set_option(config.lowmem, 4u); Config::set_option(config.query_bins, 16u); @@ -100,6 +105,8 @@ Config::set_option(config.freq_sd, 50.0); Config::set_option(config.min_identities, 11u); Config::set_option(config.ungapped_evalue, 10000.0, -1.0); + Config::set_option(config.ungapped_evalue_short, 10000.0, -1.0); + Config::set_option(config.gapped_filter_evalue, 0.0, -1.0); Config::set_option(config.lowmem, 4u); Config::set_option(config.query_bins, 16u); } @@ -121,9 +128,11 @@ case Sensitivity::MID_SENSITIVE: Config::set_option(config.index_mode, 15u); break; - case Sensitivity::FAST: + case Sensitivity::DEFAULT: Config::set_option(config.index_mode, 8u); break; + case Sensitivity::FAST: + Config::set_option(config.index_mode, 16u); } Reduction::reduction = Reduction("A KR EDNQ C G H ILVM FYW P ST"); ::shapes = shape_config(config.index_mode, config.shapes, config.shape_mask); diff -Nru diamond-aligner-2.0.6/src/search/stage0.cpp diamond-aligner-2.0.7/src/search/stage0.cpp --- diamond-aligner-2.0.6/src/search/stage0.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/stage0.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -48,7 +48,8 @@ DoubleArray *ref_seeds_hits) { unsigned p; - const unsigned bits = (unsigned)ceil(shapes[0].weight_ * Reduction::reduction.bit_size_exact()) - Const::seedp_bits; + const unsigned bits = config.hashed_seeds ? sizeof(SeedArray::Entry::Key) * 8 + : (unsigned)ceil(shapes[0].weight_ * Reduction::reduction.bit_size_exact()) - Const::seedp_bits; while ((p = (*seedp)++) < seedp_range->end()) { std::pair, DoubleArray> join = hash_join( Relation(query_seeds->begin(p), query_seeds->size(p)), @@ -71,7 +72,7 @@ statistics += stats; } -void search_shape(unsigned sid, unsigned query_block, char *query_buffer, char *ref_buffer, const Parameters ¶ms) +void search_shape(unsigned sid, unsigned query_block, char *query_buffer, char *ref_buffer, const Parameters ¶ms, const Hashed_seed_set* target_seeds) { ::partition p(Const::seedp, config.lowmem); DoubleArray query_seed_hits[Const::seedp], ref_seed_hits[Const::seedp]; @@ -97,7 +98,14 @@ ref_idx = new SeedArray(*ref_seqs::data_, sid, ref_hst.get(sid), range, ref_hst.partition(), ref_buffer, &no_filter); timer.go("Building query seed array"); - SeedArray *query_idx = new SeedArray(*query_seqs::data_, sid, query_hst.get(sid), range, query_hst.partition(), query_buffer, &no_filter); + SeedArray* query_idx; + if (target_seeds) + query_idx = new SeedArray(*query_seqs::data_, sid, range, target_seeds); + else + query_idx = new SeedArray(*query_seqs::data_, sid, query_hst.get(sid), range, query_hst.partition(), query_buffer, &no_filter); + timer.finish(); + + log_stream << "Indexed query seeds = " << query_idx->size() << '/' << query_seqs::get().letters() << ", reference seeds = " << ref_idx->size() << '/' << ref_seqs::get().letters() << endl; timer.go("Computing hash join"); atomic seedp(range.begin()); @@ -115,6 +123,7 @@ context = new Search::Context{ {patterns.data(), patterns.data() + patterns.size() - 1 }, {patterns.data(), patterns.data() + patterns.size() }, config.ungapped_evalue, + config.ungapped_evalue_short, score_matrix.rawscore(config.short_query_ungapped_bitscore) }; diff -Nru diamond-aligner-2.0.6/src/search/stage2.cpp diamond-aligner-2.0.7/src/search/stage2.cpp --- diamond-aligner-2.0.6/src/search/stage2.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/search/stage2.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -40,6 +40,28 @@ namespace Search { namespace DISPATCH_ARCH { +static const int SHORT_QUERY_LEN = 85; + +static int ungapped_cutoff(int query_len, const Context& context) { +#ifdef UNGAPPED_SPOUGE + return query_len > config.short_query_max_len ? context.cutoff_table(query_len, 50) : context.short_query_ungapped_cutoff; +#else + if (query_len <= config.short_query_max_len) + return context.short_query_ungapped_cutoff; + else if (query_len <= SHORT_QUERY_LEN && align_mode.query_translated) + return context.cutoff_table_short(query_len); + else + return context.cutoff_table(query_len); +#endif +} + +static int ungapped_window(int query_len) { + if (query_len <= SHORT_QUERY_LEN && align_mode.query_translated) + return query_len; + else + return config.ungapped_window; +} + void search_query_offset(uint64_t q, const Packed_loc* s, const uint32_t *hits, @@ -59,14 +81,15 @@ int scores[N]; std::fill(scores, scores + N, INT_MAX); - const sequence query_clipped = Util::Sequence::clip(query - config.ungapped_window, config.ungapped_window * 2, config.ungapped_window); - const int window_left = int(query - query_clipped.data()), window = (int)query_clipped.length(); unsigned query_id = UINT_MAX, seed_offset = UINT_MAX; std::pair l = query_seqs::data_->local_position(q); query_id = (unsigned)l.first; seed_offset = (unsigned)l.second; const int query_len = query_seqs::data_->length(query_id); - const int score_cutoff = query_len > config.short_query_max_len ? context.cutoff_table(query_len) : context.short_query_ungapped_cutoff; + const int score_cutoff = ungapped_cutoff(query_len, context); + const int window = ungapped_window(query_len); + const sequence query_clipped = Util::Sequence::clip(query - window, window * 2, window); + const int window_left = int(query - query_clipped.data()), window_clipped = (int)query_clipped.length(); size_t hit_count = 0; const int interval_mod = config.left_most_interval > 0 ? seed_offset % config.left_most_interval : window_left, interval_overhang = std::max(window_left - interval_mod, 0); @@ -76,10 +99,15 @@ const size_t n = std::min(N, hits_end - i); for (size_t j = 0; j < n; ++j) subjects[j] = ref_seqs::data_->data(s[*(i + j)]) - window_left; - DP::window_ungapped_best(query_clipped.data(), subjects, n, window, scores); + DP::window_ungapped_best(query_clipped.data(), subjects, n, window_clipped, scores); for (size_t j = 0; j < n; ++j) { if (scores[j] > score_cutoff) { +#ifdef UNGAPPED_SPOUGE + std::pair l = ref_seqs::data_->local_position(s[*(i + j)]); + if (scores[j] < context.cutoff_table(query_len, ref_seqs::data_->length(l.first))) + continue; +#endif stats.inc(Statistics::TENTATIVE_MATCHES2); if (left_most_filter(query_clipped + interval_overhang, subjects[j] + interval_overhang, window_left - interval_overhang, shapes[sid].length_, context, sid == 0, sid, score_cutoff)) { stats.inc(Statistics::TENTATIVE_MATCHES3); diff -Nru diamond-aligner-2.0.6/src/stats/cbs.cpp diamond-aligner-2.0.7/src/stats/cbs.cpp --- diamond-aligner-2.0.6/src/stats/cbs.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/cbs.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -21,11 +21,36 @@ #include "cbs.h" #include "../basic/config.h" #include "score_matrix.h" +#include "../basic/masking.h" namespace Stats { -std::vector composition(const sequence& s) { - std::vector r(20, 0.0); +CBS comp_based_stats(0, -1.0, -1.0, -1.0); + +CBS::CBS(unsigned code, double query_match_distance_threshold, double length_ratio_threshold, double angle): + query_match_distance_threshold(-1.0), + length_ratio_threshold(-1.0), + angle(50.0) +{ + switch (code) { + case COMP_BASED_STATS_AND_MATRIX_ADJUST: + this->angle = 70.0; + this->query_match_distance_threshold = 0.16; + this->length_ratio_threshold = 3.0; + default: + ; + } + if (angle != -1.0) + this->angle = angle; + if (query_match_distance_threshold != 1.0) + this->query_match_distance_threshold = query_match_distance_threshold; + if (length_ratio_threshold != -1.0) + this->length_ratio_threshold = length_ratio_threshold; +} + +Composition composition(const sequence& s) { + Composition r; + r.fill(0.0); int n = 0; for (size_t i = 0; i < s.length(); ++i) { int l = s[i]; @@ -41,15 +66,38 @@ return r; } -TargetMatrix::TargetMatrix(const double* query_comp, int query_len, const sequence& target) +int count_true_aa(const sequence& s) { + int n = 0; + for (size_t i = 0; i < s.length(); ++i) + if ((size_t)s[i] < TRUE_AA) + ++n; + return n; +} + +bool use_seg_masking(const sequence& a, const sequence& b) { + if (config.comp_based_stats != CBS::COMP_BASED_STATS_AND_MATRIX_ADJUST || a.length() != b.length()) + return true; + size_t n = 0; + for (size_t i = 0; i < a.length(); ++i) + if (a[i] == b[i]) + ++n; + return n != a.length(); +} + +TargetMatrix::TargetMatrix(const Composition& query_comp, int query_len, const sequence& target) { if (!CBS::matrix_adjust(config.comp_based_stats)) return; + + vector target_seq = target.copy(); + //Masking::get()(target_seq.data(), target_seq.size(), Masking::Algo::SEG); - auto c = composition(target); + //auto c = composition(target); + auto c = composition(sequence(target_seq.data(), target_seq.size())); + EMatrixAdjustRule rule = eUserSpecifiedRelEntropy; if (CBS::conditioned(config.comp_based_stats)) { - auto r = s_TestToApplyREAdjustmentConditional(query_len, (int)target.length(), query_comp, c.data(), score_matrix.background_freqs()); - if (r == eCompoScaleOldMatrix) + rule = s_TestToApplyREAdjustmentConditional(query_len, (int)target.length(), query_comp.data(), c.data(), score_matrix.background_freqs()); + if (rule == eCompoScaleOldMatrix && config.comp_based_stats != CBS::COMP_BASED_STATS_AND_MATRIX_ADJUST) return; } @@ -57,15 +105,20 @@ scores32.resize(32 * AMINO_ACID_COUNT); score_min = INT_MAX; score_max = INT_MIN; - vector s = CompositionMatrixAdjust(query_len, (int)target.length(), query_comp, c.data(), config.cbs_matrix_scale, score_matrix.ungapped_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs()); + vector s; + + if (config.comp_based_stats == CBS::COMP_BASED_STATS || rule == eCompoScaleOldMatrix) + s = CompositionBasedStats(score_matrix.matrix32_scaled_pointers().data(), query_comp, c, score_matrix.ungapped_lambda(), score_matrix.freq_ratios()); + else + s = CompositionMatrixAdjust(query_len, count_true_aa(target), query_comp.data(), c.data(), config.cbs_matrix_scale, score_matrix.ideal_lambda(), score_matrix.joint_probs(), score_matrix.background_freqs()); for (size_t i = 0; i < AMINO_ACID_COUNT; ++i) { for (size_t j = 0; j < AMINO_ACID_COUNT; ++j) - if (i < 20 && j < 20) { - scores[i * 32 + j] = s[j * 20 + i]; - scores32[i * 32 + j] = s[j * 20 + i]; - score_min = std::min(score_min, s[j * 20 + i]); - score_max = std::max(score_max, s[j * 20 + i]); + if ((i < 20 || i == MASK_LETTER) && (j < 20 || j == MASK_LETTER)) { + scores[i * 32 + j] = s[j * AMINO_ACID_COUNT + i]; + scores32[i * 32 + j] = s[j * AMINO_ACID_COUNT + i]; + score_min = std::min(score_min, s[j * AMINO_ACID_COUNT + i]); + score_max = std::max(score_max, s[j * AMINO_ACID_COUNT + i]); //std::cerr << s2[j * 20 + i] << ' '; } else { diff -Nru diamond-aligner-2.0.6/src/stats/cbs.h diamond-aligner-2.0.7/src/stats/cbs.h --- diamond-aligner-2.0.6/src/stats/cbs.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/cbs.h 2021-02-12 11:50:56.000000000 +0000 @@ -23,10 +23,13 @@ #include #include #include "../basic/sequence.h" +#include "standard_matrix.h" namespace Stats { -std::vector composition(const sequence& s); +using Composition = std::array; + +Composition composition(const sequence& s); struct TargetMatrix { @@ -35,7 +38,7 @@ TargetMatrix(const int16_t* query_matrix, const int16_t* target_matrix); - TargetMatrix(const double* query_comp, int query_len, const sequence& target); + TargetMatrix(const Composition& query_comp, int query_len, const sequence& target); std::vector scores; std::vector scores32; @@ -82,8 +85,17 @@ in standard context */ } Blast_MatrixInfo; -void Blast_FreqRatioToScore(double** matrix, int rows, int cols, double Lambda); -void s_RoundScoreMatrix(int** matrix, int rows, int cols, double** floatScoreMatrix); +void Blast_FreqRatioToScore(double** matrix, size_t rows, size_t cols, double Lambda); +void s_RoundScoreMatrix(int** matrix, size_t rows, size_t cols, double** floatScoreMatrix); +int s_GetMatrixScoreProbs(double** scoreProb, int* obs_min, int* obs_max, + const int* const* matrix, int alphsize, + const double* subjectProbArray, + const double* queryProbArray); +double s_CalcLambda(double probs[], int min_score, int max_score, double lambda0); +double ideal_lambda(const int** matrix); +void s_SetXUOScores(double** M, int alphsize, const double row_probs[], const double col_probs[]); +int count_true_aa(const sequence& s); +bool use_seg_masking(const sequence& a, const sequence& b); EMatrixAdjustRule s_TestToApplyREAdjustmentConditional(int Len_query, @@ -97,6 +109,8 @@ switch (code) { case 0: case 4: + case COMP_BASED_STATS: + case COMP_BASED_STATS_AND_MATRIX_ADJUST: return false; case 1: case 2: @@ -114,6 +128,8 @@ case HAUSER_AND_AVG_MATRIX_ADJUST: case HAUSER_AND_MATRIX_ADJUST: case MATRIX_ADJUST: + case COMP_BASED_STATS: + case COMP_BASED_STATS_AND_MATRIX_ADJUST: return true; default: throw std::runtime_error("Unknown CBS code."); @@ -140,6 +156,7 @@ switch (code) { case HAUSER_AND_AVG_MATRIX_ADJUST: case HAUSER_AND_MATRIX_ADJUST: + case COMP_BASED_STATS_AND_MATRIX_ADJUST: return true; default: return false; @@ -159,6 +176,8 @@ case HAUSER_AND_AVG_MATRIX_ADJUST: case HAUSER_AND_MATRIX_ADJUST: case MATRIX_ADJUST: + case COMP_BASED_STATS: + case COMP_BASED_STATS_AND_MATRIX_ADJUST: return 1; default: return 0; @@ -170,20 +189,40 @@ HAUSER_AND_AVG_MATRIX_ADJUST = 2, HAUSER_AND_MATRIX_ADJUST = 3, MATRIX_ADJUST = 4, + COMP_BASED_STATS = 5, + COMP_BASED_STATS_AND_MATRIX_ADJUST = 6, COUNT }; + CBS(unsigned code, double query_match_distance_threshold, double length_ratio_threshold, double angle); + double query_match_distance_threshold; + double length_ratio_threshold; + double angle; static constexpr int AVG_MATRIX_SCALE = 32; }; std::vector CompositionMatrixAdjust(int query_len, int target_len, const double* query_comp, const double* target_comp, int scale, double ungapped_lambda, const double* joint_probs, const double* background_freqs); +std::vector CompositionBasedStats(const int* const* matrix_in, const Composition& queryProb, const Composition& resProb, double lambda, const FreqRatios& freq_ratios); +int Blast_OptimizeTargetFrequencies(double x[], + int alphsize, + int* iterations, + const double q[], + const double row_sums[], + const double col_sums[], + int constrain_rel_entropy, + double relative_entropy, + double tol, + int maxits); +bool OptimizeTargetFrequencies(double* out, const double* joints_prob, const double* row_probs, const double* col_probs, double relative_entropy, double tol, int maxits); inline int16_t* make_16bit_matrix(const std::vector& matrix) { int16_t* out = new int16_t[TRUE_AA * TRUE_AA]; - for (size_t i = 0; i < TRUE_AA * TRUE_AA; ++i) - out[i] = matrix[i]; + for (size_t i = 0; i < TRUE_AA; ++i) + for (size_t j = 0; j < TRUE_AA; ++j) + out[i * TRUE_AA + j] = matrix[i * AMINO_ACID_COUNT + j]; return out; } extern const int ALPH_TO_NCBI[]; +extern CBS comp_based_stats; } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/stats/comp_based_stats.cpp diamond-aligner-2.0.7/src/stats/comp_based_stats.cpp --- diamond-aligner-2.0.6/src/stats/comp_based_stats.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/comp_based_stats.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -18,6 +18,37 @@ along with this program. If not, see . ****/ +/* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ + +/** @file composition_adjustment.c + * Highest level functions to solve the optimization problem for + * compositional score matrix adjustment. + * + * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz + */ + #include #include #include @@ -82,7 +113,7 @@ *mat = NULL; } -inline int BLAST_Gcd(int a, int b) +static int BLAST_Gcd(int a, int b) { int c; @@ -98,7 +129,7 @@ return a; } -inline static double +static double NlmKarlinLambdaNR(double* probs, int d, int low, int high, double lambda0, double tolx, int itmax, int maxNewton, int* itn) { @@ -177,7 +208,7 @@ return -log(x) / d; } -inline double +double Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess) { int low; /* Lowest score (must be negative) */ @@ -211,7 +242,7 @@ return returnValue; } -inline static double +double s_CalcLambda(double probs[], int min_score, int max_score, double lambda0) { @@ -236,7 +267,7 @@ return Blast_KarlinLambdaNR(&freq, lambda0); } -inline static void s_GetScoreRange(int* obs_min, int* obs_max, +static void s_GetScoreRange(int* obs_min, int* obs_max, const int* const* matrix, int rows) { int aa; /* index of an amino-acid in the 20 @@ -258,7 +289,7 @@ *obs_max = maxScore; } -inline static int +int s_GetMatrixScoreProbs(double** scoreProb, int* obs_min, int* obs_max, const int* const* matrix, int alphsize, const double* subjectProbArray, @@ -295,58 +326,123 @@ } void -Blast_FreqRatioToScore(double** matrix, int rows, int cols, double Lambda) +Blast_FreqRatioToScore(double** matrix, size_t rows, size_t cols, double Lambda) { - int i; - for (i = 0; i < rows; i++) { - int j; - for (j = 0; j < cols; j++) { + for (size_t i = 0; i < rows; i++) { + for (size_t j = 0; j < cols; j++) { if (0.0 == matrix[i][j]) { matrix[i][j] = COMPO_SCORE_MIN; } else { - matrix[i][j] = log(matrix[i][j]) / Lambda; + matrix[i][j] = log(matrix[i][j]) / Lambda; } } } } void -s_RoundScoreMatrix(int** matrix, int rows, int cols, +s_RoundScoreMatrix(int** matrix, size_t rows, size_t cols, double** floatScoreMatrix) { - int p, c; /*indices over positions and characters*/ - - for (p = 0; p < rows; p++) { - for (c = 0; c < cols; c++) { + for (size_t p = 0; p < rows; p++) { + for (size_t c = 0; c < cols; c++) { if (floatScoreMatrix[p][c] < INT_MIN) { matrix[p][c] = INT_MIN; } else { - matrix[p][c] = std::round(floatScoreMatrix[p][c]); + matrix[p][c] = (int)std::round(floatScoreMatrix[p][c]); } } } } -inline static int + +static double +s_CalcAvgScore(double* M, int alphsize, int incM, const double probs[]) +{ + int j; /* iteration index */ + double score_iX = 0.0; /* score of character i substituted by X */ + + for (j = 0; j < alphsize; j++) { + //if (alphaConvert[j] >= 0) { + /* If the column corresponds to a true amino acid */ + score_iX += M[j * incM] * probs[j]; + //} + } + return score_iX; +} + +static const double kMaximumXscore = -1.0; + +static double +s_CalcXScore(double* M, int alphsize, int incM, const double probs[]) +{ + return std::min(s_CalcAvgScore(M, alphsize, incM, probs), kMaximumXscore); +} + +void +s_SetXUOScores(double** M, int alphsize, + const double row_probs[], const double col_probs[]) +{ + int i; /* iteration index */ + double score_XX = 0.0; /* score of matching an X to an X */ + /* the matrix has alphsize colums (this variable exists just to + make things easier to read) */ + const int cols = alphsize; + + for (i = 0; i < alphsize; i++) { + //if (alphaConvert[i] >= 0) { + double avg_iX = s_CalcAvgScore(M[i], alphsize, 1, col_probs); + M[i][MASK_LETTER] = std::min(avg_iX, kMaximumXscore); + score_XX += avg_iX * row_probs[i]; + + M[MASK_LETTER][i] = s_CalcXScore(&M[0][i], alphsize, AMINO_ACID_COUNT, row_probs); + //} + } + M[MASK_LETTER][MASK_LETTER] = std::min(score_XX, kMaximumXscore); + + /* Set X scores for pairwise ambiguity characters */ + //M[eBchar][eXchar] = s_CalcXScore(M[eBchar], alphsize, 1, col_probs); + //M[eXchar][eBchar] = s_CalcXScore(&M[0][eBchar], alphsize, cols, row_probs); + + //M[eZchar][eXchar] = s_CalcXScore(M[eZchar], alphsize, 1, col_probs); + //M[eXchar][eZchar] = s_CalcXScore(&M[0][eZchar], alphsize, cols, row_probs); + //if (alphsize > eJchar) { + // M[eJchar][eXchar] = s_CalcXScore(M[eJchar], alphsize, 1, col_probs); + // M[eXchar][eJchar] = + // s_CalcXScore(&M[0][eJchar], alphsize, cols, row_probs); + //} + ///* Copy C scores to U */ + //memcpy(M[eSelenocysteine], M[eCchar], alphsize * sizeof(double)); + //for (i = 0; i < alphsize; i++) { + // M[i][eSelenocysteine] = M[i][eCchar]; + //} + ///* Copy X scores to O */ + //if (alphsize > eOchar) { + // memcpy(M[eOchar], M[eXchar], alphsize * sizeof(double)); + // for (i = 0; i < alphsize; i++) { + // M[i][eOchar] = M[i][eXchar]; + // } + //} +} + +static int s_ScaleSquareMatrix(int** matrix, int alphsize, const double row_prob[], const double col_prob[], - double Lambda, const double (&freq_ratios)[NCBI_ALPH][NCBI_ALPH]) + double Lambda, const double(&freq_ratios)[NCBI_ALPH][NCBI_ALPH]) { double** scores; /* a double precision matrix of scores */ - int i; /* iteration index */ scores = Nlm_DenseMatrixNew(alphsize, alphsize); if (scores == 0) return -1; - for (i = 0; i < alphsize; i++) { - for (int j = 0; j < alphsize; ++j) + for (size_t i = 0; i < TRUE_AA; i++) { + for (size_t j = 0; j < TRUE_AA; ++j) scores[i][j] = freq_ratios[ALPH_TO_NCBI[i]][ALPH_TO_NCBI[j]]; //memcpy(scores[i], freq_ratios[i], alphsize * sizeof(double)); } - Blast_FreqRatioToScore(scores, alphsize, alphsize, Lambda); - //s_SetXUOScores(scores, alphsize, row_prob, col_prob); + Blast_FreqRatioToScore(scores, TRUE_AA, TRUE_AA, Lambda); + s_SetXUOScores(scores, TRUE_AA, row_prob, col_prob); s_RoundScoreMatrix(matrix, alphsize, alphsize, scores); /*for (i = 0; i < alphsize; i++) { matrix[i][(int)STOP_LETTER] = start_matrix[i][(int)STOP_LETTER]; @@ -368,16 +464,7 @@ double* scoreArray; /* an array of score probabilities */ int out_of_memory; /* status flag to indicate out of memory */ - /*if (ungappedLambda == 0.0) { - - s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max, matrix_in, 20, BLOSUM62_bg, BLOSUM62_bg); - ungappedLambda = s_CalcLambda(scoreArray, obs_min, obs_max, 0.5); - //std::cerr << "lambda=" << ungappedLambda << endl; - free(scoreArray); - - }*/ - - out_of_memory = s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max, matrix_in, 20, resProb, queryProb); + out_of_memory = s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max, matrix_in, TRUE_AA, resProb, queryProb); const double ungappedLambda = lambda / config.cbs_matrix_scale; if (out_of_memory) @@ -385,6 +472,9 @@ correctUngappedLambda = s_CalcLambda(scoreArray, obs_min, obs_max, ungappedLambda); + if (correctUngappedLambda < 0.0) + return -1; + /* calc_lambda will return -1 in the case where the * expected score is >=0; however, because of the MAX statement 3 * lines below, LambdaRatio should always be > 0; the succeeding @@ -399,8 +489,9 @@ if (*LambdaRatio > 0) { double scaledLambda = ungappedLambda / (*LambdaRatio); - s_ScaleSquareMatrix(matrix, 20, - queryProb, resProb, scaledLambda, freq_ratios); + if (s_ScaleSquareMatrix(matrix, AMINO_ACID_COUNT, + queryProb, resProb, scaledLambda, freq_ratios) < 0) + return -1; } free(scoreArray); @@ -408,4 +499,67 @@ return 0; } +vector CompositionBasedStats(const int* const* matrix_in, const Composition& queryProb, const Composition& resProb, double lambda, const FreqRatios& freq_ratios) { + vector v(AMINO_ACID_COUNT*AMINO_ACID_COUNT); + vector p; + p.reserve(AMINO_ACID_COUNT); + for (size_t i = 0; i < AMINO_ACID_COUNT; ++i) + p.push_back(&v[i * AMINO_ACID_COUNT]); + double LambdaRatio; + if (Blast_CompositionBasedStats(p.data(), &LambdaRatio, matrix_in, queryProb.data(), resProb.data(), lambda, freq_ratios) != 0) { + for (size_t i = 0; i < AMINO_ACID_COUNT; ++i) + for (size_t j = 0; j < AMINO_ACID_COUNT; ++j) + v[i * AMINO_ACID_COUNT + j] = score_matrix(i, j) * config.cbs_matrix_scale; + } + return v; +} + +/* amino acid background frequencies from Robinson and Robinson */ +static std::pair Robinson_prob[] = { + { 'A', 78.05 }, + { 'C', 19.25 }, + { 'D', 53.64 }, + { 'E', 62.95 }, + { 'F', 38.56 }, + { 'G', 73.77 }, + { 'H', 21.99 }, + { 'I', 51.42 }, + { 'K', 57.44 }, + { 'L', 90.19 }, + { 'M', 22.43 }, + { 'N', 44.87 }, + { 'P', 52.03 }, + { 'Q', 42.64 }, + { 'R', 51.29 }, + { 'S', 71.20 }, + { 'T', 58.41 }, + { 'V', 64.41 }, + { 'W', 13.30 }, + { 'Y', 32.16 } +}; /**< amino acid background frequencies from Robinson and Robinson */ + +double ideal_lambda(const int** matrix) { + int obs_min, obs_max; + double* scoreArray; + double bg[TRUE_AA]; + double s = 0.0; + for (size_t i = 0; i < TRUE_AA; ++i) { + int j = value_traits.from_char(Robinson_prob[i].first); + bg[j] = Robinson_prob[i].second; + s += bg[j]; + } + for (size_t i = 0; i < TRUE_AA; ++i) + bg[i] /= s; + + int out_of_memory = s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max, matrix, TRUE_AA, bg, bg); + + if (out_of_memory) + throw std::runtime_error("Failed lambda calculation."); + double correctUngappedLambda = s_CalcLambda(scoreArray, obs_min, obs_max, 0.5); + if (correctUngappedLambda < 0.0) + throw std::runtime_error("Failed lambda calculation."); + free(scoreArray); + return correctUngappedLambda; +} + } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/stats/matrix_adjust.cpp diamond-aligner-2.0.7/src/stats/matrix_adjust.cpp --- diamond-aligner-2.0.6/src/stats/matrix_adjust.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/matrix_adjust.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -18,21 +18,29 @@ along with this program. If not, see . ****/ -#include "../basic/config.h" -#include "score_matrix.h" -#include -#include -#include -#include -#include "../lib/blast/nlm_linear_algebra.h" -#include "cbs.h" - -namespace Stats { - -static constexpr int COMPO_NUM_TRUE_AA = 20; -static const int kReMatrixAdjustmentPseudocounts = 20; -/** relative entropy of BLOSUM62 */ -static const double kFixedReBlosum62 = 0.44; +/* =========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* ===========================================================================*/ /** * @file optimize_target_freq.c @@ -95,6 +103,30 @@ * @author E. Michael Gertz */ + +#include "../basic/config.h" +#include "score_matrix.h" +#include +#include +#include +#include +#include "../lib/blast/nlm_linear_algebra.h" +#include "cbs.h" + +namespace Stats { + +static constexpr int COMPO_NUM_TRUE_AA = 20; +static const int kReMatrixAdjustmentPseudocounts = 20; +/** relative entropy of BLOSUM62 */ +static const double kFixedReBlosum62 = 0.44; + +static void print(const double* v, int n, const char* s) { + /*std::cout << s << std::endl; + for (int i = 0; i < n; ++i) + std::cout << v[i] << ' '; + std::cout << std::endl;*/ +} + //#include /** bound on error for Newton's method */ @@ -258,6 +290,7 @@ rA[i + alphsize - 1] = row_sums[i]; } MultiplyByA(1.0, rA, alphsize, -1.0, x); + print(rA, 2 * 20 - 1, "ResidualsLinearConstraints rA"); } @@ -296,6 +329,7 @@ } } MultiplyByAtranspose(1.0, resids_x, alphsize, 1.0, z); + print(resids_x, n, "DualResiduals resids_x"); } @@ -355,6 +389,7 @@ } *rnorm = sqrt(norm_resids_x * norm_resids_x + norm_resids_z * norm_resids_z); + print(rnorm, 1, "CalculateResiduals rnorm"); } @@ -529,6 +564,7 @@ /* Then we compute J D^{-1} J^T; First fill in the part that corresponds * to the linear constraints */ ScaledSymmetricProductA(W, Dinv, alphsize); + print(W[0], 820, "FactorReNewtonSystem W"); if (constrain_rel_entropy) { /* Save the gradient of the relative entropy constraint. */ @@ -757,6 +793,7 @@ if (grads == NULL) goto error_return; ComputeScoresFromProbs(old_scores, alphsize, q, row_sums, col_sums); + print(old_scores, n, "old_scores"); /* Use q as the initial value for x */ memcpy(x, q, n * sizeof(double)); @@ -767,6 +804,8 @@ /* Compute the residuals */ EvaluateReFunctions(values, grads, alphsize, x, q, old_scores, constrain_rel_entropy); + print(values, 2, "values"); + print(grads[0], 2 * n, "grads"); CalculateResiduals(&rnorm, resids_x, alphsize, resids_z, values, grads, row_sums, col_sums, x, z, constrain_rel_entropy, relative_entropy); @@ -801,6 +840,7 @@ } } } + print(x, 20 * 20, "x"); converged = 0; if (its <= maxits && rnorm <= tol) { /* Newton's iteration converged */ @@ -857,7 +897,7 @@ /* Documented in composition_adjustment.h. */ void -Blast_TrueAaToStdTargetFreqs(double** StdFreq, int StdAlphsize, +Blast_TrueAaToStdTargetFreqs(double** StdFreq, size_t StdAlphsize, const double* freq) { /* Note I'm using a rough convention for this routine that uppercase @@ -867,37 +907,37 @@ */ /* Shorter names for the sizes of the two alphabets */ const int small_alphsize = COMPO_NUM_TRUE_AA; - int A, B; /* characters in the std (big) alphabet */ - int a, b; /* characters in the small alphabet */ + size_t A, B; /* characters in the std (big) alphabet */ + size_t a, b; /* characters in the small alphabet */ double sum; /* sum of values in target_freq; used to normalize */ sum = 0.0; for (a = 0; a < small_alphsize; a++) { for (b = 0; b < small_alphsize; b++) { - sum += freq[a * 20 + b]; + sum += freq[a * TRUE_AA + b]; } } for (A = 0; A < StdAlphsize; A++) { /* for all rows */ - //if (alphaConvert[A] < 0) { - // /* the row corresponds to a nonstandard reside */ - // for (B = 0; B < StdAlphsize; B++) { - // StdFreq[A][B] = 0.0; - // } - //} - //else { + if (A >= TRUE_AA) { + /* the row corresponds to a nonstandard reside */ + for (B = 0; B < StdAlphsize; B++) { + StdFreq[A][B] = 0.0; + } + } + else { /* the row corresponds to a standard reside */ a = A; for (B = 0; B < StdAlphsize; B++) { /* for all columns */ - if (B < 0) { + if (B >= TRUE_AA) { /* the column corresponds to a nonstandard reside */ StdFreq[A][B] = 0.0; } else { /* the column corresponds to a standard reside */ b = B; - StdFreq[A][B] = freq[a * 20 + b] / sum; + StdFreq[A][B] = freq[a * TRUE_AA + b] / sum; } } /* Set values for two-character ambiguities */ @@ -906,7 +946,7 @@ //if (StdAlphsize > eJchar) { // StdFreq[A][eJchar] = StdFreq[A][eIchar] + StdFreq[A][eLchar]; //} - //} + } } /* Add rows to set values for two-character ambiguities */ //memcpy(StdFreq[eBchar], StdFreq[eDchar], StdAlphsize * sizeof(double)); @@ -938,8 +978,6 @@ } } - - /** * Given a set of target frequencies and two sets of character * probabilities for the true amino acids in the ARND alphabet, @@ -957,7 +995,7 @@ * @param Lambda the desired scale of this matrix */ static int -s_ScoresStdAlphabet(int** Matrix, int Alphsize, +s_ScoresStdAlphabet(int** Matrix, size_t Alphsize, const double* target_freq, const double row_prob[], const double col_prob[], double Lambda) @@ -967,7 +1005,6 @@ * and lowercase letters refer to the true amino acid (smaller) * alphabet. */ - int i; /* row and column probabilities in the NCBIstdaa alphabet */ //double RowProb[COMPO_LARGEST_ALPHABET]; //double ColProb[COMPO_LARGEST_ALPHABET]; @@ -983,9 +1020,9 @@ //s_SetPairAmbigProbsToSum(ColProb, Alphsize); Blast_TrueAaToStdTargetFreqs(Scores, Alphsize, target_freq); - Blast_CalcFreqRatios(Scores, Alphsize, row_prob, col_prob); + Blast_CalcFreqRatios(Scores, TRUE_AA, row_prob, col_prob); Blast_FreqRatioToScore(Scores, Alphsize, Alphsize, Lambda); - //s_SetXUOScores(Scores, Alphsize, RowProb, ColProb); + s_SetXUOScores(Scores, TRUE_AA, row_prob, col_prob); s_RoundScoreMatrix(Matrix, Alphsize, Alphsize, Scores); Nlm_DenseMatrixFree(&Scores); @@ -998,9 +1035,8 @@ } /* Documented in composition_adjustment.h. */ -int +static int Blast_CompositionMatrixAdj(int** matrix, - int alphsize, EMatrixAdjustRule matrix_adjust_rule, int length1, int length2, @@ -1010,11 +1046,17 @@ const double* joint_probs, const double* background_freqs) { + /*for (int i = 0; i < 20; ++i) + printf("%.10f ", stdaa_row_probs[i]); + printf("\n"); + for (int i = 0; i < 20; ++i) + printf("%.10f ", stdaa_col_probs[i]); + printf("\n");*/ int iteration_count, status; double row_probs[COMPO_NUM_TRUE_AA], col_probs[COMPO_NUM_TRUE_AA]; /* Target RE when optimizing the matrix; zero if the relative entropy should not be constrained. */ - double dummy, desired_re = 0.0; + double desired_re = 0.0; std::copy(stdaa_row_probs, stdaa_row_probs + 20, row_probs); std::copy(stdaa_col_probs, stdaa_col_probs + 20, col_probs); @@ -1051,7 +1093,7 @@ Blast_ApplyPseudocounts(col_probs, length2, background_freqs); - vector mat_final(20 * 20); + vector mat_final(TRUE_AA * TRUE_AA); status = Blast_OptimizeTargetFrequencies(mat_final.data(), @@ -1068,7 +1110,7 @@ return status; return - s_ScoresStdAlphabet(matrix, alphsize, mat_final.data(), + s_ScoresStdAlphabet(matrix, AMINO_ACID_COUNT, mat_final.data(), row_probs, col_probs, lambda); } @@ -1215,9 +1257,9 @@ which_rule = eUserSpecifiedRelEntropy; } else { - if ((D_m_q >= config.query_match_distance_threshold) && - (len_large / len_small > config.length_ratio_threshold) && - (angle > config.cbs_angle)) { + if ((D_m_q > comp_based_stats.query_match_distance_threshold) && + (len_large / len_small > comp_based_stats.length_ratio_threshold) && + (angle > comp_based_stats.angle)) { which_rule = eCompoScaleOldMatrix; } else { @@ -1228,13 +1270,12 @@ } vector CompositionMatrixAdjust(int query_len, int target_len, const double* query_comp, const double* target_comp, int scale, double ungapped_lambda, const double* joint_probs, const double* background_freqs) { - vector v(TRUE_AA * TRUE_AA); + vector v(AMINO_ACID_COUNT * AMINO_ACID_COUNT); vector p; - p.reserve(TRUE_AA); - for (size_t i = 0; i < TRUE_AA; ++i) - p.push_back(&v[i * TRUE_AA]); + p.reserve(AMINO_ACID_COUNT); + for (size_t i = 0; i < AMINO_ACID_COUNT; ++i) + p.push_back(&v[i * AMINO_ACID_COUNT]); int r = Blast_CompositionMatrixAdj(p.data(), - TRUE_AA, eUserSpecifiedRelEntropy, query_len, target_len, @@ -1244,9 +1285,9 @@ joint_probs, background_freqs); if (r != 0) { - for (size_t i = 0; i < TRUE_AA; ++i) - for (size_t j = 0; j < TRUE_AA; ++j) - v[i * 20 + j] = score_matrix(i, j) * scale; + for (size_t i = 0; i < AMINO_ACID_COUNT; ++i) + for (size_t j = 0; j < AMINO_ACID_COUNT; ++j) + v[i * AMINO_ACID_COUNT + j] = score_matrix(i, j) * scale; //throw std::runtime_error("Error computing composition matrix adjust."); } return v; diff -Nru diamond-aligner-2.0.6/src/stats/matrix_adjust_eigen.cpp diamond-aligner-2.0.7/src/stats/matrix_adjust_eigen.cpp --- diamond-aligner-2.0.6/src/stats/matrix_adjust_eigen.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/matrix_adjust_eigen.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Max Planck Society for the Advancement of Science e.V. +Copyright (C) 2020-2021 Max Planck Society for the Advancement of Science e.V. Code developed by Benjamin Buchfink @@ -18,35 +18,432 @@ along with this program. If not, see . ****/ +#include +#include #include "../lib/Eigen/Dense" +#include "../basic/value.h" +#include "../util/profiler.h" + +// #define DYNAMIC using namespace Eigen; +using std::cout; +using std::endl; + +//#define DEBUG_OUT(x) cout << (x) << endl +#define DEBUG_OUT(x) + +static const size_t N = TRUE_AA; +typedef float Float; +const auto StorageOrder = RowMajor; +#ifdef DYNAMIC +typedef Matrix MatrixN; +typedef Matrix Matrix2Nx; +typedef Matrix VectorN; +typedef Matrix Vector2N; +typedef Matrix Vector2Nx; +typedef Matrix VectorNN; +typedef Matrix Vector2NN; +typedef decltype(Vector2Nx().head(Index())) Block2N; +#else +typedef Matrix MatrixN; +typedef Matrix Matrix2Nx; +typedef Matrix VectorN; +typedef Matrix Vector2N; +typedef Matrix Vector2Nx; +typedef Matrix VectorNN; +typedef Matrix Vector2NN; +typedef decltype(Vector2Nx().head<2 * N - 1>()) Block2N; +#endif +using Values = Float[2]; + +typedef struct ReNewtonSystem { +#ifdef DYNAMIC + ReNewtonSystem(): + W(2*N,2*N), + Dinv(N,N), + grad_re(N*N) + {} +#endif + Matrix2Nx W; /**< A lower-triangular matrix + representing a factorization of + the (2,2) block, -J D^{-1} J^T, of + the condensed linear system */ + MatrixN Dinv; /**< The diagonal elements of the + inverse of the necessarily + diagonal (1,1) block of the linear + system */ + VectorNN grad_re; /**< the gradient of the + relative-entropy constraint, if + this constraint is used. */ +} ReNewtonSystem; + +static void ScaledSymmetricProductA(Matrix2Nx& W, const MatrixN& diagonal) +{ + Profiler prof("ScaledSymmetricProductA"); + W.fill(0.0); + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < N; j++) { + const Float dd = diagonal(i, j); + + W(j, j) += dd; + if (i > 0) { + W(i + N - 1, j) += dd; + W(i + N - 1, i + N - 1) += dd; + } + } + } +} + +static void MultiplyByA(Float beta, Block2N y, Float alpha, const MatrixN& x) +{ + Profiler prof("MultiplyByA"); + if (beta == 0.0) + y.fill(0.0); + else if (beta != 1.0) { + /* rescale y */ + y *= beta; + } + + VectorN sc = x.colwise().sum() * alpha, sr = x.rowwise().sum() * alpha; + + for (size_t i = 0; i < N; ++i) + y(i) += sc(i); + + for (size_t i = 1; i < N; ++i) + y(i + N - 1) += sr(i); +} + +static void MultiplyByAtranspose(Float beta, MatrixN& y, Float alpha, const Vector2N& x) +{ + Profiler prof("MultiplyByAtranspose"); + if (beta == 0.0) { + /* Initialize y to zero, without reading any elements from y */ + y.fill(0.0); + } + else if (beta != 1.0) { + /* rescale y */ + y *= beta; + } + + VectorN v = x.head(); + v *= alpha; + y.rowwise() += v.transpose(); + + v = x.tail(); + v[0] = 0.0; + v *= alpha; + y.colwise() += v; +} + +static void ResidualsLinearConstraints(Block2N rA, const MatrixN& x, const VectorN& row_sums, const VectorN& col_sums) +{ +#ifdef DYNAMIC + rA.head(N) = col_sums; + rA.tail(N-1) = row_sums.tail(N-1); +#else + rA.head() = col_sums; + rA.tail() = row_sums.tail(); +#endif + MultiplyByA(1.0, rA, -1.0, x); + DEBUG_OUT("ResidualsLinearConstraints rA"); + DEBUG_OUT(rA); +} + +static void DualResiduals(MatrixN& resids_x, const Vector2NN& grads, const Vector2Nx& z) +{ + Float eta = z[2 * N - 1]; /* dual variable for the relative + entropy constraint */ + + VectorNN v = -grads.row(0) + eta * grads.row(1); + for (size_t i = 0; i < N; ++i) + resids_x.row(i) = v.segment(i * N); + MultiplyByAtranspose(1.0, resids_x, 1.0, z.head<2*N-1>()); + DEBUG_OUT("DualResiduals resids_x "); + DEBUG_OUT(resids_x); +} + +static void CalculateResiduals(Float* rnorm, + MatrixN& resids_x, + Vector2Nx& resids_z, + const Values& values, + const Vector2NN& grads, + const VectorN& row_sums, + const VectorN& col_sums, + const MatrixN& x, + const Vector2Nx& z, + Float relative_entropy) +{ + /* Euclidean norms of the primal and dual residuals */ + Float norm_resids_z, norm_resids_x; + + DualResiduals(resids_x, grads, z); + norm_resids_x = resids_x.norm(); + +#ifdef DYNAMIC + ResidualsLinearConstraints(resids_z.head(2*N-1), x, row_sums, col_sums); +#else + ResidualsLinearConstraints(resids_z.head<2 * N - 1>(), x, row_sums, col_sums); +#endif + resids_z[2 * N - 1] = relative_entropy - values[1]; + norm_resids_z = resids_z.norm(); + *rnorm = sqrt(norm_resids_x * norm_resids_x + norm_resids_z * norm_resids_z); + DEBUG_OUT("CalculateResiduals rnorm="); + DEBUG_OUT(*rnorm); +} + +static void FactorReNewtonSystem(ReNewtonSystem& newton_system, + const MatrixN& x, + const Vector2Nx& z, + const Vector2NN& grads, + MatrixN& workspace) +{ + Profiler prof("FactorReNewtonSystem"); + int n; /* the length of x */ + int m; /* the length of z */ + + /* Pointers to fields in newton_systems; the names of the local + * variables match the names of the fields. */ + Matrix2Nx& W = newton_system.W; + MatrixN& Dinv = newton_system.Dinv; + VectorNN& grad_re = newton_system.grad_re; + + n = N * N; + m = 2 * N; + + /* The original system has the form + * + * (D J^T) + * (J 0 ). + * + * We block reduce the system to + * + * (D J^T ) + * (0 -J D^{-1} J^T). + * + * First we find the inverse of the diagonal matrix D. */ + + Float eta; /* dual variable for the relative + entropy constraint */ + eta = z[m - 1]; + + Dinv = x; + Dinv /= 1 - eta; + + /* Then we compute J D^{-1} J^T; First fill in the part that corresponds + * to the linear constraints */ + ScaledSymmetricProductA(W, Dinv); + DEBUG_OUT("FactorReNewtonSystem W="); + DEBUG_OUT(W); + + /* Save the gradient of the relative entropy constraint. */ + grad_re = grads.row(1); + + /* Fill in the part of J D^{-1} J^T that corresponds to the relative + * entropy constraint. */ + W(m - 1, m - 1) = 0.0; + for (size_t i = 0; i < N; ++i) { +#ifdef DYNAMIC + workspace.row(i) = Dinv.row(i).cwiseProduct(grad_re.segment(i * N, N)); + W(m - 1, m - 1) += workspace.row(i).dot(grad_re.segment(i * N, N)); +#else + workspace.row(i) = Dinv.row(i).cwiseProduct(grad_re.segment(i * N)); + W(m - 1, m - 1) += workspace.row(i).dot(grad_re.segment(i * N)); +#endif + } + + Vector2Nx r = W.row(m - 1); +#ifdef DYNAMIC + MultiplyByA(0.0, r.head(2*N-1), 1.0, workspace); +#else + MultiplyByA(0.0, r.head<2 * N - 1>(), 1.0, workspace); +#endif + W.row(m - 1) = r; +} + +static void SolveReNewtonSystem(MatrixN& x, Vector2Nx& z, const ReNewtonSystem& newton_system, MatrixN& workspace) { + Profiler prof("SolveReNewtonSystem"); + const Matrix2Nx& W = newton_system.W; + const MatrixN& Dinv = newton_system.Dinv; + const VectorNN& grad_re = newton_system.grad_re; + + const int m = 2 * N; + + /* Apply the same block reduction to the right-hand side as was + * applied to the matrix: + * + * rzhat = rz - J D^{-1} rx + */ + workspace = x.cwiseProduct(Dinv); +#ifdef DYNAMIC + MultiplyByA(1.0, z.head(2*N-1), -1.0, workspace); +#else + MultiplyByA(1.0, z.head<2 * N - 1>(), -1.0, workspace); +#endif + + for (size_t i = 0; i < N; ++i) + z[m - 1] -= grad_re.segment(i * N).dot(workspace.row(i)); + + /* Solve for step in z, using the inverse of J D^{-1} J^T */ + Profiler prof2("LLT"); + z = W.llt().solve(z); + prof.finish(); + + /* Backsolve for the step in x, using the newly-computed step in z. + * + * x = D^{-1) (rx + J\T z) + */ + for (size_t i = 0; i < N; ++i) + x.row(i) += grad_re.segment(i * N) * z[m - 1]; + + MultiplyByAtranspose(1.0, x, 1.0, z.head<2 * N - 1>()); + x.array() *= Dinv.array(); +} + +static void EvaluateReFunctions(Values& values, Vector2NN& grads, const MatrixN& x, const MatrixN& q, const MatrixN& scores) +{ + Profiler prof("EvaluateReFunctions"); + values[0] = 0.0; values[1] = 0.0; + + MatrixN tmp = (x.array() / q.array()).log(); + for (size_t i = 0; i < N; ++i) { + values[0] += x.row(i).dot(tmp.row(i)); + grads.row(0).segment(i * N) = tmp.row(i); + grads.row(0).segment(i * N).array() += 1.0; + } + + tmp.array() += scores.array(); + for (size_t i = 0; i < N; ++i) { + values[1] += x.row(i).dot(tmp.row(i)); + grads.row(1).segment(i * N) = tmp.row(i); + grads.row(1).segment(i * N).array() += 1.0; + } +} + +static void ComputeScoresFromProbs(MatrixN& scores, + const MatrixN& target_freqs, + const VectorN& row_freqs, + const VectorN& col_freqs) +{ + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < N; j++) { + scores(i, j) = log(target_freqs(i, j) / (row_freqs[i] * col_freqs[j])); + } + } +} + +static Float Nlm_StepBound(const MatrixN& x, const MatrixN& step_x, Float max) +{ + Float alpha = max; /* current largest permitted step */ + + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < N; ++j) { + Float alpha_i; /* a step to the boundary for the current i */ + + alpha_i = -x(i, j) / step_x(i, j); + if (alpha_i >= 0 && alpha_i < alpha) { + alpha = alpha_i; + } + } + } + return alpha; +} + +static bool Blast_OptimizeTargetFrequencies(MatrixN& x, + const MatrixN& q, + const VectorN& row_sums, + const VectorN& col_sums, + Float relative_entropy, + Float tol, + int maxits) +{ + DEBUG_OUT("========================================================================================================"); +#ifdef DYNAMIC + Values values; /* values of the nonlinear functions at this iterate */ + Vector2NN grads(2,N*N); /* gradients of the nonlinear functions at this iterate */ + ReNewtonSystem newton_system; /* factored matrix of the linear system to be solved at this iteration */ + Vector2Nx z(2*N); /* dual variables (Lagrange multipliers) */ + z.fill(0.0); + MatrixN resids_x(N,N); /* dual residuals (gradient of Lagrangian) */ + Vector2Nx resids_z(2*N); /* primal (constraint) residuals */ + Float rnorm; /* norm of the residuals for the current iterate */ + MatrixN old_scores(N,N); /* a scoring matrix, with lambda = 1, generated from q, row_sums and col_sums */ + MatrixN workspace(N,N); /* A vector for intermediate computations */ +#else + Values values; /* values of the nonlinear functions at this iterate */ + Vector2NN grads; /* gradients of the nonlinear functions at this iterate */ + ReNewtonSystem newton_system; /* factored matrix of the linear system to be solved at this iteration */ + Vector2Nx z; /* dual variables (Lagrange multipliers) */ + z.fill(0.0); + MatrixN resids_x; /* dual residuals (gradient of Lagrangian) */ + Vector2Nx resids_z; /* primal (constraint) residuals */ + Float rnorm; /* norm of the residuals for the current iterate */ + MatrixN old_scores; /* a scoring matrix, with lambda = 1, generated from q, row_sums and col_sums */ + MatrixN workspace; /* A vector for intermediate computations */ +#endif + + ComputeScoresFromProbs(old_scores, q, row_sums, col_sums); + DEBUG_OUT(old_scores); + + /* Use q as the initial value for x */ + x = q; + int its = 0; /* Initialize the iteration count. Note that we may converge in zero iterations if the initial x is optimal. */ + while (its <= maxits) { + /* Compute the residuals */ + EvaluateReFunctions(values, grads, x, q, old_scores); + DEBUG_OUT("Values "); + DEBUG_OUT(values[0]); + DEBUG_OUT(values[1]); + DEBUG_OUT("Grads"); + DEBUG_OUT(grads); + + CalculateResiduals(&rnorm, resids_x, resids_z, values, + grads, row_sums, col_sums, x, z, relative_entropy); + + /* and check convergence; the test correctly handles the case + in which rnorm is NaN (not a number). */ + if (!(rnorm > tol)) { + /* We converged at the current iterate */ + break; + } + + if (++its <= maxits) { + FactorReNewtonSystem(newton_system, x, z, grads, workspace); + SolveReNewtonSystem(resids_x, resids_z, newton_system, workspace); + + /* Calculate a value of alpha that ensure that x is positive */ + const Float alpha = Nlm_StepBound(x, resids_x, Float(1.0 / .95)) * 0.95; + + x += alpha * resids_x; + z += alpha * resids_z; + } + + } + DEBUG_OUT(x); + return its <= maxits && rnorm <= tol && z[2*N-1] < 1.0; +} + +namespace Stats { + +bool OptimizeTargetFrequencies(double* out, const double* joints_prob, const double* row_probs, const double* col_probs, double relative_entropy, double tol, int maxits) { +#ifdef DYNAMIC + MatrixN x(N,N) , q(N,N); + VectorN row_sums(N), col_sums(N); +#else + MatrixN x, q; + VectorN row_sums, col_sums; +#endif + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) + q(i, j) = (Float)joints_prob[i * N + j]; + row_sums[i] = (Float)row_probs[i]; + col_sums[i] = (Float)col_probs[i]; + } + bool r = Blast_OptimizeTargetFrequencies(x, q, row_sums, col_sums, relative_entropy, tol, maxits); + for (size_t i = 0; i < N; ++i) + for (size_t j = 0; j < N; ++j) + out[i * N + j] = x(i, j); + return r; +} -//static void -//ScaledSymmetricProductA(double** W, const Matrix& diagonal, int alphsize) -//{ -// int rowW, colW; /* iteration indices over the rows and columns of W */ -// int i, j; /* iteration indices over characters in the alphabet */ -// int m; /* The number of rows in A; also the size of W */ -// -// m = 2 * alphsize - 1; -// -// for (rowW = 0; rowW < m; rowW++) { -// for (colW = 0; colW <= rowW; colW++) { -// W[rowW][colW] = 0.0; -// } -// } -// for (i = 0; i < alphsize; i++) { -// for (j = 0; j < alphsize; j++) { -// double dd; /* an individual diagonal element */ -// -// dd = diagonal[i * alphsize + j]; -// -// W[j][j] += dd; -// if (i > 0) { -// W[i + alphsize - 1][j] += dd; -// W[i + alphsize - 1][i + alphsize - 1] += dd; -// } -// } -// } -//} \ No newline at end of file +} \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/stats/score_matrix.cpp diamond-aligner-2.0.7/src/stats/score_matrix.cpp --- diamond-aligner-2.0.6/src/stats/score_matrix.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/score_matrix.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -43,11 +43,13 @@ gap_extend_ (gap_extend == -1 ? standard_matrix_->default_gap_extend : gap_extend), frame_shift_(frameshift), db_letters_ ((double)db_letters), + scale_(scale), name_(matrix), matrix8_(standard_matrix_->scores.data(), stop_match_score), matrix32_(standard_matrix_->scores.data(), stop_match_score), matrix32_scaled_(standard_matrix_->freq_ratios, standard_matrix_->ungapped_constants().Lambda, standard_matrix_->scores.data(), scale), bias_((char)(-low_score())), + ideal_lambda_(Stats::ideal_lambda(matrix32_.pointers().data())), matrix8u_(standard_matrix_->scores.data(), stop_match_score, bias_), matrix8_low_(standard_matrix_->scores.data(), stop_match_score, 0, 16), matrix8_high_(standard_matrix_->scores.data(), stop_match_score, 0, 16, 16), @@ -186,7 +188,7 @@ if (i < TRUE_AA && j < TRUE_AA) data[i * 32 + j] = std::round(std::log(freq_ratios[Stats::ALPH_TO_NCBI[i]][Stats::ALPH_TO_NCBI[j]]) / lambda * scale); else if (i < n && j < n) - data[i * 32 + j] = std::max((int)scores[i * n + j] * scale, SCHAR_MIN); + data[i * 32 + j] = (int)scores[i * n + j] * scale; else data[i * 32 + j] = SCHAR_MIN; } @@ -204,7 +206,12 @@ double Score_matrix::evalue(int raw_score, unsigned query_len, unsigned subject_len) const { - return evaluer.evalue(raw_score, query_len, subject_len) * (double)db_letters_ / (double)subject_len; + return evaluer.evalue((double)raw_score / scale_, query_len, subject_len) * (double)db_letters_ / (double)subject_len; +} + +double Score_matrix::evalue_norm(int raw_score, unsigned query_len, unsigned subject_len) const +{ + return evaluer.evalue((double)raw_score / scale_, query_len, subject_len) * (double)1e9 / (double)subject_len; } bool Score_matrix::report_cutoff(int score, double evalue) const { diff -Nru diamond-aligner-2.0.6/src/stats/score_matrix.h diamond-aligner-2.0.7/src/stats/score_matrix.h --- diamond-aligner-2.0.6/src/stats/score_matrix.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/score_matrix.h 2021-02-12 11:50:56.000000000 +0000 @@ -102,8 +102,11 @@ char bias() const { return bias_; } - double bitscore(int raw_score) const - { return ( lambda() * raw_score - ln_k()) / LN_2; } + double bitscore(double raw_score) const + { + const double s = std::round(raw_score / scale_); // maintain compatibility with BLAST + return ( lambda() * s - ln_k()) / LN_2; + } double rawscore(double bitscore, double) const { return (bitscore*LN_2 + ln_k()) / lambda(); } @@ -112,14 +115,15 @@ { return (int)ceil(rawscore(bitscore, double ())); } double evalue(int raw_score, unsigned query_len, unsigned subject_len) const; + double evalue_norm(int raw_score, unsigned query_len, unsigned subject_len) const; double evalue_norm(int raw_score, int query_len) const { - return 1e9 * query_len * pow(2, -bitscore(raw_score)); + return 1e9 * query_len * pow(2, -bitscore(raw_score * scale_)); } - double bitscore(double evalue, unsigned query_len) const - { return -log(evalue/db_letters_/query_len)/log(2); } + /*double bitscore(double evalue, unsigned query_len) const + { return -log(evalue/db_letters_/query_len)/log(2); }*/ double bitscore_norm(double evalue, unsigned query_len) const { @@ -181,6 +185,14 @@ return standard_matrix_->ungapped_constants().Lambda; } + double ideal_lambda() const { + return ideal_lambda_; + } + + const Stats::FreqRatios& freq_ratios() const { + return standard_matrix_->freq_ratios; + } + double avg_id_score() const; bool report_cutoff(int score, double evalue) const; @@ -217,12 +229,12 @@ const Stats::StandardMatrix* standard_matrix_; const int8_t* score_array_; int gap_open_, gap_extend_, frame_shift_; - double db_letters_; - double ln_k_; + double db_letters_, ln_k_, scale_; std::string name_; Scores matrix8_; Scores matrix32_, matrix32_scaled_; int8_t bias_; + double ideal_lambda_; Scores matrix8u_; Scores matrix8_low_; Scores matrix8_high_; diff -Nru diamond-aligner-2.0.6/src/stats/standard_matrix.h diamond-aligner-2.0.7/src/stats/standard_matrix.h --- diamond-aligner-2.0.6/src/stats/standard_matrix.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/standard_matrix.h 2021-02-12 11:50:56.000000000 +0000 @@ -31,6 +31,7 @@ const double INT2_MAX = DBL_MAX; const size_t NCBI_ALPH = 28; +using FreqRatios = double[NCBI_ALPH][NCBI_ALPH]; struct StandardMatrix { @@ -50,10 +51,10 @@ int default_gap_exist, default_gap_extend; std::vector parameters; - std::array scores; + std::array scores; double joint_probs[TRUE_AA][TRUE_AA]; std::array background_freqs; - double freq_ratios[NCBI_ALPH][NCBI_ALPH]; + FreqRatios freq_ratios; const Parameters& constants(int gap_exist, int gap_extend) const; const Parameters& ungapped_constants() const; diff -Nru diamond-aligner-2.0.6/src/stats/stats.cpp diamond-aligner-2.0.7/src/stats/stats.cpp --- diamond-aligner-2.0.6/src/stats/stats.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/stats/stats.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -20,6 +20,7 @@ #include #include +#include #include "standard_matrix.h" using std::string; @@ -31,7 +32,7 @@ const StandardMatrix& StandardMatrix::get(const std::string& name) { string n; - std::transform(name.begin(), name.end(), std::back_inserter(n), [](unsigned char c) { return std::tolower(c); }); + std::transform(name.begin(), name.end(), std::back_inserter(n), [](unsigned char c) { return tolower(c); }); auto it = matrices.find(n); if (it == matrices.end()) throw std::runtime_error("Unknown scoring matrix: " + name); diff -Nru diamond-aligner-2.0.6/src/test/simulate.cpp diamond-aligner-2.0.7/src/test/simulate.cpp --- diamond-aligner-2.0.6/src/test/simulate.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/test/simulate.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -93,7 +93,7 @@ } void mutate() { - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string id; vector seq; input_value_traits = value_traits = nucleotide_traits; diff -Nru diamond-aligner-2.0.6/src/test/test_cases.cpp diamond-aligner-2.0.7/src/test/test_cases.cpp --- diamond-aligner-2.0.6/src/test/test_cases.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/test/test_cases.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -48,26 +48,26 @@ }; const vector ref_hashes = { -0xa941ea1bcaae9cb3, -0xa941ea1bcaae9cb3, -0x7992486f9bc878e8, -0x6f1103a94ffd1a2b, -0xb5b52a7733b476a2, -0xeaf960fbf7c63ff9, -0xa839eaaf7c454ff2, -0x6f1103a94ffd1a2b, -0x4b7ca89df4038f3d, -0x8b983cbaaff963ff, -0x113e1df71d47ee35, -0xc955cd40c085e64d, -0x92297cfae3e80486, -0x563e4f33df3c673d, -0x6a9d5bf640fc1f4b, -0xfc5ca0d04d30faca, -0xf3dadda955ca2d30, -0x45e4056064e260c6, -0xdffb0103534fe08f, -0x778a9e9e5f7a6d64, +0x602762c977aa8682, +0x602762c977aa8682, +0x38498d4f4d3eb7c9, +0x44d8f0f470123331, +0x544de0f64c7b7b0, +0x9164580efdd15995, +0x9a54b156f8f2146a, +0x44d8f0f470123331, +0xc5e10fd8002fb6eb, +0x794cb4944f11ffdc, +0xdfe1489c8ea1b4b6, +0xd0350017fe8f8fda, +0xc56f46f150f65fc1, +0xf1274743d0f712bc, +0x5298dd163b9666b3, +0xd20b29b1abecd9c4, +0xae7bc1145b22152f, +0xc43258834622128e, +0x5d81f357aacf347c, +0x58c74e056adf9a71, }; } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/test/test.cpp diamond-aligner-2.0.7/src/test/test.cpp --- diamond-aligner-2.0.6/src/test/test.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/test/test.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -25,6 +25,7 @@ #include #include #include +#include #include "../util/io/temp_file.h" #include "../util/io/text_input_file.h" #include "test.h" @@ -41,10 +42,11 @@ using std::string; using std::vector; using std::cout; +using std::list; namespace Test { -size_t run_testcase(size_t i, DatabaseFile &db, TextInputFile &query_file, size_t max_width, bool bootstrap, bool log, bool to_cout) { +size_t run_testcase(size_t i, DatabaseFile &db, list &query_file, size_t max_width, bool bootstrap, bool log, bool to_cout) { vector args = tokenize(test_cases[i].command_line, " "); args.emplace(args.begin(), "diamond"); if (log) @@ -53,7 +55,7 @@ statistics.reset(); Workflow::Search::Options opt; opt.db = &db; - query_file.rewind(); + query_file.front().rewind(); opt.query_file = &query_file; if (to_cout) { @@ -94,7 +96,8 @@ TempFile proteins; for (size_t i = 0; i < seqs.size(); ++i) Util::Sequence::format(sequence::from_string(seqs[i].second.c_str()), seqs[i].first.c_str(), nullptr, proteins, "fasta", amino_acid_traits); - TextInputFile query_file(proteins); + list query_file; + query_file.emplace_back(proteins); timer.finish(); config.command = Config::makedb; @@ -110,7 +113,7 @@ cout << endl << "#Test cases passed: " << passed << '/' << n << endl; // << endl; - query_file.close_and_delete(); + query_file.front().close_and_delete(); db.close(); delete db_file; return passed == n ? 0 : 1; diff -Nru diamond-aligner-2.0.6/src/tools/benchmark.cpp diamond-aligner-2.0.7/src/tools/benchmark.cpp --- diamond-aligner-2.0.6/src/tools/benchmark.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/tools/benchmark.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -37,6 +37,8 @@ #include "../util/simd/vector.h" #include "../util/simd/transpose.h" #include "../dp/scan_diags.h" +#include "../stats/cbs.h" +#include "../util/profiler.h" void benchmark_io(); @@ -212,19 +214,19 @@ high_resolution_clock::time_point t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, nullptr, Frame(0), nullptr, DP::FULL_MATRIX, stat); + volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, {}, nullptr, Frame(0), nullptr, DP::FULL_MATRIX, stat); } cout << "SWIPE (int8_t):\t\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * query.length() * s2.length() * CHANNELS) * 1000 << " ps/Cell" << endl; t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, nullptr, Frame(0), &cbs, DP::FULL_MATRIX, stat); + volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, {}, nullptr, Frame(0), &cbs, DP::FULL_MATRIX, stat); } cout << "SWIPE (int8_t, CBS):\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * query.length() * s2.length() * CHANNELS) * 1000 << " ps/Cell" << endl; t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, nullptr, Frame(0), nullptr, DP::FULL_MATRIX | DP::TRACEBACK, stat); + volatile list v = ::DP::BandedSwipe::swipe(query, target8, target16, {}, nullptr, Frame(0), nullptr, DP::FULL_MATRIX | DP::TRACEBACK, stat); } cout << "SWIPE (int8_t, TB):\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * query.length() * s2.length() * CHANNELS) * 1000 << " ps/Cell" << endl; } @@ -241,19 +243,19 @@ Bias_correction cbs(s1); high_resolution_clock::time_point t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, nullptr, Frame(0), &cbs, 0, stat); + volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, {}, nullptr, Frame(0), &cbs, 0, stat); } cout << "Banded SWIPE (int16_t, CBS):\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * s1.length() * 65 * 16) * 1000 << " ps/Cell" << endl; t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, nullptr, Frame(0), nullptr, 0, stat); + volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, {}, nullptr, Frame(0), nullptr, 0, stat); } cout << "Banded SWIPE (int16_t):\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * s1.length() * 65 * 16) * 1000 << " ps/Cell" << endl; t1 = high_resolution_clock::now(); for (size_t i = 0; i < n; ++i) { - volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, nullptr, Frame(0), &cbs, DP::TRACEBACK, stat); + volatile auto out = ::DP::BandedSwipe::swipe(s1, target8, target16, {}, nullptr, Frame(0), &cbs, DP::TRACEBACK, stat); } cout << "Banded SWIPE (int16_t, CBS, TB):" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n * s1.length() * 65 * 16) * 1000 << " ps/Cell" << endl; } @@ -289,6 +291,39 @@ cout << "Evalue (ALP):\t\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n) << " ns" << endl; } +void matrix_adjust(const sequence& s1, const sequence& s2) { + static const size_t n = 10000llu; + high_resolution_clock::time_point t1 = high_resolution_clock::now(); + vector mat_final(TRUE_AA * TRUE_AA); + int iteration_count; + const double* joint_probs = (const double*)(Stats::blosum62.joint_probs); + auto row_probs = Stats::composition(s1), col_probs = Stats::composition(s2); + config.cbs_err_tolerance = 0.0001; + + for (size_t i = 0; i < n; ++i) { + Stats::Blast_OptimizeTargetFrequencies(mat_final.data(), + TRUE_AA, + &iteration_count, + joint_probs, + row_probs.data(), col_probs.data(), + true, + 0.44, + config.cbs_err_tolerance, + config.cbs_it_limit); + } + + cout << "Matrix adjust:\t\t\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n) << " ms" << endl; + + t1 = high_resolution_clock::now(); + for (size_t i = 0; i < n; ++i) { + Stats::OptimizeTargetFrequencies(mat_final.data(), joint_probs, row_probs.data(), col_probs.data(), 0.44, config.cbs_err_tolerance, config.cbs_it_limit); + } + + cout << "Matrix adjust (vectorized):\t" << (double)duration_cast(high_resolution_clock::now() - t1).count() / (n) << " micros" << endl; + + //Profiler::print(n); +} + void benchmark() { if (config.type == "io") { benchmark_io(); @@ -305,6 +340,8 @@ sequence ss1 = sequence(s1).subseq(34, s1.size()); sequence ss2 = sequence(s2).subseq(33, s2.size()); + matrix_adjust(s1, s2); + #ifdef __SSE4_1__ swipe(s3, s4); diag_scores(s1, s2); diff -Nru diamond-aligner-2.0.6/src/tools/roc.cpp diamond-aligner-2.0.7/src/tools/roc.cpp --- diamond-aligner-2.0.6/src/tools/roc.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/tools/roc.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -109,23 +109,27 @@ struct Histogram { - Histogram() { - std::fill(false_positives.begin(), false_positives.end(), 0); - std::fill(coverage.begin(), coverage.end(), 0.0); - } + static constexpr double MAX_EV = 10000.0; + + Histogram() : + bin_offset(-std::floor((double)DBL_MIN_EXP* log(2.0)* config.log_evalue_scale)), + bin_count(bin_offset + int(std::round(std::log(MAX_EV) * config.log_evalue_scale)) + 1), + false_positives(bin_count, 0), + coverage(bin_count, 0.0) + {} - static int bin(double evalue) { + int bin(double evalue) { if (evalue == 0.0) return 0; - int bin = int(std::round(std::log2(evalue) * MULT)) - (DBL_MIN_EXP*MULT); + int bin = int(std::round(std::log(evalue) * config.log_evalue_scale)) + bin_offset; bin = std::max(bin, 0); - if (bin < 0 || bin >= BINS) + if (bin < 0 || bin >= bin_count) throw std::runtime_error("Evalue exceeds binning range."); return bin; } Histogram& operator+=(const Histogram& h) { - for (int i = 0; i < BINS; ++i) { + for (int i = 0; i < bin_count; ++i) { false_positives[i] += h.false_positives[i]; coverage[i] += h.coverage[i]; } @@ -133,15 +137,14 @@ } friend std::ostream& operator<<(std::ostream& os, const Histogram& h) { - for (int i = 0; i < BINS; ++i) + for (int i = 0; i < h.bin_count; ++i) os << (h.coverage[i] / config.query_count) << '\t' << ((double)h.false_positives[i] / config.query_count) << endl; return os; } - enum { MULT = 1, BINS = (-DBL_MIN_EXP + 15)*MULT }; - - array false_positives; - array coverage; + int bin_offset, bin_count; + vector false_positives; + vector coverage; }; @@ -169,13 +172,14 @@ auto i = acc2fam.equal_range(query); int n = 0; for (auto j = i.first; j != i.second; ++j) { - family_idx[j->second] = family_idx.size(); + const int next = (int)family_idx.size(); + family_idx[j->second] = next; ++n; } - false_positives.insert(false_positives.end(), Histogram::BINS, 0); + false_positives.insert(false_positives.end(), histogram.bin_count, 0); true_positives.reserve(n); for (int i = 0; i < n; ++i) - true_positives.emplace_back(Histogram::BINS, 0); + true_positives.emplace_back(histogram.bin_count, 0); } } @@ -185,7 +189,7 @@ auto it = family_idx.find(family); if (it == family_idx.end()) return; - ++true_positives[it->second][Histogram::bin(evalue)]; + ++true_positives[it->second][histogram.bin(evalue)]; } enum { UNKNOWN = 0, TP = 1, FP = 2 }; @@ -204,7 +208,7 @@ if (sseqid[0] == '\\') { have_rev_hit = true; if (get_roc) - ++false_positives[Histogram::bin(evalue)]; + ++false_positives[histogram.bin(evalue)]; return FP; } else { @@ -213,6 +217,7 @@ if (i.first == i.second) throw std::runtime_error("Accession not mapped."); bool same_fold = false; + for (auto j = i.first; j != i.second; ++j) { if (!have_rev_hit) ++count[j->second]; @@ -225,7 +230,7 @@ if (!config.no_forward_fp && !same_fold) { have_rev_hit = true; if(get_roc) - ++false_positives[Histogram::bin(evalue)]; + ++false_positives[histogram.bin(evalue)]; return FP; } return match_query ? TP : UNKNOWN; @@ -250,13 +255,13 @@ void update_hist(Histogram& hist, const vector& fam_count) { int t = 0; - for (int i = 0; i < Histogram::BINS; ++i) { + for (int i = 0; i < histogram.bin_count; ++i) { t += false_positives[i]; hist.false_positives[i] += (size_t)t; } const int n = family_count(); vector s(n, 0); - for (int i = 0; i < Histogram::BINS; ++i) { + for (int i = 0; i < histogram.bin_count; ++i) { double cov = 0.0; for(auto it = family_idx.begin(); it != family_idx.end(); ++it) { s[it->second] += true_positives[it->second][i]; @@ -297,6 +302,7 @@ int c = stats.add(acc, evalue, acc2fam); if ((c == QueryStats::TP && config.output_hits) || (c == QueryStats::FP && config.output_fp)) cout << line << endl; + } const double a = stats.auc1(fam_count, acc2fam_query); if (get_roc) @@ -338,6 +344,7 @@ } void roc() { + histogram = Histogram(); get_roc = !config.roc_file.empty(); if (config.family_map.empty()) @@ -371,7 +378,7 @@ threads.emplace_back(worker); timer.go("Processing alignments"); - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string query, acc; size_t n = 0, queries = 0, buf_size = 0; string* buf = new string; diff -Nru diamond-aligner-2.0.6/src/tools/rocid.cpp diamond-aligner-2.0.7/src/tools/rocid.cpp --- diamond-aligner-2.0.6/src/tools/rocid.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/tools/rocid.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -16,28 +16,31 @@ uint8_t id, fam_idx; }; -static string query_aln; +static string query_aln, query_mapped; static vector> totals, counts; static map fam2idx; static unordered_multimap acc2id; -static size_t unmapped_query = 0; +static size_t unmapped_query = 0, total_unmapped = 0; -static void fetch_map(TextInputFile& map_in, const string& query) { - string q, target, family; +static bool fetch_map(TextInputFile& map_in, const string& query) { + string q, target, family, next_query; float id; acc2id.clear(); fam2idx.clear(); counts.clear(); totals.clear(); + query_mapped.clear(); while (map_in.getline(), !map_in.eof()) { Util::String::Tokenizer(map_in.line, "\t") >> q >> target >> id >> family; - if (q != query) { - if (q < query) - continue; - else { - map_in.putback_line(); - return; - } + if (next_query.empty()) { + next_query = q; + query_mapped = q; + if (next_query > query) + return true; + } + if (q != next_query) { + map_in.putback_line(); + return next_query == query; } auto it = fam2idx.emplace(family, (uint8_t)fam2idx.size()); if (it.second) { @@ -51,12 +54,16 @@ acc2id.emplace(target, Assoc { (uint8_t)bin, fam_idx }); ++totals[(size_t)fam_idx][bin]; } + return (next_query == query) || (next_query.empty() && map_in.eof()); } static void print() { - if (unmapped_query) + if (unmapped_query || query_mapped > query_aln || query_mapped.empty()) { + message_stream << "Unmapped query: " << query_aln << endl; + ++total_unmapped; return; - cout << query_aln; + } + cout << query_mapped; for (int i = 0; i < 10; ++i) { double s = 0.0, n = 0.0; for (size_t fam_idx = 0; fam_idx < fam2idx.size(); ++fam_idx) { @@ -71,7 +78,7 @@ } void roc_id() { - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string query, target; size_t n = 0, queries = 0, unmapped = 0, hits = 0; @@ -84,10 +91,12 @@ ++hits; if (query != query_aln) { print(); - fetch_map(map_in, query); + unmapped_query = 0; query_aln = query; + while (!fetch_map(map_in, query)) { + print(); + } ++queries; - unmapped_query = 0; if (queries % 1000 == 0) message_stream << queries << ' ' << hits << ' ' << unmapped << endl; } @@ -106,5 +115,6 @@ in.close(); map_in.close(); message_stream << "Queries = " << queries << endl; + message_stream << "Unmapped = " << total_unmapped << endl; } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/tools/tools.cpp diamond-aligner-2.0.7/src/tools/tools.cpp --- diamond-aligner-2.0.6/src/tools/tools.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/tools/tools.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -36,7 +36,7 @@ } void split() { - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string id; vector seq; size_t n = 0, f = 0, b = (size_t)(config.chunk_size * 1e9); diff -Nru diamond-aligner-2.0.6/src/util/algo/hash_join.h diamond-aligner-2.0.7/src/util/algo/hash_join.h --- diamond-aligner-2.0.6/src/util/algo/hash_join.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/algo/hash_join.h 2021-02-12 11:50:56.000000000 +0000 @@ -207,11 +207,16 @@ template std::pair, DoubleArray> hash_join(Relation<_t> R, Relation<_t> S, unsigned total_bits = 32) { + const bool swap = config.hash_join_swap && R.n > S.n; + if (swap) + std::swap(R, S); _t *buf_r = (_t*)malloc(sizeof(_t) * R.n), *buf_s = (_t*)malloc(sizeof(_t) * S.n); DoubleArray out_r((void*)R.data), out_s((void*)S.data); hash_join(R, S, buf_r, buf_s, out_r, out_s, total_bits); free(buf_r); free(buf_s); + if (swap) + std::swap(out_r, out_s); return { out_r, out_s }; } diff -Nru diamond-aligner-2.0.6/src/util/algo/upgma.cpp diamond-aligner-2.0.7/src/util/algo/upgma.cpp --- diamond-aligner-2.0.6/src/util/algo/upgma.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/algo/upgma.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -172,7 +172,7 @@ } void upgma() { - TextInputFile in(config.query_file); + TextInputFile in(config.single_query_file()); string query, target; double evalue; EdgeList edges; diff -Nru diamond-aligner-2.0.6/src/util/algo/upgma_mc.cpp diamond-aligner-2.0.7/src/util/algo/upgma_mc.cpp --- diamond-aligner-2.0.6/src/util/algo/upgma_mc.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/algo/upgma_mc.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -250,7 +250,7 @@ void upgma() { const double max_dist = (dist_type() == DistType::BITSCORE) ? 0.0 : 10.0; message_stream << "Reading edges..." << endl; - EdgeVec all_edges(config.query_file.c_str()); + EdgeVec all_edges(config.single_query_file().c_str()); message_stream << "Read " << all_edges.nodes() << " nodes, " << all_edges.size() << " edges." << endl; EdgeList edges; diff -Nru diamond-aligner-2.0.6/src/util/data_structures/deque.h diamond-aligner-2.0.7/src/util/data_structures/deque.h --- diamond-aligner-2.0.6/src/util/data_structures/deque.h 1970-01-01 00:00:00.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/data_structures/deque.h 2021-02-12 11:50:56.000000000 +0000 @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "../../basic/config.h" + +template +struct Deque { + + typedef std::vector<_t> Bucket; + + Deque(): + max_size_(config.deque_bucket_size / sizeof(_t)) + { + buckets.emplace_back(); + buckets.back().reserve(max_size_); + } + + void push_back(const _t* ptr, size_t n) { + if (buckets.back().size() + n > max_size_) { + buckets.emplace_back(); + buckets.back().reserve(max_size_); + } + buckets.back().insert(buckets.back().end(), ptr, ptr + n); + } + + size_t size() const { + size_t n = 0; + for (const Bucket& b : buckets) + n += b.size(); + return n; + } + + void move(std::vector<_t>& dst) { + if (buckets.size() == 1 && dst.empty()) + dst = std::move(buckets.front()); + else { + for (const Bucket& b : buckets) + dst.insert(dst.end(), b.begin(), b.end()); + } + buckets.clear(); + } + +private: + + std::list buckets; + const size_t max_size_; + +}; \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/hash_table.h diamond-aligner-2.0.7/src/util/hash_table.h --- diamond-aligner-2.0.6/src/util/hash_table.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/hash_table.h 2021-02-12 11:50:56.000000000 +0000 @@ -16,15 +16,14 @@ along with this program. If not, see . ****/ -#ifndef HASH_TABLE_H_ -#define HASH_TABLE_H_ - +#pragma once #include #include #include #include #include #include "hash_function.h" +#include "simd.h" struct hash_table_overflow_exception : public std::exception { @@ -168,13 +167,50 @@ table(new fp[size]), size_(size) { - memset(table.get(), 0, size_ * sizeof(fp)); + memset(table, 0, size_ * sizeof(fp)); + } + + PHash_set(fp* data, size_t size) : + table(data), + size_(size) + {} + + ~PHash_set() { + delete[] table; } bool contains(uint64_t key) const { +#ifdef USE_AVX + const uint64_t hash = _hash()(key); + fp* p = table.get() + modulo<_mod>(hash >> (sizeof(fp) * 8), size_); + __m256i r = _mm256_loadu_si256((const __m256i*)p); + __m256i z = _mm256_setzero_si256(); + const int zm = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, z)); + if (zm == 0) + return true; + + const fp f = finger_print(hash); + __m256i fr = _mm256_set1_epi8(f); + const int fm = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, fr)); + return fm != 0; +#elif defined(__SSE2__) + const uint64_t hash = _hash()(key); + fp* p = table + modulo<_mod>(hash >> (sizeof(fp) * 8), size_); + __m128i r = _mm_loadu_si128((const __m128i*)p); + __m128i z = _mm_setzero_si128(); + const int zm = _mm_movemask_epi8(_mm_cmpeq_epi8(r, z)); + if (zm == 0) + return true; + + const fp f = finger_print(hash); + __m128i fr = _mm_set1_epi8(f); + const int fm = _mm_movemask_epi8(_mm_cmpeq_epi8(r, fr)); + return fm != 0; +#else fp *p; return get_entry(key, p); +#endif } void insert(uint64_t key) @@ -190,15 +226,22 @@ return size_; } - double load() const + size_t load() const { size_t n = 0; for (size_t i = 0; i < size_; ++i) - if (*(table.get() + i) != 0) + if (*(table + i) != 0) ++n; - return (double)n / size_; + return n; + } + + const fp* data() const { + return table; } + fp* table; + size_t size_; + private: static fp finger_print(uint64_t hash) @@ -209,8 +252,9 @@ bool get_entry(uint64_t key, fp*& p) const { - const uint64_t hash = _hash()(key), f = finger_print(hash); - p = table.get() + modulo<_mod>(hash >> sizeof(fp) * 8, size_); + const uint64_t hash = _hash()(key); + const fp f = finger_print(hash); + p = table + modulo<_mod>(hash >> (sizeof(fp) * 8), size_); bool wrapped = false; while (true) { if (*p == f) @@ -218,18 +262,13 @@ if (*p == (fp)0) return false; ++p; - if (p == table.get() + size_) { + if (p == table + size_) { if (wrapped) throw hash_table_overflow_exception(); - p = table.get(); + p = table; wrapped = true; } } } - std::unique_ptr table; - size_t size_; - }; - -#endif /* HASH_TABLE_H_ */ diff -Nru diamond-aligner-2.0.6/src/util/log_stream.h diamond-aligner-2.0.7/src/util/log_stream.h --- diamond-aligner-2.0.6/src/util/log_stream.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/log_stream.h 2021-02-12 11:50:56.000000000 +0000 @@ -129,4 +129,4 @@ unsigned level_; const char *msg_; std::chrono::high_resolution_clock::time_point t; -}; \ No newline at end of file +}; diff -Nru diamond-aligner-2.0.6/src/util/profiler.h diamond-aligner-2.0.7/src/util/profiler.h --- diamond-aligner-2.0.6/src/util/profiler.h 1970-01-01 00:00:00.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/profiler.h 2021-02-12 11:50:56.000000000 +0000 @@ -0,0 +1,31 @@ +#pragma once +#include +#include "log_stream.h" + +struct Profiler { + + Profiler(const char* key) : + timer(), + key(key) + {} + + ~Profiler() { + finish(); + } + + void finish() { + if (!key) return; + times[key] += timer.nanoseconds(); + key = nullptr; + } + + static void print(size_t n) { + for (const auto& i : times) + message_stream << i.first << ": " << (double)i.second/n/1e3 << " micros" << std::endl; + } + + task_timer timer; + const char* key; + static std::map times; + +}; \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/ptr_vector.h diamond-aligner-2.0.7/src/util/ptr_vector.h --- diamond-aligner-2.0.6/src/util/ptr_vector.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/ptr_vector.h 2021-02-12 11:50:56.000000000 +0000 @@ -16,53 +16,51 @@ along with this program. If not, see . ****/ -#ifndef PTR_VECTOR_H_ -#define PTR_VECTOR_H_ - +#pragma once #include -using std::vector; - template -struct PtrVector : public vector<_t*> +struct PtrVector : public std::vector<_t*> { + using vector = std::vector<_t*>; + PtrVector(): - vector<_t*>() + vector() {} - PtrVector(typename vector<_t*>::size_type n): - vector<_t*>(n) + PtrVector(typename vector::size_type n): + vector(n) {} - _t& operator[](typename vector<_t*>::size_type n) - { return *vector<_t*>::operator[](n); } + _t& operator[](typename vector::size_type n) + { return *vector::operator[](n); } - const _t& operator[](typename vector<_t*>::size_type n) const - { return *vector<_t*>::operator[](n); } + const _t& operator[](typename vector::size_type n) const + { return *vector::operator[](n); } - _t*& get(typename vector<_t*>::size_type n) + _t*& get(typename vector::size_type n) { - return vector<_t*>::operator[](n); + return vector::operator[](n); } _t& back() { - return *vector<_t*>::back(); + return *vector::back(); } - typename vector<_t*>::iterator erase(typename vector<_t*>::iterator first, typename vector<_t*>::iterator last) + typename vector::iterator erase(typename vector::iterator first, typename vector::iterator last) { - for (typename vector<_t*>::iterator i = first; i < last; ++i) + for (typename vector::iterator i = first; i < last; ++i) delete *i; - return vector<_t*>::erase(first, last); + return vector::erase(first, last); } void clear() { - for (typename vector<_t*>::iterator i = this->begin(); i != this->end(); ++i) + for (typename vector::iterator i = this->begin(); i != this->end(); ++i) delete *i; - vector<_t*>::clear(); + vector::clear(); } ~PtrVector() @@ -72,4 +70,3 @@ }; -#endif /* PTR_VECTOR_H_ */ diff -Nru diamond-aligner-2.0.6/src/util/scores/cutoff_table.h diamond-aligner-2.0.7/src/util/scores/cutoff_table.h --- diamond-aligner-2.0.6/src/util/scores/cutoff_table.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/scores/cutoff_table.h 2021-02-12 11:50:56.000000000 +0000 @@ -27,7 +27,7 @@ struct CutoffTable { CutoffTable(double evalue) { - for (int b = 1; b <= MAX_BITS; ++b) { + for (int b = 1; b <= MAX_BITS; ++b) { data_[b] = score_matrix.rawscore(score_matrix.bitscore_norm(evalue, 1 << (b - 1))); } } @@ -45,4 +45,33 @@ }; +struct CutoffTable2D { + + CutoffTable2D(double evalue) { + for (int b1 = 1; b1 <= MAX_BITS; ++b1) + for (int b2 = 1; b2 <= MAX_BITS; ++b2) { + data_[b1][b2] = calc_min_score(1 << (b1 - 1), 1 << (b2 - 1), evalue); + } + } + + int operator()(int query_len, int target_len) const { + const int b1 = 32 - clz((uint32_t)query_len), b2 = 32 - clz((uint32_t)target_len); + return data_[b1][b2]; + } + +private: + + int calc_min_score(unsigned qlen, unsigned slen, double evalue) { + for (int i = 10; i < 1000; ++i) + if (score_matrix.evalue_norm(i, qlen, slen) <= evalue) + return i; + return 1000; + } + + enum { MAX_BITS = 31 }; + + int data_[MAX_BITS + 1][MAX_BITS + 1]; + +}; + }} \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/string/string.h diamond-aligner-2.0.7/src/util/string/string.h --- diamond-aligner-2.0.6/src/util/string/string.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/string/string.h 2021-02-12 11:50:56.000000000 +0000 @@ -49,6 +49,8 @@ // Workaround since sprintf is inconsistent in double rounding for different implementations. inline int format_double(double x, char *p) { + if (x >= 100.0) + return sprintf(p, "%lli", (long long)std::floor(x)); // for keeping output compatible with BLAST long long i = std::llround(x*10.0); return sprintf(p, "%lli.%lli", i / 10, i % 10); } diff -Nru diamond-aligner-2.0.6/src/util/system/system.cpp diamond-aligner-2.0.7/src/util/system/system.cpp --- diamond-aligner-2.0.6/src/util/system/system.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/system/system.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -1,3 +1,5 @@ +#include +#include #include #include #include @@ -10,6 +12,9 @@ #else #include #include + #include + #include + #include #ifndef __APPLE__ #ifdef __FreeBSD__ #include @@ -150,4 +155,41 @@ return 0.0; return (double)info.totalram / 1e9; #endif +} + +#define handle_error(msg) \ + do { perror(msg); exit(EXIT_FAILURE); } while (0) + +std::tuple mmap_file(const char* filename) { +#ifdef WIN32 + return { nullptr, 0, -1 }; +#else + void* addr; + int fd; + struct stat sb; + size_t length; + + fd = open(filename, O_RDONLY); + if (fd == -1) + return std::tuple(nullptr, 0, -1); + + if (fstat(fd, &sb) == -1) /* To obtain file size */ + handle_error("fstat"); + + length = sb.st_size; + + addr = mmap(NULL, length, PROT_READ, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + handle_error("mmap"); + return std::tuple((char*)addr, length, fd); +#endif +} + +void unmap_file(char* ptr, size_t size, int fd) { +#ifdef WIN32 +#else + munmap((void*)ptr, size); + close(fd); +#endif } \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/system/system.h diamond-aligner-2.0.7/src/util/system/system.h --- diamond-aligner-2.0.6/src/util/system/system.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/system/system.h 2021-02-12 11:50:56.000000000 +0000 @@ -1,8 +1,7 @@ -#ifndef UTIL_SYSTEM_SYSTEM_H_ -#define UTIL_SYSTEM_SYSTEM_H_ - +#pragma once #include #include +#include enum class Color { RED, GREEN, YELLOW }; @@ -17,6 +16,8 @@ void log_rss(); size_t file_size(const char* name); double total_ram(); +std::tuple mmap_file(const char* filename); +void unmap_file(char* ptr, size_t size, int fd); #ifdef _MSC_VER #define POPEN _popen @@ -24,6 +25,4 @@ #else #define POPEN popen #define PCLOSE pclose -#endif - #endif \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/system.h diamond-aligner-2.0.7/src/util/system.h --- diamond-aligner-2.0.6/src/util/system.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/system.h 2021-02-12 11:50:56.000000000 +0000 @@ -43,12 +43,4 @@ #else #define POSIX_OPEN(x,y,z) open(x,y,z) #define POSIX_OPEN2(x,y) open(x,y) -#endif - -#if defined(__s390x__) && defined(__clang__) -#define TLS_FIX_S390X -#define TLS_FIX_S390X_MOVE(x) x -#else -#define TLS_FIX_S390X thread_local -#define TLS_FIX_S390X_MOVE(x) std::move(x) #endif \ No newline at end of file diff -Nru diamond-aligner-2.0.6/src/util/text_buffer.h diamond-aligner-2.0.7/src/util/text_buffer.h --- diamond-aligner-2.0.6/src/util/text_buffer.h 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/text_buffer.h 2021-02-12 11:50:56.000000000 +0000 @@ -209,7 +209,10 @@ TextBuffer& print_e(double x) { reserve(32); - ptr_ += sprintf(ptr_, "%.1e", x); + if (x == 0.0) + ptr_ += sprintf(ptr_, "0.0"); + else + ptr_ += sprintf(ptr_, "%.2e", x); return *this; } diff -Nru diamond-aligner-2.0.6/src/util/util.cpp diamond-aligner-2.0.7/src/util/util.cpp --- diamond-aligner-2.0.6/src/util/util.cpp 2020-12-16 14:15:21.000000000 +0000 +++ diamond-aligner-2.0.7/src/util/util.cpp 2021-02-12 11:50:56.000000000 +0000 @@ -24,6 +24,7 @@ #include "log_stream.h" #include "util.h" #include "escape_sequences.h" +#include "profiler.h" using namespace std; @@ -31,6 +32,7 @@ Message_stream verbose_stream (false); Message_stream log_stream (false); std::mutex Message_stream::mtx; +std::map Profiler::times; #ifndef _MSC_VER const char dir_separator = '/';