diff -Nru hhsuite-3.0~beta2+dfsg/CMakeLists.txt hhsuite-3.0~beta3+dfsg/CMakeLists.txt --- hhsuite-3.0~beta2+dfsg/CMakeLists.txt 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/CMakeLists.txt 2017-07-14 10:34:03.000000000 +0000 @@ -19,6 +19,25 @@ set(CMAKE_COMPILER_IS_ICC 1) endif() + +message("-- Processor is ${CMAKE_SYSTEM_PROCESSOR}") + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(X86_64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") + set(X86 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(ARM 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") + set(AARCH64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*") + set(PPC64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "PPC*|ppc*|powerpc*") + set(PPC 1) +endif() + + + set (HHSUITE_VERSION_MAJOR 3) set (HHSUITE_VERSION_MINOR 0) set (HHSUITE_VERSION_PATCH 0) @@ -39,12 +58,12 @@ include_directories(src) include_directories(lib/ffindex/src) +add_subdirectory(lib/ffindex) add_subdirectory(src) add_subdirectory(data) add_subdirectory(scripts) add_subdirectory(test) #add_subdirectory(doc) -add_subdirectory(lib/ffindex) SET (CPACK_GENERATOR "DEB;TGZ;ZIP;RPM") diff -Nru hhsuite-3.0~beta2+dfsg/debian/changelog hhsuite-3.0~beta3+dfsg/debian/changelog --- hhsuite-3.0~beta2+dfsg/debian/changelog 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/changelog 2018-08-21 19:09:09.000000000 +0000 @@ -1,3 +1,22 @@ +hhsuite (3.0~beta3+dfsg-1) unstable; urgency=medium + + [ Steffen Moeller ] + * New upstream version. + * debian/upstream/metadata: + - yamllint cleanliness + - Added references to registries + * d/control + - compliance with policy 4.2 + - adapted Vcs URLs to salsa.d.o + * Caveat: Does not compile with openmpi libraries discovered. + See README.source. + + [ Andreas Tille ] + * Fix Vcs-Browser + * Do not parse d/changelog to obtain version + + -- Andreas Tille Tue, 21 Aug 2018 21:09:09 +0200 + hhsuite (3.0~beta2+dfsg-3) unstable; urgency=medium * Specify correct format in printf statement (Thanks for the patch to diff -Nru hhsuite-3.0~beta2+dfsg/debian/control hhsuite-3.0~beta3+dfsg/debian/control --- hhsuite-3.0~beta2+dfsg/debian/control 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/control 2018-08-21 19:09:09.000000000 +0000 @@ -6,15 +6,15 @@ Section: science Priority: optional Build-Depends: debhelper (>= 10), - libffindex0-dev (>= 0.9.9.6-2), + libffindex0-dev (>= 0.9.9.7+soedinglab), cmake, texlive-latex-base, texlive-latex-recommended, imagemagick, help2man -Standards-Version: 3.9.8 -Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/hhsuite.git -Vcs-Git: https://anonscm.debian.org/git/debian-med/hhsuite.git +Standards-Version: 4.2.0 +Vcs-Browser: https://salsa.debian.org/med-team/hhsuite +Vcs-Git: https://salsa.debian.org/med-team/hhsuite.git Homepage: http://toolkit.genzentrum.lmu.de/sections/search Package: hhsuite diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/build-own-manual-PDF.patch hhsuite-3.0~beta3+dfsg/debian/patches/build-own-manual-PDF.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/build-own-manual-PDF.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/build-own-manual-PDF.patch 2018-08-21 19:09:09.000000000 +0000 @@ -1,9 +1,11 @@ Description: build own manual PDF The LaTeX source is now included, so we can build the manual directly. 
Author: Sascha Steinbiss ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -29,7 +29,6 @@ +Index: hhsuite/CMakeLists.txt +=================================================================== +--- hhsuite.orig/CMakeLists.txt ++++ hhsuite/CMakeLists.txt +@@ -48,7 +48,6 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMA set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) INSTALL(FILES @@ -11,12 +13,12 @@ LICENSE README PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ -@@ -43,7 +42,7 @@ +@@ -63,7 +62,7 @@ add_subdirectory(src) add_subdirectory(data) add_subdirectory(scripts) add_subdirectory(test) -#add_subdirectory(doc) +add_subdirectory(doc) - #add_subdirectory(lib/ffindex) + SET (CPACK_GENERATOR "DEB;TGZ;ZIP;RPM") diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/disable-AVX-SSE.patch hhsuite-3.0~beta3+dfsg/debian/patches/disable-AVX-SSE.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/disable-AVX-SSE.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/disable-AVX-SSE.patch 2018-08-21 19:09:09.000000000 +0000 @@ -1,9 +1,11 @@ Description: do not use AVX to build binaries Author: Sascha Steinbiss ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -9,22 +9,22 @@ - endif() +Index: hhsuite/src/CMakeLists.txt +=================================================================== +--- hhsuite.orig/src/CMakeLists.txt ++++ hhsuite/src/CMakeLists.txt +@@ -12,22 +12,22 @@ endif() + if(X86 OR X86_64) #SSE -if(${HAVE_AVX2}) @@ -32,10 +34,12 @@ + endif(${HAVE_SSSE3_EXTENSIONS}) endif() - ---- a/cmake/CheckSSEFeatures.cmake -+++ b/cmake/CheckSSEFeatures.cmake -@@ -209,23 +209,23 @@ + else() # +Index: hhsuite/cmake/CheckSSEFeatures.cmake +=================================================================== +--- hhsuite.orig/cmake/CheckSSEFeatures.cmake ++++ hhsuite/cmake/CheckSSEFeatures.cmake +@@ -209,23 +209,23 @@ macro(PCL_CHECK_FOR_SSE) set(CMAKE_REQUIRED_FLAGS) if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/disable-ffindex-build.patch hhsuite-3.0~beta3+dfsg/debian/patches/disable-ffindex-build.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/disable-ffindex-build.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/disable-ffindex-build.patch 2018-08-21 19:09:09.000000000 +0000 @@ -1,21 +1,18 @@ Description: do not try to build ffindex Author: Sascha Steinbiss ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -37,14 +37,14 @@ +Index: hhsuite/CMakeLists.txt +=================================================================== +--- hhsuite.orig/CMakeLists.txt ++++ hhsuite/CMakeLists.txt +@@ -56,9 +56,9 @@ INSTALL(FILES ) include_directories(src) -include_directories(lib/ffindex/src) +#include_directories(lib/ffindex/src) +-add_subdirectory(lib/ffindex) ++#add_subdirectory(lib/ffindex) add_subdirectory(src) add_subdirectory(data) add_subdirectory(scripts) - add_subdirectory(test) - #add_subdirectory(doc) --add_subdirectory(lib/ffindex) -+#add_subdirectory(lib/ffindex) - - - SET (CPACK_GENERATOR "DEB;TGZ;ZIP;RPM") diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/fix-namespace.patch hhsuite-3.0~beta3+dfsg/debian/patches/fix-namespace.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/fix-namespace.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/fix-namespace.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -Description: add stdlib.h to header - This fixes compilation with Debian's ffindex headers. 
-Author: Sascha Steinbiss ---- a/src/hhdatabase.h -+++ b/src/hhdatabase.h -@@ -15,6 +15,7 @@ - class FFindexDatabase; - - extern "C" { -+#include - #include - } - diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/fix_type_mismatch_in_printf.patch hhsuite-3.0~beta3+dfsg/debian/patches/fix_type_mismatch_in_printf.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/fix_type_mismatch_in_printf.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/fix_type_mismatch_in_printf.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -Author: Daniel Shahaf -Last-Update: Wed, Jan 18, 2017 at 15:22:26 +0000 -Bug-Debian: https://bugs.debian.org/851764 -Description: hhblits.cpp uses the %i printf(3) conversion to format a 'double' value - Thats undefined and prints garbage - the patch inserts the correct format - (and also fixes the doc about the correct default value - ---- a/doc/hhsuite-userguide.tex -+++ b/doc/hhsuite-userguide.tex -@@ -1466,7 +1466,7 @@ - -maxfilt max number of hits allowed to pass 2nd prefilter (default=20000) - -min_prefilter_hits min number of hits to pass prefilter (default=100) - -prepre_smax_thresh min score threshold of ungapped prefilter (default=10) -- -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=0) -+ -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=1000.0) - -pre_bitfactor prefilter scores are in units of 1 bit / pre_bitfactor (default=4) - -pre_gap_open gap open penalty in prefilter Smith-Waterman alignment (default=20) - -pre_gap_extend gap extend penalty in prefilter Smith-Waterman alignment (default=4) ---- a/src/hhblits.cpp -+++ b/src/hhblits.cpp -@@ -330,7 +330,7 @@ - printf(" -maxfilt max number of hits allowed to pass 2nd prefilter (default=%i) \n", par.maxnumdb); - printf(" -min_prefilter_hits min number of hits to pass prefilter (default=%i) \n", par.min_prefilter_hits); - printf(" -prepre_smax_thresh min score threshold of ungapped prefilter (default=%i) \n", par.preprefilter_smax_thresh); -- printf(" -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=%i)\n", par.prefilter_evalue_thresh); -+ printf(" -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=%.1f)\n", par.prefilter_evalue_thresh); - printf(" -pre_bitfactor prefilter scores are in units of 1 bit / pre_bitfactor (default=%i)\n", par.prefilter_bit_factor); - printf(" -pre_gap_open gap open penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_open); - printf(" -pre_gap_extend gap extend penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_extend); - diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/gcc-6.1.patch hhsuite-3.0~beta3+dfsg/debian/patches/gcc-6.1.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/gcc-6.1.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/gcc-6.1.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -Description: Use system provided log2 and log10 of gcc-6.1 -Bug-Debian: https://bugs.debian.org/831115 -Author: Andreas Tille -Last-Update: Sat, 13 Aug 2016 07:27:28 +0200 - ---- a/src/util.C -+++ b/src/util.C -@@ -55,8 +55,8 @@ inline int iround(double x) {return int( - inline double fmean(double x, double y, double d) { return pow( (pow(x,d)+pow(y,d))/2 ,1./d);} - - // log base 2 --inline float log2(float x) {return (x<=0? (float)(-100000):1.442695041*log(x));} --inline float log10(float x) {return (x<=0? 
(float)(-100000):0.434294481*log(x));} -+//inline float log2(float x) {return (x<=0? (float)(-100000):1.442695041*log(x));} -+//inline float log10(float x) {return (x<=0? (float)(-100000):0.434294481*log(x));} - - - ///////////////////////////////////////////////////////////////////////////////////// diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/gcc-6.1_use_inline.patch hhsuite-3.0~beta3+dfsg/debian/patches/gcc-6.1_use_inline.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/gcc-6.1_use_inline.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/gcc-6.1_use_inline.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -Description: Rename redefined log2 to enable build using gcc-6.1 - Note: May be its just better to use the system provided log2 - ---> While this patch works syntactically the system provided - log2 and log10 are really used since this patch is - deactivated but left for discussion as alternative to - gcc-6.1 -Bug-Debian: https://bugs.debian.org/831115 -Author: Andreas Tille -Last-Update: Sat, 13 Aug 2016 07:27:28 +0200 - ---- a/src/util.C -+++ b/src/util.C -@@ -55,7 +55,7 @@ inline int iround(double x) {return int( - inline double fmean(double x, double y, double d) { return pow( (pow(x,d)+pow(y,d))/2 ,1./d);} - - // log base 2 --inline float log2(float x) {return (x<=0? (float)(-100000):1.442695041*log(x));} -+inline float log2_i(float x) {return (x<=0? (float)(-100000):1.442695041*log(x));} - inline float log10(float x) {return (x<=0? (float)(-100000):0.434294481*log(x));} - - -@@ -63,29 +63,29 @@ inline float log10(float x) {return (x<= - // fast log base 2 - ///////////////////////////////////////////////////////////////////////////////////// - --// Fast log2 -+// Fast log2_i - // ATTENTION: need to compile with g++ -fno-strict-aliasing when using -O2 or -O3!!! - // Maximum deviation: +/- 2.1E-5 --// Run time: ~1.2E-8s on Intel core2 2.13GHz, log2(): 5.4E-8s -+// Run time: ~1.2E-8s on Intel core2 2.13GHz, log2_i(): 5.4E-8s - // For a negative argument, -128 is returned. - // The function makes use of the representation of 4-byte floating point numbers: - // seee eeee emmm mmmm mmmm mmmm mmmm mmmm - // s is the sign, eee eee e gives the exponent + 127 (in hex: 0x7f). - // The following 23 bits give the mantisse, the binary digits after the decimal - // point: x = (-1)^s * 1.mmmmmmmmmmmmmmmmmmmmmmm * 2^(eeeeeeee-127) --// Therefore, log2(x) = eeeeeeee-127 + log2(1.mmmmmm...) --// = eeeeeeee-127 + log2(1+y), where y = 0.mmmmmm... --// ~ eeeeeeee-127 + ((a*y+b)*y+c)*y -+// Therefore, log2_i(x) = eeeeeeee-127 + log2_(1.mmmmmm...) -+// = eeeeeeee-127 + log2_(1+y), where y = 0.mmmmmm... -+// ~ eeeeeeee-127 + ((a*y+b)*y+c)*y - // The coefficients a, b were determined by a least squares fit, and c=1-a-b to get 1 at y=1. - // Lower/higher order polynomials may be used for faster or more precise calculation: --// Order 1: log2(1+y) ~ y --// Order 2: log2(1+y) = (a*y + 1-a)*y, a=-0.3427 -+// Order 1: log2_i(1+y) ~ y -+// Order 2: log2_i(1+y) = (a*y + 1-a)*y, a=-0.3427 - // => max dev = +/- 8E-3, run time ~ ? --// Order 3: log2(1+y) = ((a*y+b)*y + 1-a-b)*y, a=0.1564, b=-0.5773 -+// Order 3: log2_i(1+y) = ((a*y+b)*y + 1-a-b)*y, a=0.1564, b=-0.5773 - // => max dev = +/- 1E-3, run time ~ ? --// Order 4: log2(1+y) = (((a*y+b)*y+c)*y + 1-a-b-c)*y, a=-0.0803 b=0.3170 c=-0.6748 -+// Order 4: log2_i(1+y) = (((a*y+b)*y+c)*y + 1-a-b-c)*y, a=-0.0803 b=0.3170 c=-0.6748 - // => max dev = +/- 1.4E-4, run time ~ ? 
--// Order 5: log2(1+y) = ((((a*y+b)*y+c)*y+d)*y + 1-a-b-c-d)*y, -+// Order 5: log2_i(1+y) = ((((a*y+b)*y+c)*y+d)*y + 1-a-b-c-d)*y, - // a=0.0440047 b=-0.1903190 c=0.4123442 d=-0.7077702 1-a-b-c-d=1.441740 - // => max dev = +/- 2.1E-5, run time ~ 1.2E-8s - inline float flog2(float x) -@@ -155,7 +155,7 @@ __m128 _mm_flog2_ps(__m128 X) - R = _mm_add_ps(R, CONST32_D); // R = ((a*X+b)*X+c)*X+d - R = _mm_mul_ps(R, X); // R = (((a*X+b)*X+c)*X+d)*X - R = _mm_add_ps(R, CONST32_E); // R = (((a*X+b)*X+c)*X+d)*X+e -- R = _mm_mul_ps(R, X); // R = ((((a*X+b)*X+c)*X+d)*X+e)*X ~ log2(1+X) !! -+ R = _mm_mul_ps(R, X); // R = ((((a*X+b)*X+c)*X+d)*X+e)*X ~ log2_i(1+X) !! - R = _mm_add_ps(R, _mm_cvtepi32_ps(E)); // convert integer exponent to float and add to mantisse - return R; - } -@@ -172,7 +172,7 @@ __m128 _mm_flog2_ps(__m128 X) - // In the code, *(int *)&x is an integer which contains the bytes as the - // floating point variable x is represented in memory. The expression - // (((*(int *)&x) & 0x7f800000 ) >>23 )-0x7f is the exponent eeeeeeee, --// i.e., the largest integer that is smaller than log2(x) (e.g. -1 for 0.9). -+// i.e., the largest integer that is smaller than log2_i(x) (e.g. -1 for 0.9). - inline float fast_log2(float x) - { - static float lg2[1025]; // lg2[i] = log2[1+x/1024] diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/help hhsuite-3.0~beta3+dfsg/debian/patches/help --- hhsuite-3.0~beta2+dfsg/debian/patches/help 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/help 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -Description: fix paths in help pages - Fix issues where the path in help pages is taken from the build-time location of the executables. -Forwarded: not-needed -Author: Laszlo Kajan ---- a/src/hhblits.C -+++ b/src/hhblits.C -@@ -411,7 +411,12 @@ void help(char all=0) - printf("An extended list of options can be obtained by calling 'hhblits -help'\n"); - } - printf("\n"); -- printf("Example: %s -i query.fas -oa3m query.a3m -n 1 \n",program_name); -+ printf("Examples:\n"); -+ printf("%s -i query.fas -o query.hhr -d ./uniprot20\n",program_name); -+ printf("\n"); -+ printf("%s -i query.fas -o query.hhr -oa3m query.a3m -n 1 -d ./uniprot20\n",program_name); -+ printf("\n"); -+ printf("Download databases from .\n"); - cout< // FLT_MIN - #include // islower, isdigit etc - #include // clock_gettime etc. (in realtime library (-lrt compiler option)) -+#include - #include // perror() - #include - #include diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/Makefile hhsuite-3.0~beta3+dfsg/debian/patches/Makefile --- hhsuite-3.0~beta2+dfsg/debian/patches/Makefile 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ -Description: Patch to top level Makefile - * Make Makefile DESTDIR-aware. - * Fix install mode of data files. -Forwarded: upstream is aware of these patches - Upstream is Andreas Hauser . -Author: Laszlo Kajan ---- a/Makefile -+++ b/Makefile -@@ -1,5 +1,5 @@ --# This can be overridden e.g.: make install INSTALL_DIR=... --INSTALL_DIR?=$(PWD) -+# This can be overridden e.g.: make install PREFIX=... -+PREFIX:=/usr - - # Guess wether to use lib or lib64 - #libdir=`([ -d /usr/lib64 ] && echo lib64) || echo lib` -@@ -8,19 +8,17 @@ libdir=lib - - # Overriding this is currently not fully supported as the code won't know - # to what this is set then. You can try setting HHLIB. 
--INSTALL_LIB_DIR?=$(INSTALL_DIR)/$(libdir)/hh --INSTALL_SCRIPTS_DIR?=$(INSTALL_LIB_DIR)/scripts --INSTALL_DATA_DIR?=$(INSTALL_LIB_DIR)/data -+INSTALL_LIB_DIR?=$(PREFIX)/$(libdir)/hhsuite -+INSTALL_SCRIPTS_DIR?=$(PREFIX)/share/hhsuite/scripts -+INSTALL_DATA_DIR?=$(PREFIX)/share/hhsuite/data -+INSTALL_BDATA_DIR?=$(INSTALL_LIB_DIR)/data - INSTALL_LIB_BIN_DIR?=$(INSTALL_LIB_DIR)/bin - - dist_name=hhsuite-2.0.16 - --.PHONY: all_static --all_static: ffindex_static -- $(MAKE) -C src all_static -- - .PHONY: all --all: ffindex -+all: -+ $(MAKE) -C data all - $(MAKE) -C src all - - doc: -@@ -40,35 +38,34 @@ ffindex_static: - $(MAKE) -C lib/ffindex FFINDEX_STATIC=1 - - install: -- $(MAKE) -C lib/ffindex install INSTALL_DIR=$(INSTALL_DIR) -- mkdir -p $(INSTALL_DIR)/bin -- install src/hhblits $(INSTALL_DIR)/bin/hhblits -- install src/hhalign $(INSTALL_DIR)/bin/hhalign -- install src/hhconsensus $(INSTALL_DIR)/bin/hhconsensus -- install src/hhfilter $(INSTALL_DIR)/bin/hhfilter -- install src/hhmake $(INSTALL_DIR)/bin/hhmake -- install src/hhsearch $(INSTALL_DIR)/bin/hhsearch -- mkdir -p $(INSTALL_LIB_DIR) -- mkdir -p $(INSTALL_LIB_BIN_DIR) -- install src/cstranslate $(INSTALL_LIB_BIN_DIR)/cstranslate -- mkdir -p $(INSTALL_DATA_DIR) -- install -m 0644 data/context_data.lib $(INSTALL_DATA_DIR)/context_data.lib -- install -m 0644 data/cs219.lib $(INSTALL_DATA_DIR)/cs219.lib -- install -m 0644 data/do_not_delete $(INSTALL_DATA_DIR)/do_not_delete -- install -m 0644 data/do_not_delete.phr $(INSTALL_DATA_DIR)/do_not_delete.phr -- install -m 0644 data/do_not_delete.pin $(INSTALL_DATA_DIR)/do_not_delete.pin -- install -m 0644 data/do_not_delete.psq $(INSTALL_DATA_DIR)/do_not_delete.psq -- mkdir -p $(INSTALL_SCRIPTS_DIR) -- install -m 0644 scripts/Align.pm $(INSTALL_SCRIPTS_DIR)/Align.pm -- install -m 0644 scripts/HHPaths.pm $(INSTALL_SCRIPTS_DIR)/HHPaths.pm -- install scripts/addss.pl $(INSTALL_SCRIPTS_DIR)/addss.pl -- install scripts/create_profile_from_hhm.pl $(INSTALL_SCRIPTS_DIR)/create_profile_from_hhm.pl -- install scripts/create_profile_from_hmmer.pl $(INSTALL_SCRIPTS_DIR)/create_profile_from_hmmer.pl -- install scripts/hhmakemodel.pl $(INSTALL_SCRIPTS_DIR)/hhmakemodel.pl -- install scripts/reformat.pl $(INSTALL_SCRIPTS_DIR)/reformat.pl -- install scripts/splitfasta.pl $(INSTALL_SCRIPTS_DIR)/splitfasta.pl -- install scripts/multithread.pl $(INSTALL_SCRIPTS_DIR)/multithread.pl -- install scripts/hhblitsdb.pl $(INSTALL_SCRIPTS_DIR)/hhblitsdb.pl -+ mkdir -p $(DESTDIR)$(PREFIX)/bin -+ install src/hhblits $(DESTDIR)$(PREFIX)/bin/hhblits -+ install src/hhalign $(DESTDIR)$(PREFIX)/bin/hhalign -+ install src/hhconsensus $(DESTDIR)$(PREFIX)/bin/hhconsensus -+ install src/hhfilter $(DESTDIR)$(PREFIX)/bin/hhfilter -+ install src/hhmake $(DESTDIR)$(PREFIX)/bin/hhmake -+ install src/hhsearch $(DESTDIR)$(PREFIX)/bin/hhsearch -+ mkdir -p $(DESTDIR)$(INSTALL_LIB_DIR) -+ mkdir -p $(DESTDIR)$(INSTALL_LIB_BIN_DIR) -+ install src/cstranslate $(DESTDIR)$(INSTALL_LIB_BIN_DIR)/cstranslate -+ mkdir -p $(DESTDIR)$(INSTALL_DATA_DIR) $(DESTDIR)$(INSTALL_BDATA_DIR) -+ install --mode=0644 data/context_data.lib $(DESTDIR)$(INSTALL_DATA_DIR)/context_data.lib -+ install --mode=0644 data/cs219.lib $(DESTDIR)$(INSTALL_DATA_DIR)/cs219.lib -+ install --mode=0644 data/do_not_delete $(DESTDIR)$(INSTALL_BDATA_DIR)/do_not_delete -+ install --mode=0644 data/do_not_delete.phr $(DESTDIR)$(INSTALL_BDATA_DIR)/do_not_delete.phr -+ install --mode=0644 data/do_not_delete.pin $(DESTDIR)$(INSTALL_BDATA_DIR)/do_not_delete.pin -+ install --mode=0644 
data/do_not_delete.psq $(DESTDIR)$(INSTALL_BDATA_DIR)/do_not_delete.psq -+ mkdir -p $(DESTDIR)$(INSTALL_SCRIPTS_DIR) -+ install --mode=0644 scripts/Align.pm $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/Align.pm -+ install --mode=0644 scripts/HHPaths.pm $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/HHPaths.pm -+ install scripts/addss.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/addss.pl -+ install scripts/create_profile_from_hhm.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/create_profile_from_hhm.pl -+ install scripts/create_profile_from_hmmer.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/create_profile_from_hmmer.pl -+ install scripts/hhmakemodel.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/hhmakemodel.pl -+ install scripts/reformat.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/reformat.pl -+ install scripts/splitfasta.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/splitfasta.pl -+ install scripts/multithread.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/multithread.pl -+ install scripts/hhblitsdb.pl $(DESTDIR)$(INSTALL_SCRIPTS_DIR)/hhblitsdb.pl - - deinstall: - $(MAKE) -C lib/ffindex deinstall INSTALL_DIR=$(INSTALL_DIR) -@@ -89,9 +86,11 @@ deinstall: - - .PHONY: clean - clean: -- cd lib/ffindex && $(MAKE) clean - $(MAKE) -C src clean - -+.PHONY: distclean -+distclean: clean -+ - dist/$(dist_name).tar.gz: - make clean - mkdir -p dist diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/paths hhsuite-3.0~beta3+dfsg/debian/patches/paths --- hhsuite-3.0~beta2+dfsg/debian/patches/paths 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/paths 1970-01-01 00:00:00.000000000 +0000 @@ -1,52 +0,0 @@ -Description: Set hhsuite paths. - Set paths to hhsuite components - as installed on Debian - correctly in absence of the (now superflouos) HHLIB environment variable. -Forwarded: not-needed -Author: Laszlo Kajan ---- a/scripts/HHPaths.pm -+++ b/scripts/HHPaths.pm -@@ -31,6 +31,7 @@ use Exporter; - our $VERSION = "version 2.0.16 (January 2013)"; - our @ISA = qw(Exporter); - our @EXPORT = qw($VERSION $hhlib $hhdata $hhbin $hhscripts $execdir $datadir $ncbidir $dummydb $pdbdir $dsspdir $dssp $cs_lib $context_lib $v); -+push @EXPORT, qw($hhshare $hhbdata); - - ############################################################################################## - # PLEASE COMPLETE THE PATHS ... TO PSIPRED AND OLD-STYLE BLAST (NOT BLAST+) (NEEDED FOR PSIPRED) -@@ -54,11 +55,13 @@ our $dssp = "/cluster/databases/dssp - # The lines below probably do not need to be changed - - # Setting paths for hh-suite perl scripts --our $hhlib = $ENV{"HHLIB"}; # main hh-suite directory --our $hhdata = $hhlib."/data"; # path to data directory for hhblits, example files -+our $hhlib = $ENV{"HHLIB"} || "/usr/lib/hhsuite"; # main hh-suite directory -+our $hhshare = $ENV{"HHLIB"} || "/usr/share/hhsuite"; # main hh-suite directory -+our $hhdata = $hhshare."/data"; # path to arch indep data directory for hhblits, example files -+our $hhbdata = $hhlib."/data"; # path to arch dep data directory for hhblits, example files - our $hhbin = $hhlib."/bin"; # path to cstranslate (path to hhsearch, hhblits etc. should be in environment variable PATH) --our $hhscripts= $hhlib."/scripts"; # path to hh perl scripts (addss.pl, reformat.pl, hhblitsdb.pl etc.) --our $dummydb = $hhdata."/do_not_delete"; # Name of dummy blast db for PSIPRED (single sequence formatted with NCBI formatdb) -+our $hhscripts= $hhshare."/scripts"; # path to hh perl scripts (addss.pl, reformat.pl, hhblitsdb.pl etc.) 
-+our $dummydb = $hhbdata."/do_not_delete"; # Name of dummy blast db for PSIPRED (single sequence formatted with NCBI formatdb) - - # HHblits data files - our $cs_lib = "$hhdata/cs219.lib"; ---- a/src/hhdecl.C -+++ b/src/hhdecl.C -@@ -285,7 +285,7 @@ void Parameters::SetDefaultPaths(char *p - if(getenv("HHLIB")) - strcpy(hhlib, getenv("HHLIB")); - else -- strcpy(hhlib, "/usr/lib/hh"); -+ strcpy(hhlib, "/usr/lib/hhsuite"); - - strcat(strcpy(hhdata, hhlib), "/data"); - strcat(strcpy(clusterfile, hhdata), "/context_data.lib"); -@@ -300,7 +300,7 @@ void Parameters::SetDefaultPaths(char *p - /* we did not find HHLIB, if called with full path or in dist dir, we can try relative to program path */ - if(program_path != NULL) - { -- strcat(strcpy(hhlib, program_path), "../lib/hh"); -+ strcat(strcpy(hhlib, program_path), "../lib/hhsuite"); - strcat(strcpy(hhdata, hhlib), "/data"); - strcat(strcpy(clusterfile, hhdata), "/context_data.lib"); - strcat(strcpy(cs_library, hhdata), "/cs219.lib"); diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/perl_interpreter hhsuite-3.0~beta3+dfsg/debian/patches/perl_interpreter --- hhsuite-3.0~beta2+dfsg/debian/patches/perl_interpreter 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/perl_interpreter 2018-08-21 19:09:09.000000000 +0000 @@ -1,83 +1,13 @@ Description: patch Perl interpreter as required by Polich 5.1 Perl Script Magic - http://www.debian.org/doc/packaging-manuals/perl-policy/ch-programs.html#s-hash_bang Author: Laszlo Kajan ---- a/scripts/addss.pl -+++ b/scripts/addss.pl -@@ -1,4 +1,4 @@ --#!/usr/bin/env perl -+#!/usr/bin/perl - # - # addss.pl - # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file. ---- a/scripts/create_profile_from_hhm.pl -+++ b/scripts/create_profile_from_hhm.pl +Index: hhsuite/scripts/create_profile_from_hhm.pl +=================================================================== +--- hhsuite.orig/scripts/create_profile_from_hhm.pl ++++ hhsuite/scripts/create_profile_from_hhm.pl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl # # create_profile_from_hhm.pl # Create a profile (.prf) from a given HHM file ---- a/scripts/create_profile_from_hmmer.pl -+++ b/scripts/create_profile_from_hmmer.pl -@@ -1,4 +1,4 @@ --#!/usr/bin/env perl -+#!/usr/bin/perl - # - # create_profile_from_hmmer.pl - # Create a profile (.prf) from a given HMMER/HMMER3 file ---- a/scripts/hhmakemodel.pl -+++ b/scripts/hhmakemodel.pl -@@ -1,4 +1,4 @@ --#! /usr/bin/env perl -+#!/usr/bin/perl - # - # hhmakemodel.pl - # Generate a model from an output alignment of hhsearch. ---- a/scripts/multithread.pl -+++ b/scripts/multithread.pl -@@ -1,4 +1,4 @@ --#!/usr/bin/env perl -+#!/usr/bin/perl - # - # multithread.pl: - # Run a command with different file names as arguments on multiple threads in parallel ---- a/scripts/pdb2fasta.pl -+++ b/scripts/pdb2fasta.pl -@@ -1,4 +1,4 @@ --#! /usr/bin/env perl -+#!/usr/bin/perl - # - # pdb2fasta.pl - generate FASTA nonredundant sequence file from SEQRES records of globbed pdb files. - ---- a/scripts/pdbfilter.pl -+++ b/scripts/pdbfilter.pl -@@ -1,4 +1,4 @@ --#! /usr/bin/env perl -+#!/usr/bin/perl - # pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile - - # ---- a/scripts/reformat.pl -+++ b/scripts/reformat.pl -@@ -1,4 +1,4 @@ --#! 
/usr/bin/env perl -+#!/usr/bin/perl - - # reformat.pl - # Reformat a multiple alignment file ---- a/scripts/renumberpdb.pl -+++ b/scripts/renumberpdb.pl -@@ -1,4 +1,4 @@ --#!/usr/bin/env perl -+#!/usr/bin/perl - # renumberpdb.pl - generate a PDB file with renumbered indices that match residue indices in input sequence - # Usage: renumberpdb.pl [options] infile [outfile] - # Example: renumberpdb.pl d1hz4a_.a3m d1hz4a_.pdb ---- a/scripts/splitfasta.pl -+++ b/scripts/splitfasta.pl -@@ -1,4 +1,4 @@ --#! /usr/bin/env perl -+#!/usr/bin/perl - # splitfasta.pl - # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files - # diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/rest1 hhsuite-3.0~beta3+dfsg/debian/patches/rest1 --- hhsuite-3.0~beta2+dfsg/debian/patches/rest1 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/rest1 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ -Description: Assorted patches - * Set paths to hhsuite components for Perl scripts. - * Add a Makefile to generate data/do_not_delete.phr,do_not_delete.pin,do_not_delete.psq . -Author: Laszlo Kajan ---- a/src/Makefile -+++ b/src/Makefile -@@ -14,13 +14,13 @@ endif - - ifdef ICC - CXX = env LANG=C icc -- CXXFLAGS = -fast -Wall -fno-strict-aliasing -finline-functions -funroll-loops -fp-model fast=2 -fast-transcendentals -wd981 -wd383 -wd2259 -wd1572 -D HH_SSE3 -I$(CS_DIR) -I$(FFINDEX_DIR) -+ CXXFLAGS += -fast -Wall -fno-strict-aliasing -finline-functions -funroll-loops -fp-model fast=2 -fast-transcendentals -wd981 -wd383 -wd2259 -wd1572 -D HH_SSE3 -I$(CS_DIR) - HHSEARCH_LIBS = -lpthread -lrt - HHBLITS_LIBS = -lpthread -lrt -openmp - OBJECT_LIBS = -openmp - else # ifndef ICC - CXX = g++ -- CXXFLAGS = -O3 -Wall -Wno-deprecated -Wno-char-subscripts -fno-strict-aliasing -I$(CS_DIR) $(BITS_FLAG) -I$(FFINDEX_DIR) -fopenmp -+ CXXFLAGS += -O3 -Wall -Wno-deprecated -Wno-char-subscripts -fno-strict-aliasing -I$(CS_DIR) $(BITS_FLAG) -fopenmp - HHSEARCH_LIBS = -lpthread - HHBLITS_LIBS = -lpthread -fopenmp - HHALIGN_LIBS = -@@ -51,7 +51,7 @@ else # ifndef ICC - endif - endif # ifndef ICC - --CPP = $(CXX) $(CXXFLAGS) -+CPP = $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) - CPP_VALGRIND = $(CXX) -g $(CXXFLAGS) - - -@@ -92,7 +92,7 @@ hhsearch_valgrind: hhsearch.C $(SOURCES) - $(CPP_VALGRIND) hhsearch.C -o hhsearch_valgrind $(CS_OBJECTS) $(HHSEARCH_LIBS) - - hhblits: hhblits.C $(SOURCES) $(HEADERS) $(CS_OBJECTS) -- $(CPP) hhblits.C -o hhblits $(CS_OBJECTS) $(HHBLITS_LIBS) $(FFINDEX_DIR)/libffindex.a -+ $(CPP) hhblits.C -o hhblits $(CS_OBJECTS) $(HHBLITS_LIBS) -lffindex - - hhblits_static: hhblits.C $(SOURCES) $(HEADERS)$(CS_OBJECTS) - $(CPP) -static hhblits.C -o hhblits $(CS_OBJECTS) $(HHBLITS_LIBS) $(FFINDEX_DIR)/libffindex.a diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/series hhsuite-3.0~beta3+dfsg/debian/patches/series --- hhsuite-3.0~beta2+dfsg/debian/patches/series 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/series 2018-08-21 19:09:09.000000000 +0000 @@ -1,11 +1,8 @@ perlscript_uselib perl_interpreter add_Makefile_in_data.patch -spelling.patch disable-ffindex-build.patch -fix-namespace.patch drop-fmemopen.h.patch build-own-manual-PDF.patch set-default-HHLIB-dir.patch disable-AVX-SSE.patch -fix_type_mismatch_in_printf.patch diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/set-default-HHLIB-dir.patch hhsuite-3.0~beta3+dfsg/debian/patches/set-default-HHLIB-dir.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/set-default-HHLIB-dir.patch 2017-01-18 
15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/set-default-HHLIB-dir.patch 2018-08-21 19:09:09.000000000 +0000 @@ -2,13 +2,15 @@ hhsuite needs to access some static data in /usr/share. This patch sets the correct search path if it is not given by the user in HHLIB environment var. Author: Sascha Steinbiss ---- a/src/hhdecl.cpp -+++ b/src/hhdecl.cpp -@@ -13,7 +13,7 @@ +Index: hhsuite/src/hhdecl.cpp +=================================================================== +--- hhsuite.orig/src/hhdecl.cpp ++++ hhsuite/src/hhdecl.cpp +@@ -13,7 +13,7 @@ void Parameters::SetDefaultPaths() { if (getenv("HHLIB")) strcpy(hhlib, getenv("HHLIB")); else -- strcpy(hhlib, "/usr/lib"); +- strcpy(hhlib, "/usr/lib/hhsuite"); + strcpy(hhlib, "/usr/share/hhsuite"); strcat(strcpy(hhdata, hhlib), "/data"); diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/spelling.patch hhsuite-3.0~beta3+dfsg/debian/patches/spelling.patch --- hhsuite-3.0~beta2+dfsg/debian/patches/spelling.patch 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/spelling.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,68 +0,0 @@ -Description: Fix spelling -Author: Andreas Tille -Last-Update: Sat, 13 Aug 2016 07:27:28 +0200 - ---- a/src/cs/context_profile-inl.h -+++ b/src/cs/context_profile-inl.h -@@ -106,7 +106,7 @@ - assert(len & 1); - if (nalph != Abc::kSize) - throw Exception("Alphabet size of serialized context profile should be %d" -- "but is acutally %d!", Abc::kSize, nalph); -+ "but is actually %d!", Abc::kSize, nalph); - - // If everything went fine we can resize our data memmbers - probs.Resize(len); ---- a/doc/hhsuite-userguide.tex -+++ b/doc/hhsuite-userguide.tex -@@ -1575,7 +1575,7 @@ - -neffmax ]1,20] skip further search iterations when diversity Neff of query MSA - becomes larger than neffmax (default=10.0) - -cpu number of CPUs to use (for shared memory SMPs) (default=2) -- -scores write scores for all pairwise comparisions to file -+ -scores write scores for all pairwise comparisons to file - -atab write all alignments in tabular layout to file - -maxres max number of HMM columns (def=20001) - -maxmem [1,inf[ limit memory for realignment (in GB) (def=3.0) -@@ -1706,7 +1706,7 @@ - Other options: - -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=2) - -cpu number of CPUs to use (for shared memory SMPs) (default=2) -- -scores write scores for all pairwise comparisions to file -+ -scores write scores for all pairwise comparisons to file - -atab write all alignments in tabular layout to file - -maxres max number of HMM columns (def=20001) - -maxmem [1,inf[ limit memory for realignment (in GB) (def=3.0) ---- a/src/cs/crf_state-inl.h -+++ b/src/cs/crf_state-inl.h -@@ -44,7 +44,7 @@ - assert(len & 1); - if (nalph != Abc::kSize) - throw Exception("Alphabet size of serialized CRF state should be %d" -- "but is acutally %d!", Abc::kSize, nalph); -+ "but is actually %d!", Abc::kSize, nalph); - - // If everything went fine we can resize our data memmbers - context_weights.Resize(len); ---- a/src/hhblits.cpp -+++ b/src/hhblits.cpp -@@ -457,7 +457,7 @@ - printf(" becomes larger than neffmax (default=%.1f)\n", par.neffmax); - printf(" -cpu number of CPUs to use (for shared memory SMPs) (default=%i) \n", par.threads); - if (all) { -- printf(" -scores write scores for all pairwise comparisions to file \n"); -+ printf(" -scores write scores for all pairwise comparisons to file \n"); - printf(" -filter_matrices filter matrices for similarity to output at most 100 matrices\n"); - printf(" 
-atab write all alignments in tabular layout to file \n"); - printf(" -maxres max number of HMM columns (def=%5i) \n", par.maxres); ---- a/src/hhsearch.cpp -+++ b/src/hhsearch.cpp -@@ -241,7 +241,7 @@ - printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=%i)\n", par.v); - printf(" -cpu number of CPUs to use (for shared memory SMPs) (default=%i) \n", par.threads); - if (all) { -- printf(" -scores write scores for all pairwise comparisions to file \n"); -+ printf(" -scores write scores for all pairwise comparisons to file \n"); - printf(" -atab write all alignments in tabular layout to file \n"); - printf(" -maxres max number of HMM columns (def=%5i) \n", par.maxres); - printf(" -maxmem [1,inf[ limit memory for realignment (in GB) (def=%.1f) \n", par.maxmem); diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/unnecessary_libz hhsuite-3.0~beta3+dfsg/debian/patches/unnecessary_libz --- hhsuite-3.0~beta2+dfsg/debian/patches/unnecessary_libz 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/unnecessary_libz 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -Description: remove unnecessary libz link -Forwarded: yes -Author: Laszlo Kajan ---- a/src/Makefile -+++ b/src/Makefile -@@ -46,7 +46,7 @@ else # ifndef ICC - HHSEARCH_LIBS = -lpthread -lrt - HHBLITS_LIBS+=-lrt - ifndef NO_PNG -- HHALIGN_LIBS = -DNO_FREETYPE -DHH_PNG -lpng -lz -+ HHALIGN_LIBS = -DNO_FREETYPE -DHH_PNG -lpng - endif - endif - endif # ifndef ICC diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/unused_variables_warning hhsuite-3.0~beta3+dfsg/debian/patches/unused_variables_warning --- hhsuite-3.0~beta2+dfsg/debian/patches/unused_variables_warning 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/unused_variables_warning 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -Description: fix unused variables warning -Forwarded: yes -Author: Laszlo Kajan ---- a/src/pngwriter.cc -+++ b/src/pngwriter.cc -@@ -3343,13 +3343,13 @@ void pngwriter::scale_wh(int finalwidth, - { - std::cerr << " PNGwriter::scale_wh - ERROR **: Negative or zero final width or height not allowed." << std::endl; - } -- -+/* - double kx; - double ky; - - kx = ((double)finalwidth)/((double)width_); - ky = ((double)finalheight)/((double)height_); -- -+*/ - pngwriter temp(finalwidth, finalheight, 0, "temp"); - - int red, green, blue; diff -Nru hhsuite-3.0~beta2+dfsg/debian/patches/version_bump hhsuite-3.0~beta3+dfsg/debian/patches/version_bump --- hhsuite-3.0~beta2+dfsg/debian/patches/version_bump 2017-01-18 15:52:21.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/debian/patches/version_bump 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -Description: fix version where it is not up to date -Author: Laszlo Kajan ---- a/src/hhdecl.C -+++ b/src/hhdecl.C -@@ -10,7 +10,7 @@ - //// Constants - ///////////////////////////////////////////////////////////////////////////////////// - --EXTERN const char VERSION_AND_DATE[]="version 2.0.15 (June 2012)"; -+EXTERN const char VERSION_AND_DATE[]="version 2.0.16 (January 2013)"; - EXTERN const char REFERENCE[]="Remmert M, Biegert A, Hauser A, and Soding J.\nHHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.\nNat. Methods 9:173-175 (2011).\n"; - EXTERN const char COPYRIGHT[]="(C) Johannes Soeding, Michael Remmert, Andreas Biegert, Andreas Hauser\n"; - EXTERN const int MAXSEQ=65535; //max number of sequences in input alignment (must be <~30000 on cluster nodes??) 
diff -Nru hhsuite-3.0~beta2+dfsg/debian/README.source hhsuite-3.0~beta3+dfsg/debian/README.source
--- hhsuite-3.0~beta2+dfsg/debian/README.source 2017-01-18 15:52:21.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/debian/README.source 2018-08-21 19:09:09.000000000 +0000
@@ -1,6 +1,12 @@
 hhsuite for Debian
 ------------------
 
+The ffindex package features an MPI library to work with OpenMPI,
+but only when OpenMPI is found to be installed.
+The ffindex package builds with OpenMPI but does not export that
+library. To overcome this hurdle, please uninstall the OpenMPI packages
+before building this package.
+
 Patches
 .......
 Patches are documented in their headers.
diff -Nru hhsuite-3.0~beta2+dfsg/debian/rules hhsuite-3.0~beta3+dfsg/debian/rules
--- hhsuite-3.0~beta2+dfsg/debian/rules 2017-01-18 15:52:21.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/debian/rules 2018-08-21 19:09:09.000000000 +0000
@@ -5,7 +5,8 @@
 # Uncomment this to turn on verbose mode.
 #export DH_VERBOSE=1
 
-VERSION=$(shell dpkg-parsechangelog -SVersion | cut -f1 -d+)
+include /usr/share/dpkg/default.mk
+
 TIMESTAMP=$(shell dpkg-parsechangelog -STimestamp)
 DATE=$(shell date --date=@$(TIMESTAMP) '+%d %B %Y')
 
@@ -25,10 +26,13 @@
 	dh_installman
 
 $(HHMANPAGES): %.1 : debian/tmp/usr/bin/%
-	unset LD_PRELOAD && export LD_LIBRARY_PATH=debian/tmp/usr/lib && export HHLIB=. && help2man --no-info --help-option '-h all' --version-string $(VERSION) -n "$$n_$*" '$<' | sed -e 's/\(^\|[^\]\)-/\1\\-/g;' > '$@'
+	unset LD_PRELOAD && export LD_LIBRARY_PATH=debian/tmp/usr/lib && export HHLIB=. && help2man --no-info --help-option '-h all' --version-string $(DEB_VERSION_UPSTREAM) -n "$$n_$*" '$<' | sed -e 's/\(^\|[^\]\)-/\1\\-/g;' > '$@'
 
-override_dh_clean:
+override_dh_auto_clean:
+	dh_auto_clean
 	rm -f $(HHMANPAGES)
+	find . -name "*.rej" -delete
+	find . -name "*~" -delete
 	dh_clean
 
 override_dh_auto_build:
diff -Nru hhsuite-3.0~beta2+dfsg/debian/upstream/metadata hhsuite-3.0~beta3+dfsg/debian/upstream/metadata
--- hhsuite-3.0~beta2+dfsg/debian/upstream/metadata 2017-01-18 15:52:21.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/debian/upstream/metadata 2018-08-21 19:09:09.000000000 +0000
@@ -1,9 +1,11 @@
 Name: hhsuite
 Contact: Andreas Hauser
-Homepage: http://toolkit.genzentrum.lmu.de/hhblits
 Reference:
-  - Author: Michael Remmert and Andreas Biegert and Andreas Hauser and Johannes Söding
-    Title: 'HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.'
+  - Author: >
+      Michael Remmert and Andreas Biegert and Andreas Hauser and Johannes Söding
+    Title: >
+      HHblits: Lightning-fast iterative protein sequence searching
+      by HMM-HMM alignment.
     Journal: Nat. Methods
     DOI: 10.1038/NMETH.1818
     Year: 2011
@@ -22,4 +24,12 @@
     PMID: 15531603
     DOI: 10.1093/bioinformatics/bti125
     URL: http://bioinformatics.oxfordjournals.org/content/21/7/951.abstract
-    eprint: http://bioinformatics.oxfordjournals.org/content/21/7/951.full.pdf+html
\ No newline at end of file
+    eprint: "http://bioinformatics.oxfordjournals.org/content/\
+      21/7/951.full.pdf+html"
+Registry:
+  - Name: OMICtools
+    Entry: OMICS_28407
+  - Name: bio.tools
+    Entry: NA
+  - Name: SciCrunch
+    Entry: SCR_010277
diff -Nru hhsuite-3.0~beta2+dfsg/doc/hhsuite-userguide.tex hhsuite-3.0~beta3+dfsg/doc/hhsuite-userguide.tex
--- hhsuite-3.0~beta2+dfsg/doc/hhsuite-userguide.tex 2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/doc/hhsuite-userguide.tex 2017-07-14 10:34:03.000000000 +0000
@@ -387,12 +387,14 @@
 hhsearch Search an HHsuite database with a query MSA or HMM
 hhmake Build an HMM from an input MSA
 hhfilter Filter an MSA by max sequence identity, coverage, and other criteria
-hhalign Calculate pairwise alignments, dot plots etc. for two HMMs/MSAs
+hhalign Calculate pairwise alignments etc. for two HMMs/MSAs
 hhconsensus Calculate the consensus sequence for an A3M/FASTA input file
 
 reformat.pl Reformat one or many MSAs
 addss.pl Add PSIPRED predicted secondary structure to an MSA or HHM file
 hhmakemodel.pl Generate MSAs or coarse 3D models from HHsearch or HHblits results
+hhmakemodel.py Generate coarse 3D models from HHsearch or HHblits results and modify
+ cif files such that they are compatible with MODELLER
 hhsuitedb.py Build HHsuite database with prefiltering, packed MSA/HMM, and index files
 splitfasta.pl Split a multiple-sequence FASTA file into multiple single-sequence files
 renumberpdb.pl Generate PDB file with indices renumbered to match input sequence indices
@@ -401,7 +403,10 @@
 mergeali.pl Merge MSAs in A3M format according to an MSA of their seed sequences
 pdb2fasta.pl Generate FASTA sequence file from SEQRES records of globbed pdb files
+cif2fasta.py Generate a FASTA sequence from the pdbx_seq_one_letter_code entry
+ of the entity_poly of globbed cif files
 pdbfilter.pl Generate representative set of PDB/SCOP sequences from pdb2fasta.pl output
+pdbfilter.py Generate representative set of PDB/SCOP sequences from cif2fasta.py output
 \end{verbatim}
 \normalsize
@@ -526,6 +531,81 @@
 Previous versions of HHsuite (the 'HHsearch package') included a perl script \verb`buildali.pl` to build MSAs for a query sequence using PSI-BLAST as its search engine. Because HHblits performs better than PSI-BLAST in all aspects that we have tested, we decided to remove this script from HHsuite. It can still be downloaded as part of HHsearch version 1.5.0.
 
+\subsection*{Example: Comparative protein structure modeling using HHblits and MODELLER}
+
+A three-dimensional (3D) structure greatly facilitates the functional characterization of proteins. However, for many proteins there are no experimental structures available, and thus, comparative modeling to known protein structures may provide useful insights. In this method, a 3D structure of a given protein sequence (target) is predicted based on alignments to one or more proteins of known structures (templates). In the following, we demonstrate how to create alignments for an unresolved protein with HHblits and the PDB70 database. We then convert the HHblits search results to build a comparative model using MODELLER (v9.16).
+
+In 1999, Wu et al. reported the genomic sequence and evolutionary analysis of lactate dehydrogenase genes from \textit{Trichomonas vaginalis} (TvLDH). Surprisingly, the corresponding protein sequence was most similar to the malate dehydrogenase (TvMDH) of the same organism, implying that TvLDH arose from TvMDH by convergent evolution. In the meantime, the structure of TvLDH has been resolved; however, for instructional purposes, suppose that there is no 3D structure for the protein.
+
+To get started, we obtain the protein sequence of TvLDH from GenBank (accession number \href{http://www.ncbi.nlm.nih.gov/nuccore/AF060233}{AF060233.1}). We first copy-paste the protein sequence into a new file named \verb`query.seq`. The content of this file should look similar to this:
+
+\footnotesize
+\begin{verbatim}
+>TvLDH
+MSEAAHVLITGAAGQIGYILSHWIASGELYGDRQVYLHLLDIPPAMNRLTALTMELEDCAFPHLAGFVATTDP
+KAAFKDIDCAFLVASMPLKPGQVRADLISSNSVIFKNTGEYLSKWAKPSVKVLVIGNPDNTNCEIAMLHAKNL
+KPENFSSLSMLDQNRAYYEVASKLGVDVKDVHDIIVWGNHGESMVADLTQATFTKEGKTQKVVDVLDHDYVFD
+TFFKKIGHRAWDILEHRGFTSAASPTKAAIQHMKAWLFGTAPGEVLSMGIPVPEGNPYGIKPGVVFSFPCNVD
+KEGKIHVVEGFKVNDWLREKLDFTEKDLFHEKEIALNHLAQ
+\end{verbatim}
+\normalsize
+
+The search results obtained by querying the TvLDH sequence against the PDB70 will be significantly better if we use an MSA instead of a single sequence. For this reason, we first query the protein sequence of TvLDH against the uniprot20 database, which covers the whole protein sequence space. By executing
+
+\begin{verbatim}
+$ hhblits -i query.seq -d uniprot -oa3m query.a3m -cpu 4 -n 1
+\end{verbatim}
+
+we obtain an MSA in a3m format which contains several sequences that are similar to TvLDH. Now we can use \verb`query.a3m` to search the PDB70 database for similar protein structures.
+
+\begin{verbatim}
+$ hhblits -i query.a3m -o results.hhr -d pdb70 -cpu 4 -n 1
+\end{verbatim}
+
+Note that we now output an hhr file \verb`results.hhr` instead of an a3m file. Before we convert the search results to a format that is readable by MODELLER, let us quickly inspect \verb`results.hhr`.
+
+\footnotesize
+\begin{verbatim}
+Query TvLDH
+Match_columns 333
+No_of_seqs 2547 out of 8557
+Neff 11.6151
+Searched_HMMs 1566
+Date Tue Aug 16 11:35:02 2016
+Command hhblits -i query.a3m -o results.hhr -d pdb70 -cpu 32 -n 2
+
+ No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM
+ 1 7MDH_C MALATE DEHYDROGENASE; C 100.0 1.5E-39 1.3E-43 270.1 0.0 326 3-333 31-358 (375)
+ 2 4UUL_A L-LACTATE DEHYDROGENASE 100.0 1.8E-35 1.6E-39 244.5 0.0 332 1-332 1-332 (341)
+ 3 4UUP_A MALATE DEHYDROGENASE (E 100.0 2.2E-35 1.9E-39 243.8 0.0 332 1-332 1-332 (341)
+ 4 4UUM_B L-LACTATE DEHYDROGENASE 100.0 5.7E-35 5.1E-39 241.5 0.0 333 1-333 1-333 (341)
+ 5 1CIV_A NADP-MALATE DEHYDROGENA 100.0 1.6E-34 1.5E-38 241.3 0.0 326 2-332 40-367 (385)
+ 6 1Y7T_A Malate dehydrogenase(E. 100.0 3.4E-34 3.1E-38 235.3 0.0 324 1-331 1-325 (327)
+ 7 4I1I_A Malate dehydrogenase (E 100.0 3.8E-34 3.4E-38 236.4 0.0 319 2-327 22-343 (345)
+ 8 1BMD_A MALATE DEHYDROGENASE (E 99.9 5.9E-34 5.3E-38 233.9 0.0 324 1-331 1-325 (327)
+ 9 4H7P_B Malate dehydrogenase (E 99.9 9.9E-34 8.9E-38 233.9 0.0 319 2-327 22-343 (345)
+ 10 2EWD_B lactate dehydrogenase, 99.9 1.1E-33 9.5E-38 231.1 0.0 306 1-328 1-314 (317)
+
+ (...)
+\end{verbatim}
+\normalsize
+
+We find that there are several templates that have a high similarity to our query. Interestingly, the hit with the most significant E-value score is also a malate dehydrogenase. We will use this structure as a basis for our comparative model.
In order to build the model, we first have to obtain the template structure(s). We can get 7MDH by typing the following commands:
+
+\begin{verbatim}
+$ mkdir templates
+$ cd templates
+$ wget http://files.rcsb.org/download/7MDH.cif
+$ cd ..
+\end{verbatim}
+
+To convert our search results \verb`results.hhr` into an alignment that is readable by MODELLER, we use \verb`hhmakemodel.py`.
+
+\begin{verbatim}
+$ python3 hhmakemodel.py results.hhr templates/ TvLDH.pir ./ -m 1
+\end{verbatim}
+
+This script takes four positional arguments: the results file in hhr format, the path to the folder containing all templates in cif format, the output pir file, and the folder where the processed cif files should be written to. The \verb`-m` flag tells \verb`hhmakemodel.py` to include only the first hit in the pir alignment. The pir file together with the processed cif files can be used as input for MODELLER (please refer to the MODELLER documentation for further help).
 
 \subsection{Visually checking an MSA for corrupted regions}
 
@@ -555,10 +635,6 @@
 %\end{center}
 %\end{figure}
 
-
-
-
-
 \subsection{Building customized databases} \label{building_dbs}
 
 It is simple to build custom HHsuite databases using the same tools we use to build the standard HHsuite databases (except uniprot20). An example application is to search for homologs among all proteins of an organism. To build your own HHsuite database from a set of sequences, you first need to generate an MSA with predicted secondary structure for every sequence in the set.
@@ -584,15 +660,15 @@
 Now, add PSIPRED-predicted secondary structure and if possible DSSP secondary structure annotation to all MSAs:
 \begin{verbatim}
 $ mpirun -np ffindex_apply_mpi _a3m_wo_ss.ff{data,index} \\
- -i _a3m.ffindex -d _a3m.ffdata -- addss.pl stdin stdout
-$ rm _wo_ss_a3m.ff{data,index}
+ -i _a3m.ffindex -d _a3m.ffdata -- addss.pl -v 0 stdin stdout
+$ rm _a3m_wo_ss_a3m.ff{data,index}
 \end{verbatim}
 
 We also need to generate an HHM file for each MSA file:
 
 \begin{verbatim}
 $ mpirun -np ffindex_apply_mpi _a3m.ff{data,index} \\
- -i _hhm.ffindex -d _hhm.ffindex -- hhmake -i stdin -o stdout -v 0
+ -i _hhm.ffindex -d _hhm.ffdata -- hhmake -i stdin -o stdout -v 0
 \end{verbatim}
 
 In order to build the ffindex containing the column state sequences for prefiltering for each a3m we run:
@@ -622,6 +698,44 @@
 
 As with all perl/python scripts and binaries in the HHsuite, a list of additional options can be retrieved by calling the scripts without parameters.
 
+\subsection*{Example: Building a database from the PDB}
+
+%Applications in protein-protein or protein-drug interactions often require our users to find homologous protein sequences from the PDB.
+To make efficient sequence searches in the PDB, we provide a precompiled PDB70 database containing PDB sequences clustered at 70 \% sequence identity. However, to find a larger variety of PDB templates, larger databases with more redundancy are required. For this reason, the HHsuite provides tools to build a custom PDB database. In this tutorial, we describe the steps required to build such a database.
+
+First, download the entire PDB database from \href{http://www.rcsb.org/pdb/static.do?p=download/ftp/index.html}{RCSB} in cif file format (this is the successor of the pdb file format) by executing
+
+\begin{verbatim}
+$ rsync --progress -rlpt -v -z --port=33444 rsync.wwpdb.org::ftp/data/structures/
+divided/mmCIF
+\end{verbatim}
+
+and unzip the files into a single directory \verb``.
Then, run \verb`cif2fasta.py` by typing
+
+\begin{verbatim}
+$ python3 cif2fasta.py -i -o pdb100.fas -c -p pdb_filter.dat
+\end{verbatim}
+
+The script scans the folder \verb`` for files with the suffix \verb`*.cif` and writes each sequence and its associated chain identifier, annotated in the \verb`pdbx_seq_one_letter_code` entry of the \verb`entity_poly` table, to the fasta file \verb`pdb100.fas`. By specifying the optional \verb`-p` flag, \verb`cif2fasta.py` creates an additional file \verb`pdb_filter.dat`, which is required by \verb`pdbfilter.py` in a later step. Note that \verb`cif2fasta.py` by default removes sequences which are shorter than 30 residues and/or consist only of the residue 'X'.
+
+If you wish to exhaustively search the PDB, skip the following steps and continue with the instructions described in Section \ref{building_dbs}. However, to increase the speed of database searches, e.g. on systems with limited resources, you can reduce the number of sequences by clustering them with MMSeqs and selecting representative sequences with \verb`pdbfilter.py`. To cluster the sequences of \verb`pdb100.fas` at a sequence identity of \verb`X` and a coverage of \verb`Y`, run MMSeqs with these options:
+
+\begin{verbatim}
+$ mmseqs createdb pdb100.fas /pdb100
+$ mmseqs clusteringworkflow /pdb100 /pdbXX_clu
+/tmp/clustering -c Y --min-seq-id X
+$ mmseqs createtsv /pdb100 /pdb100 /pdbXX_clu
+/pdbXX_clu.tsv
+\end{verbatim}
+
+MMSeqs yields a tab-separated file \verb`pdbXX_clu.tsv`, which contains cluster assignments for all sequences. Representative sequences are selected by \verb`pdbfilter.py`, which chooses up to three sequences for each cluster by identifying the ones having either the highest resolution ($\mathring{A}$), the largest R-free value or the largest "completeness"\footnote{We compute the completeness of a protein structure by dividing the number of residues that are found in the ATOM section by the total number of residues declared in the pdbx\_seq\_one\_letter\_code entry of the entity\_poly table.}.
+
+\begin{verbatim}
+$ python3 pdbfilter.py pdb100.fas pdbXX_clu.tsv pdb_filter.dat pdbXX.fas
+-i pdb70_to_include.dat -r pdb70_to_remove.dat
+\end{verbatim}
+
+\verb`pdbfilter.py` takes the original fasta file (\verb`pdb100.fas`) and the annotation file \verb`pdb_filter.dat`, both of which were created by \verb`cif2fasta.py`, together with the cluster assignments from MMSeqs (\verb`pdb70_clu.tsv`) as input, and outputs the final \verb`pdbXX.fas`. Use this fasta file to complete the creation of your database (see Section \ref{building_dbs}).
 
 \subsection{Modifying or extending existing databases}
 
@@ -861,6 +975,21 @@
 \item For some of the alignments with differing scores, check if the db aligment on the server is the same as the one in your local database. You can download the db MSA by clicking the alignment logo above the query-template pairwise profile-profile alignment.
 \end{itemize}
 
+{\bf I find an alignment of my query and a template with the exact same sequence, but they are aligned incorrectly.} I find a gap inserted at residue position 92 in query and position 99 in template:
+
+\begin{verbatim}
+Q ss_pred CcEEEeeCCCC-CCCeEEEEEECCeeEEEEEEECCCCcEEEEEEeeCCHHHHHHHHhhCCCccceeEEEecccCCCCCCe
+Q Pr_1 81 GAFLIRESESA-PGDFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQQPT 159 (217)
+Q Consensus 81 ~~~~~~~~~~~-~~~~~~s~~~~~~~~~~~i~~~~~~~~~~~~~~f~s~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 159 (217)
+ |.+++|.+... ++.+.+ +..++.++|+++.....+.|+.+...|.++..++.++...+....................
+T Consensus 81 G~flvR~s~~~~~~~~~l-v~~~~~v~h~~i~~~~~g~~~~~~~~f~sl~~Lv~~y~~~~~~~~~~~~l~~~~~~~~~~~ 159 (217)
+T 1gri_A 81 GAFLIRESESAPGDFSLS-VKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQQPT 159 (217)
+T ss_dssp TCEEEEECSSSTTCEEEE-EEETTEEEEEEEEECSSSCEESSSCEESSHHHHHHHHHHSCSSSSTTCCCCBCCCCCCCCC
+T ss_pred CeEEEEECCCCCCCcEEE-EECCCceeEEEEEECCCCcEEEeeEecCCHHHHHHHhhcCCCccccceeccccccccCCce
+\end{verbatim}
+
+The seemingly incorrect alignment is explained by the fact that hhsearch / hhblits aligns profile HMMs, not sequences directly. In this example, the query and template multiple sequence alignments (MSAs) from which the profile HMMs were built are \emph{not} identical, as one can see for instance from the differing Consensus annotation strings and different predicted secondary structure strings. The underlying MSAs probably contain the exact same alignment ambiguity in this region, with some member sequences being shifted by one position between positions 92 and 99 relative to the master sequence of the MSA.
+
 \section{HHsearch/HHblits output: hit list and pairwise alignments}\label{outformat}
@@ -1466,7 +1595,7 @@
 -maxfilt max number of hits allowed to pass 2nd prefilter (default=20000)
 -min_prefilter_hits min number of hits to pass prefilter (default=100)
 -prepre_smax_thresh min score threshold of ungapped prefilter (default=10)
- -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=0)
+ -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=1000.0)
 -pre_bitfactor prefilter scores are in units of 1 bit / pre_bitfactor (default=4)
 -pre_gap_open gap open penalty in prefilter Smith-Waterman alignment (default=20)
 -pre_gap_extend gap extend penalty in prefilter Smith-Waterman alignment (default=4)
@@ -1575,7 +1704,7 @@
 -neffmax ]1,20] skip further search iterations when diversity Neff of query MSA
 becomes larger than neffmax (default=10.0)
 -cpu number of CPUs to use (for shared memory SMPs) (default=2)
- -scores write scores for all pairwise comparisions to file
+ -scores write scores for all pairwise comparisons to file
 -atab write all alignments in tabular layout to file
 -maxres max number of HMM columns (def=20001)
 -maxmem [1,inf[ limit memory for realignment (in GB) (def=3.0)
@@ -1706,7 +1835,7 @@
 Other options:
 -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=2)
 -cpu number of CPUs to use (for shared memory SMPs) (default=2)
- -scores write scores for all pairwise comparisions to file
+ -scores write scores for all pairwise comparisons to file
 -atab write all alignments in tabular layout to file
 -maxres max number of HMM columns (def=20001)
 -maxmem [1,inf[ limit memory for realignment (in GB) (def=3.0)
@@ -1822,12 +1951,11 @@
 If only one alignment/HMM is given it is compared to itself and the
 best off-diagonal alignment plus all further non-overlapping alignments
 above significance threshold are shown. The command also allows to sample
-alignments randomly, to generate png-files with dot plots showing
-alignments or to print out a list of indices of aligned residue pairs.
+alignments randomly or to print out a list of indices of aligned residue pairs.
\small \begin{verbatim} -Usage: hhalign -i query [-t template] [options] +Usage: hhalign -i query -t template [options] -i input/query: single sequence or multiple sequence alignment (MSA) in a3m, a2m, or FASTA format, or HMM in hhm format -t input/template: single sequence or multiple sequence alignment (MSA) @@ -1942,7 +2070,11 @@ special annotation sequences with names \verb`>ss_pred`, \verb`>ss_conf`, and \verb`>ss_dssp` to the top of the output A3M alignment. If no output file is given, the output file will have the same name as the input file, except for the extension being replaced by \verb`'.a3m'`. Allowed input formats are A2M/FASTA (default), -A3M (-a3m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm). +A3M (-a3m), CLUSTAL (-clu), STOCKHOLM (-sto), HMMER (-hmm). Note that in order to add PSIPRED and DSSP annotations, the corresponding paths +have to be set in \verb`HHPaths.pm`. To add DSSP annotations, \verb`addss.pl` first checks a folder of precomputed DSSP files, which +can be obtained from \href{http://www.cmbi.ru.nl/dssp.html}{here}. If the dssp file for a particular structure is not available, \verb`addss.pl` tries to +execute the DSSP binary \verb`mkdssp`. Please keep in mind that only the current DSSP version (2.2.1) supports structures in cif format, and that compiling the software +may cause problems on some systems. If you have trouble compiling DSSP, you may try to delete all occurrences of \verb`"static"` in its Makefile. If the input file contains HMMER models, records SSPRD and SSCON containing predicted secondary structure and confidence values are added to each model. In this case the output file name is @@ -1956,7 +2088,7 @@ \normalsize -\subsection{{\tt hhmakemodel.pl} -- generate MSAs or coarse 3D models from HHsearch results file} +\subsection{{\tt hhmakemodel.pl} and {\tt hhmakemodel.py} -- generate MSAs or coarse 3D models from HHsearch results file} From the top hits in an hhsearch output file (hhr), you can \begin{itemize} @@ -2003,6 +2135,32 @@ \end{verbatim} \normalsize +Analogously, \verb`hhmakemodel.py` generates an MSA in PIR format containing the representative template sequences of all selected alignments. Note that for \verb`hhmakemodel.py` you have to specify the folder containing the *.cif files that make up the alignment. The script will modify the ATOM section of those cif files according to the residue numbering in the alignment. Use these renumbered cif files together with the PIR alignment as input for MODELLER\cite{Sali:1993}. + +\small +\begin{verbatim} +usage: hhmakemodel.py [-h] [-v] [-m INT [INT ...]] [-e FLOAT] [-r FLOAT] [-c] + FILE DIR FILE DIR + +Creates a MODELLER alignment (*.pir) from a HHSearch results file (*.hhr). + +positional arguments: + FILE results file from HHsearch with hit list and alignment + DIR path to the folder containing cif files + FILE output file (PIR-formatted multiple alignment) + DIR path to the folder where modified cif files should be + written to + +optional arguments: + -h, --help show this help message and exit + -v, --verbose verbose mode + -m INT [INT ...] pick hits with specified indices (e.g. -m 2 5) + -e FLOAT maximum E-Value threshold (e.g. -e 0.001) + -r FLOAT residue ratio (keep only alignments that contribute at + least the specified ratio of residues).
+ -c convert non-canonical residues (default = True) +\end{verbatim} +\normalsize \subsection{{\tt hhsuitedb.pl} -- Build an HHsuite database } @@ -2079,6 +2237,58 @@ \end{verbatim} \normalsize +\subsection{{\tt cif2fasta.py} -- Create a fasta file from cif files} + +An example of the usage of \verb`cif2fasta.py` is provided in Section \ref{building_dbs}. + +\small +\begin{verbatim} +Usage: cif2fasta.py -i cif_folder -o *.fasta -c num_cores -v + +cif2fasta.py takes a folder that contains cif files as input and outputs their +sequences into a fasta file. + +Options: + -h, --help show this help message and exit + -i DIR input cif folder. + -o FILE output fasta file. + -p FILE output PDB filter file (optional). + -s FILE SCOP annotation. + -c INT number of cores (default = 1). + -l INT Remove chains with a length < X (default = 30). + -v INT Verbose Mode (quiet = 0, full verbosity = 2). +\end{verbatim} +\normalsize + +\subsection{{\tt pdbfilter.py} -- Filter sequences from the PDB (requires MMSeqs)} + +An example of the usage of \verb`pdbfilter.py` is provided in Section \ref{building_dbs}. Note that the annotation file required by \verb`pdbfilter.py` to select the proper PDB chains is created by \verb`cif2fasta.py` using the \verb`-p` flag. + +\small +\begin{verbatim} +usage: pdbfilter.py [-h] [-i FILE] [-r FILE] [-v] FILE FILE FILE FILE + +pdbfilter.py selects from sequence clusters (determined by MMSeqs) the +sequences which have the best resolution, R-free factor and/or completeness and +writes them to a fasta file. + +positional arguments: + FILE input fasta file (created by cif2fasta.py) + FILE sequence clusters (MMseqs) + FILE annotations file (created by cif2fasta using the -p + flag; contains information about resolution, R-free + and completeness of sequences). + FILE output fasta file + +optional arguments: + -h, --help show this help message and exit + -i FILE, --include FILE + include PDB chains + -r FILE, --remove FILE + exclude PDB chains + -v, --verbose verbose mode +\end{verbatim} +\normalsize \section{Selected changes from previous versions} @@ -2290,7 +2500,7 @@ reliable as for -local. The Probabilities were recalibrated. } \item{A new binary hhalign has been added. It is similar to hhsearch, but performs - only pairwise comparisons. It can produce dot plots, tables of aligned + only pairwise comparisons. It can produce tables of aligned residues, and it can sample alternative alignments stochastically. It uses the MAC algorithm by default. } diff -Nru hhsuite-3.0~beta2+dfsg/README hhsuite-3.0~beta3+dfsg/README --- hhsuite-3.0~beta2+dfsg/README 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/README 2017-07-14 10:34:03.000000000 +0000 @@ -16,6 +16,7 @@ To compile from source, you will need: * a recent C/C++ compiler * CMake 2.8.12 or later + * pdbx python library for example from https://github.com/soedinglab/pdbx ***************************************************************************** 2. Installation diff -Nru hhsuite-3.0~beta2+dfsg/README.md hhsuite-3.0~beta3+dfsg/README.md --- hhsuite-3.0~beta2+dfsg/README.md 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/README.md 2017-07-14 10:34:03.000000000 +0000 @@ -10,12 +10,20 @@ The HH-suite is an open-source software package for sensitive protein sequence searching based on the pairwise alignment of hidden Markov models (HMMs).
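The pdbx python library listed as a new build requirement above is what the added scripts (cif2fasta.py, hhmakemodel.py) use to parse mmCIF files. A minimal sketch of that API, mirroring the calls made in cif2fasta.py further below; the file name 1abc.cif is only a placeholder:

from pdbx.reader.PdbxReader import PdbxReader

def entity_poly_sequences(cif_path):
    # Parse the mmCIF file into a list of data blocks, as cif2fasta.py does.
    with open(cif_path) as cif_fh:
        data = []
        PdbxReader(cif_fh).read(data)
    block = data[0]
    # The entity_poly table holds one row per polymer entity,
    # with its chain identifiers and one-letter sequence.
    entity_poly = block.getObj('entity_poly')
    for row in range(entity_poly.getRowCount()):
        if entity_poly.getValue('type', row) == 'polypeptide(L)':
            chains = entity_poly.getValue('pdbx_strand_id', row)
            seq = entity_poly.getValue('pdbx_seq_one_letter_code', row)
            yield chains, seq.replace('\n', '')

for chains, seq in entity_poly_sequences('1abc.cif'):  # placeholder file name
    print(chains, seq)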
+## WARNING ## +We had to rename our repository for ffindex, so it might be necessary to run the following commands to update the submodule in your clone: + + git pull + git submodule deinit + git submodule init + ## Requirements To compile from source, you will need: * a recent C/C++ compiler * [CMake](http://cmake.org/) 2.8.12 or later + * install [pdbx python library](https://github.com/soedinglab/pdbx) ## Installation diff -Nru hhsuite-3.0~beta2+dfsg/scripts/a3m.py hhsuite-3.0~beta3+dfsg/scripts/a3m.py --- hhsuite-3.0~beta2+dfsg/scripts/a3m.py 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/a3m.py 2017-07-14 10:34:03.000000000 +0000 @@ -1,242 +1,240 @@ #!/usr/bin/env python + class A3MFormatError(Exception): - def __init__(self, value): - self.value = "ERROR:"+value - - def __str__(self): - return repr(self.value) + def __init__(self, value): + self.value = "ERROR: "+value + + def __str__(self): + return repr(self.value) class A3M_Container: - def __init__(self): - self.header = None - self.annotations = dict() - self.consensus = None - self.sequences = [] - self.nr_match_states = None - - def get_number_sequences(self): - return len(self.sequences) - - - def check_and_add_sequence(self, header, sequence): - try: - if(self.check_and_add_annotation(header, sequence)): - pass - elif(self.check_and_add_consensus(header,sequence)): - pass - else: - match_states = self.check_sequence(sequence) - self.check_match_states(match_states) - self.sequences.append((header, sequence)) - except A3MFormatError as e: - print(header) - print(sequence) - raise e - - - def check_and_add_consensus(self, header, sequence): - header_name = header[1:].split()[0] - if header_name.endswith("_consensus"): - if self.consensus: - raise A3MFormatError("Multiple definitions of consensus!") - else: - match_states = self.check_sequence(sequence) - self.check_match_states(match_states) - self.consensus = (header, sequence) - return True - else: - return False - - - def check_and_add_annotation(self, header, sequence): - annotation_classes = [("ss_conf", self.check_ss_conf), ("ss_pred", self.check_ss_pred), ("ss_dssp", self.check_dssp)] - for annotation_class in annotation_classes: - if(header[1:].startswith(annotation_class[0])): - if(annotation_class in self.annotations): - raise A3MFormatError("Multiple definitions of "+annotation_class+"!") + RESIDUES = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + VALID_MATCH_STATES = set(RESIDUES) + VALID_INSERTION_STATES = set(RESIDUES.lower()) + VALID_GAP_STATES = set("-.") + VALID_SS_CONF_STATES = set("0123456789") + VALID_SS_STATES = set("ECH") + VALID_DSSP_STATES = set("CHBEGITS-") + + def __init__(self): + self.header = None + self.annotations = dict() + self.consensus = None + self.sequences = [] + self.nr_match_states = None + + @property + def number_sequences(self): + """get the current number of protein sequences""" + return len(self.sequences) + + def check_and_add_sequence(self, header, sequence): + try: + if (not self.check_and_add_annotation(header, sequence) and + not self.check_and_add_consensus(header, sequence)): + self.check_sequence(sequence) + self.sequences.append((header, sequence)) + except A3MFormatError as e: + raise e + + def check_and_add_consensus(self, header, sequence): + header_name = header[1:].split()[0] + if header_name.endswith("_consensus"): + if self.consensus: + raise A3MFormatError("Multiple definitions of consensus!") + else: + self.check_sequence(sequence) + self.consensus = (header, sequence) + return True else: - match_states = 
annotation_class[1](sequence) - self.check_match_states(match_states) - self.annotations[annotation_class[0]] = sequence - return True - - return False - - - def check_match_states(self, match_states): - if match_states == 0: - raise A3MFormatError("Sequence with zero match states!") - elif self.nr_match_states and match_states != self.nr_match_states: - raise A3MFormatError("Sequence with diverging number of match states ("+str(match_states)+" vs. "+str(self.nr_match_states)+")!") - else: - self.nr_match_states = match_states - - - def check_ss_conf(self, sequence): - allowed_ss_conf_states = set({"0","1","2","3","4","5","6","7","8","9"}) - allowed_gap_states = set({"-", "."}) - - match_states = 0 - - for c in sequence: - if c in allowed_ss_conf_states: - match_states += 1 - elif c not in allowed_gap_states: - raise A3MFormatError("Undefined character '"+c+"' in predicted secondary structure confidence!") - - return match_states - - - def check_ss_pred(self, sequence): - allowed_ss_states = set({"E","C","H"}) - allowed_gap_states = set({"-", "."}) - - match_states = 0 - - for c in sequence: - if c in allowed_ss_states: - match_states += 1 - elif c not in allowed_gap_states: - raise A3MFormatError("Undefined character '"+c+"' in predicted secondary structure!") - - return match_states - - - def check_dssp(self, sequence): - """ - H = alpha-helix - B = residue in isolated beta-bridge - E = extended strand, participates in beta ladder - G = 3-helix (310 helix) - I = 5 helix (pi-helix) - T = hydrogen bonded turn - S = bend - """ - allowed_dssp_states = set({"C", "H", "B", "E", "G", "I", "T", "S", "-"}) - - match_states = 0 - - for c in sequence: - if not c in allowed_dssp_states: - raise A3MFormatError("Undefined character '"+c+"' in dssp annotation!") - else: - match_states += 1 - - return match_states - - - def check_sequence(self, sequence): - allowed_match_states = set({'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}) - allowed_insertion_states = set({'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}) - allowed_gap_states = set({"-", "."}) - - match_states = 0 - - for c in sequence: - if c in allowed_match_states: - match_states += 1 - elif c in allowed_gap_states: - match_states += 1 - elif (c not in allowed_insertion_states): - raise A3MFormatError("Undefined character '"+c+"' in protein sequence!") - - return match_states - - def get_sub_sequence(self, sequence, limits): - allowed_match_states = set({'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}) - allowed_gap_states = set({"-", "."}) - - sub_sequence = "" - - for (start, end) in limits: - start_pos = 0 - pos = -1 - for i in range(len(sequence)): - if sequence[i] in allowed_match_states or sequence[i] in allowed_gap_states: - pos += 1 - if pos + 1 == start: - start_pos = i - break - - end_pos = 0 - pos = -1 - for i in range(len(sequence)): - if sequence[i] in allowed_match_states or sequence[i] in allowed_gap_states: - pos += 1 - if pos + 1 == end: - end_pos = i - break - sub_sequence += sequence[start_pos:end_pos+1] - - return sub_sequence - - - def get_content(self): - content = "" - - if self.header: - content += self.header+"\n" - - if self.consensus: - content += self.consensus[0]+"\n" - content += self.consensus[1]+"\n" - - for (header, sequence) in self.sequences: - content += 
header+"\n" - content += sequence+"\n" - - return content - - - def split_a3m(self, limits): - new_a3m = A3M_Container() - - if self.consensus: - new_consensus_sequence = self.get_sub_sequence(self.consensus[1], limits) - new_a3m.consensus = ((self.consensus[0], new_consensus_sequence)) - - for (header, sequence) in self.sequences: - new_sequence = self.get_sub_sequence(sequence, limits) - new_a3m.sequences.append((header, new_sequence)) - - return new_a3m - - - def read_a3m(self, fh): - lines = fh.readlines() - self.read_a3m_from_lines(lines) - fh.close() - - def read_a3m_from_lines(self, lines): - sequence_header = None - sequence = "" - - is_first_line = True - - for line in lines: - line = line.strip() - if len(line) == 0: - continue - elif line[0] == "#": - if is_first_line: - self.header = line + return False + + def check_and_add_annotation(self, header, sequence): + annotation_classes = [ + ("ss_conf", self.check_ss_conf), + ("ss_pred", self.check_ss_pred), + ("ss_dssp", self.check_dssp) + ] + + for (annotation_name, check) in annotation_classes: + if(header[1:].startswith(annotation_name)): + if(annotation_name in self.annotations): + raise A3MFormatError( + "Multiple definitions of {}!".format(annotation_name) + ) + elif check(sequence): + self.annotations[annotation_name] = sequence + return True + return False + + def check_match_states(self, match_states): + if not self.nr_match_states: + self.nr_match_states = match_states + + if match_states == 0: + raise A3MFormatError("Sequence with zero match states!") + elif match_states != self.nr_match_states: + raise A3MFormatError( + ("Sequence with diverging number " + "of match states ({} vs. {})!").format( + match_states, + self.nr_match_states + ) + ) + + + def check_ss_conf(self, sequence): + count_match_states = sum((c in self.VALID_SS_CONF_STATES + or c in self.VALID_GAP_STATES) + for c in sequence) + self.check_match_states(count_match_states) + + invalid_states = set(sequence) - self.VALID_SS_CONF_STATES + invalid_states -= self.VALID_GAP_STATES + + if len(invalid_states): + raise A3MFormatError( + ("Undefined character(s) '{}' in predicted " + "secondary structure confidence!").format(invalid_states)) else: - #skip line - pass - elif line[0] == ">": - if sequence_header: - self.check_and_add_sequence(sequence_header, sequence) - sequence_header = None - sequence = "" - - sequence_header = line.rstrip() - else: - sequence += line.strip().strip("\x00") - - is_first_line = False - - if sequence_header: - self.check_and_add_sequence(sequence_header, sequence) + return True + + def check_ss_pred(self, sequence): + count_match_states = sum((c in self.VALID_SS_STATES + or c in self.VALID_GAP_STATES) + for c in sequence) + self.check_match_states(count_match_states) + + invalid_states = set(sequence) - self.VALID_SS_STATES + invalid_states -= self.VALID_GAP_STATES + + if len(invalid_states): + raise A3MFormatError( + ("Undefined character(s) '{}' in predicted " + "secondary structure!").format(invalid_states)) + else: + return True + + def check_dssp(self, sequence): + count_match_states = sum( + (c in self.VALID_DSSP_STATES) for c in sequence) + self.check_match_states(count_match_states) + + invalid_states = set(sequence) - self.VALID_DSSP_STATES + + if len(invalid_states): + raise A3MFormatError( + ("Undefined character(s) '{}' in " + "dssp annotation!").format(invalid_states)) + else: + return True + + def check_sequence(self, sequence): + count_match_states = sum((c in self.VALID_MATCH_STATES + or c in self.VALID_GAP_STATES) + 
for c in sequence) + self.check_match_states(count_match_states) + + + invalid_states = set(sequence) - self.VALID_MATCH_STATES + invalid_states -= self.VALID_GAP_STATES + invalid_states -= self.VALID_INSERTION_STATES + + if len(invalid_states): + raise A3MFormatError( + ("Undefined character(s) '{}' in " + "protein sequence!").format(invalid_states)) + else: + return True + + def get_sub_sequence(self, sequence, limits): + sub_sequence = [] + for (start, end) in limits: + start_pos = 0 + pos = -1 + for i in range(len(sequence)): + if (sequence[i] in self.VALID_MATCH_STATES or + sequence[i] in self.VALID_GAP_STATES): + pos += 1 + + if pos + 1 == start: + start_pos = i + break + + end_pos = 0 + pos = -1 + for i in range(len(sequence)): + if (sequence[i] in self.VALID_MATCH_STATES or + sequence[i] in self.VALID_GAP_STATES): + pos += 1 + if pos + 1 == end: + end_pos = i + break + sub_sequence.append(sequence[start_pos:end_pos+1]) + + return "".join(sub_sequence) + + def __str__(self): + content = [] + + if self.header: + content.append(self.header) + + if self.consensus: + content.append(self.consensus[0]) + content.append(self.consensus[1]) + + for (header, sequence) in self.sequences: + content.append(header) + content.append(sequence) + + return "\n".join(content) + + def split_a3m(self, limits): + new_a3m = A3M_Container() + + if self.consensus: + new_consensus_sequence = self.get_sub_sequence(self.consensus[1], + limits) + new_a3m.consensus = (self.consensus[0], new_consensus_sequence) + + for (header, sequence) in self.sequences: + new_sequence = self.get_sub_sequence(sequence, limits) + new_a3m.sequences.append((header, new_sequence)) + + return new_a3m + + def read_a3m(self, fh): + lines = fh.readlines() + self.read_a3m_from_lines(lines) + fh.close() + + def read_a3m_from_lines(self, lines): + sequence_header = None + sequence = [] + + is_first_line = True + + for line in lines: + line = line.strip() + if len(line) == 0: + continue + elif line[0] == "#": + if is_first_line: + self.header = line + elif line[0] == ">": + if sequence_header: + self.check_and_add_sequence(sequence_header, + "".join(sequence)) + sequence = [] + sequence_header = line.rstrip() + else: + sequence.append(line.strip().strip("\x00")) + + is_first_line = False + + if sequence_header: + self.check_and_add_sequence(sequence_header, "".join(sequence)) diff -Nru hhsuite-3.0~beta2+dfsg/scripts/addss.pl hhsuite-3.0~beta3+dfsg/scripts/addss.pl --- hhsuite-3.0~beta2+dfsg/scripts/addss.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/addss.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl # # addss.pl # Add PSIPRED secondary structure prediction (and DSSP annotation) to an MSA or HMMER file. @@ -226,7 +226,7 @@ } # Filter alignment to diversity $neff - if ( $v >= 1 ) { printf("Filtering alignment to diversity $neff ...\n"); } + if ( $v >= 1 ) { printf(STDERR "Filtering alignment to diversity $neff ...\n"); } &HHPaths::System( "hhfilter -v $v2 -neff $neff -i $tmpfile.in.a3m -o $tmpfile.in.a3m"); @@ -248,13 +248,13 @@ $ss_dssp =~ s/(\S{$numres})/$1\n/g; } printf( ALIFILE ">ss_dssp\n%s\n", $ss_dssp ); - if ( $v >= 1 ) { print("\nAdding DSSP state sequence ...\n"); } + if ( $v >= 1 ) { print(STDERR "\nAdding DSSP state sequence ...\n"); } } } # Secondary structure prediction with psipred if ( $v >= 2 ) { - print("Predicting secondary structure with PSIPRED ... "); + print(STDERR "Predicting secondary structure with PSIPRED ... 
"); } &RunPsipred("$tmpfile.sq"); @@ -302,7 +302,7 @@ close(IN) } - if ( $v >= 2 ) { print("done \n"); } + if ( $v >= 2 ) { print(STDERR "done \n"); } } ############################################################## # HMMER format @@ -503,7 +503,7 @@ &HHPaths::System("rm $tmpfile.mtx $tmpfile.ss $tmpfile.ss2"); if ( $v >= 2 ) { - printf( "Added PSIPRED secondary structure to %i models\n", $nmodels ); + printf( STDERR "Added PSIPRED secondary structure to %i models\n", $nmodels ); } } @@ -629,7 +629,7 @@ } # PDB ID? (8fab_A, 1a0i) - elsif ( $line =~ /^>(\d[a-z0-9]{3})_?(\S?)\s/ ) { + elsif ( $line =~ /^>(\d[A-Za-z0-9]{3})_?(\S?)\s/ ) { $pdbcode = $1; if ( $2 ne "" ) { $qrange = "$2:"; } else { $qrange = "-"; } @@ -646,7 +646,7 @@ else { if ( $v >= 3 ) { - print( + print( STDERR "Warning: no pdb code found in sequence name '$name'\n" ); } @@ -665,11 +665,12 @@ } close(QFILE); if ( $v >= 3 ) { - printf( "Searching DSSP state assignments: name=%s range=%s\n", + printf( STDERR "Searching DSSP state assignments: name=%s range=%s\n", $name, $qrange); } # Try to open dssp file + $pdbcode =~ tr/[A-Z]/[a-z]/; my $dsspfile = "$dsspdir/$pdbcode.dssp"; if ( -e $dsspfile ) { ( $aa_dssp, $ss_dssp, $sa_dssp ) = &readDSSP( $dsspfile, $qrange ); @@ -891,7 +892,8 @@ $pdbfile = "$pdbdir/divided/" . substr( $pdbcode, 1, 2 ) . "/"; } else { $pdbfile = "$pdbdir/"; } - if ( $pdbdir =~ /divided.?$/ ) { + + if ( $pdbdir =~ /divided.?$/ ) { $pdbfile .= substr( $pdbcode, 1, 2 ) . "/"; } if ( -e $pdbfile . "pdb$pdbcode.ent" ) { $pdbfile .= "pdb$pdbcode.ent"; } @@ -901,7 +903,8 @@ elsif ( -e $pdbfile . "pdb$pdbcode.ent.Z" ) { $pdbfile .= "pdb$pdbcode.ent.Z"; } - elsif ( -e $pdbfile . "$pdbcode.pdb" ) { $pdbfile . "$pdbcode.pdb"; } + elsif ( -e $pdbfile . "$pdbcode.pdb" ) { $pdbfile .= "$pdbcode.pdb"; } + elsif ( -e $pdbfile . 
"$pdbcode.cif" ) { $pdbfile .= "$pdbcode.cif"; } else { if ( $v >= 3 ) { printf( STDERR "Warning in $program: Cannot find pdb file $pdbfile" @@ -909,6 +912,7 @@ } return ""; } + if ( !open( PDBFILE, "$pdbfile" ) ) { if ( $v >= 3 ) { printf( STDERR "Error in $program: Cannot open pdb file: $!\n" ); diff -Nru hhsuite-3.0~beta2+dfsg/scripts/check_a3m.py hhsuite-3.0~beta3+dfsg/scripts/check_a3m.py --- hhsuite-3.0~beta2+dfsg/scripts/check_a3m.py 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/check_a3m.py 2017-07-14 10:34:03.000000000 +0000 @@ -7,18 +7,18 @@ def check_a3m(filename): a3m = A3M_Container() - + if(filename.lower() == "stdin"): fh = sys.stdin else: fh = open(filename, "r") - + try: a3m.read_a3m(fh) except A3MFormatError as e: - sys.stderr.write(e) + sys.stderr.write(str(e)) exit(1) - + def main(): filename = sys.argv[1] @@ -27,4 +27,3 @@ if __name__ == "__main__": main() - diff -Nru hhsuite-3.0~beta2+dfsg/scripts/cif2fasta.py hhsuite-3.0~beta3+dfsg/scripts/cif2fasta.py --- hhsuite-3.0~beta2+dfsg/scripts/cif2fasta.py 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/cif2fasta.py 2017-07-14 10:34:03.000000000 +0000 @@ -3,29 +3,38 @@ """ Created on Mon Jun 15 21:49:32 2015 -@author: Sagar +@author: Harald Voehringer """ -import itertools -from sys import argv, exit -import sys +import sys, os, glob, textwrap, itertools from optparse import OptionParser +from collections import defaultdict from os.path import splitext from pdbx.reader.PdbxReader import PdbxReader -from pdbx.writer.PdbxWriter import PdbxWriter -from pdbx.reader.PdbxContainers import * -from Bio import pairwise2 -from Bio.SubsMat import MatrixInfo as matlist -from Bio.Blast import NCBIStandalone -from Bio import SeqIO from multiprocessing import Pool -import os -import glob -import textwrap - -DEBUG_MODE = True +DEBUG_MODE = False +MIN_SEQ_LEN = None +SCOP_LIBRARY = False + +THREE2ONE = { + 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I', 'PRO': 'P', + 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', + 'TRP': 'W', 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M', 'MSE': 'M', + 'HYP': 'P', 'MLY': 'K', 'SEP': 'S', 'TPO': 'T', 'CSO': 'C', 'PTR': 'Y', 'KCX': 'K', + 'CME': 'C', 'CSD': 'A', 'CAS': 'C', 'MLE': 'L', 'DAL': 'A', 'CGU': 'E', 'DLE': 'L', + 'FME': 'M', 'DVA': 'V', 'OCS': 'C', 'DPR': 'P', 'MVA': 'V', 'TYS': 'Y', 'M3L': 'K', + 'SMC': 'C', 'ALY': 'K', 'CSX': 'C', 'DCY': 'C', 'NLE': 'L', 'DGL': 'E', 'DSN': 'S', + 'CSS': 'C', 'DLY': 'K', 'MLZ': 'K', 'DPN': 'F', 'DAR': 'R', 'PHI': 'F', 'IAS': 'D', + 'DAS': 'D', 'HIC': 'H', 'MP8': 'P', 'DTH': 'T', 'DIL': 'I', 'MEN': 'N', 'DTY': 'Y', + 'CXM': 'M', 'DGN': 'G', 'DTR': 'W', 'SAC': 'S', 'DSG': 'N', 'MME': 'M', 'MAA': 'A', + 'YOF': 'Y', 'FP9': 'P', 'FVA': 'V', 'MLU': 'L', 'OMY': 'Y', 'FGA': 'E', 'MEA': 'F', + 'CMH': 'C', 'DHI': 'H', 'SEC': 'C', 'OMZ': 'Y', 'SCY': 'C', 'MHO': 'M', 'MED': 'M', + 'CAF': 'C', 'NIY': 'Y', 'OAS': 'S', 'SCH': 'C', 'MK8': 'L', 'SME': 'M', 'LYZ': 'K' + } +CANONICAL_RESIDUES = set(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', + 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']) class CIF2FASTA(object): @@ -36,16 +45,18 @@ def open_cif(self): """ Assumes a mmCif file and returns a data block used for subsequent procedures. 
""" # The "usual" procedure to open a mmCIF with pdbX/mmCIF - try: - with open(self.cif_path) as cif_fh: - data = [] - reader = PdbxReader(cif_fh) - reader.read(data) + + with open(self.cif_path) as cif_fh: + data = [] + reader = PdbxReader(cif_fh) + reader.read(data) + if len(data) == 0: + return None + else: return data[0] - except: - print "! Unexpected error during open_cif:", sys.exc_info()[0], in_file - return None - + + def is_valid(self): + return self.block is not None def chain_to_seq(self): """Extracts the sequence of the cif from entity_poly.pdbx_seq_one_letter_code""" @@ -54,69 +65,212 @@ non_polypeptide_chains = list() try: - entity_poly = self.block.getObj("entity_poly") - for row in range(0, entity_poly.getRowCount()): - if entity_poly.getValue("type", row) == "polypeptide(L)": - seq = entity_poly.getValue('pdbx_seq_one_letter_code', row) - parsed_seq = parse_seq(seq) # removes special amino acids and newlines + entity_poly = self.block.getObj('entity_poly') + except AttributeError: + + if DEBUG_MODE > 0: + print ('! {pdb} Could not extract entity_poly table.'.format( + pdb = self.pdb_entry())) + + return False + + try: + total_rows = entity_poly.getRowCount() + except AttributeError: + print ('! {pdb} Could not extract rows from entity_poly.'.format( + pdb = self.pdb_entry())) + + return False + + for row in range(0, total_rows): + if entity_poly.getValue('type', row) == 'polypeptide(L)': + seq = entity_poly.getValue('pdbx_seq_one_letter_code', row) + parsed_seq = parse_seq(seq) # removes special amino acids and newlines + + try: chains = entity_poly.getValue('pdbx_strand_id', row) - chain_list = chains.split(',') + except ValueError: + + if total_rows == 1: + print ('! {pdb} Only one polypeptide chain, but no chain identifiers, setting it to ".".'.format( + pdb = self.pdb_entry())) + + cif_chain_to_seq['.'] = parsed_seq + + return cif_chain_to_seq + + print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (polypeptide).'.format( + pdb = self.pdb_entry())) + + return False + + chain_list = chains.split(',') - for chain in chain_list: - cif_chain_to_seq[chain] = parsed_seq - else: + for chain in chain_list: + cif_chain_to_seq[chain] = parsed_seq + + else: + + try: chains = entity_poly.getValue('pdbx_strand_id', row) - non_polypeptide_chains.append(chains) - except AttributeError: - if DEBUG_MODE: - print "- Could not extract sequences from entity_poly (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during chain_to_seq (entity_poly):", sys.exc_info()[0], in_file - return None + except ValueError: + print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (non-polypeptide).'.format( + pdb = self.pdb_entry())) + + return False + + non_polypeptide_chains.append(chains) + + chains = list(cif_chain_to_seq.keys()) + # remove chains that contain only unknown residues + for chain in chains: + # this is a very odd way to check whether a string contains only a single char + tmp_set = set(cif_chain_to_seq[chain]) + if len(tmp_set) == 1 and 'X' in tmp_set: + print ('! Removing {pdb}_{chain} (contains only unknown residues).'.format( + pdb = self.pdb_entry(), + chain = chain)) + + del cif_chain_to_seq[chain] + continue + + if len(cif_chain_to_seq[chain]) < MIN_SEQ_LEN: + print ('! 
Removing {pdb}_{chain} (sequence length < {min_len}).'.format( + pdb = self.pdb_entry(), + chain = chain, + min_len = MIN_SEQ_LEN)) + + del cif_chain_to_seq[chain] if len(cif_chain_to_seq) != 0: - if DEBUG_MODE: - print "- Extracted chains of " + str(self.cif_path.split("/")[-1]) + " " + " ".join( str(chain) + " (" + str(len(cif_chain_to_seq[chain])) + ")" for chain in cif_chain_to_seq.keys() ) + "." + + if DEBUG_MODE > 1: + print ('- Extracted chains of {pdb} {chains}.'.format( + pdb = self.pdb_entry(), + chains = ' '.join( str(chain) + ' (' + str(len(cif_chain_to_seq[chain])) + ')' for chain in cif_chain_to_seq.keys()))) + if len(non_polypeptide_chains) != 0: - print "- Following chains were non polypeptide chains " + ", ".join(non_polypeptide_chains) + " no polypeptide chains were found." + print ('- Following chains were non polypeptide chains {chains} no polypeptide chains were found.'.format( + chains = ', '.join(non_polypeptide_chains))) + return cif_chain_to_seq + else: - if DEBUG_MODE: - print "- No polypeptide chains were found in (" + str(self.cif_path.split("/")[-1]) + ")." + if DEBUG_MODE > 0: + print ('! {pdb} No polypeptide chains were found.'.format( + pdb = self.pdb_entry())) + + return False + + def chain_ratios(self, chain_to_seq): + """ Tries to extract Sequence from the atom section """ + + # chain_to_seq = self.chain_to_seq() + + if chain_to_seq != False: + chain_ratios = dict() - return None + # compute the lengths of sequences found in _entity_poly + entity_length = { chain : float(len(seq)) for chain, seq in chain_to_seq.items() } + entity_chains = entity_length.keys() + + # load the atomsite and set up dictionary to keep track of sequences + atom_site = self.block.getObj('atom_site') + atom_seq = defaultdict(str) + + current_residue = 0 + # Iterate through the atomsection of the cif file + for atom_row in range(0, atom_site.getRowCount()): + + # NMR structures contain many confomers + try: + model_num = int(atom_site.getValue('pdbx_PDB_model_num', atom_row)) + except ValueError: + model_num = 1 + + if model_num > 1: + continue + + atom_chain = atom_site.getValue('label_asym_id', atom_row) + + # get the alternative chain identifier too + try: + alt_chain = atom_site.getValue('auth_asym_id', atom_row) + except ValueError: + alt_chain = None + + # handle cases where there are no chains but only one structure + if atom_chain == '.' and entity_chains[0] == '.': + atom_chain = '.' 
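+                # atom_seq collects, per chain, the one-letter residues actually
+                # observed in the ATOM records; chain_ratios later divides this
+                # count by the entity_poly sequence length to obtain the
+                # completeness ratio written as CMP to the fasta headers and to
+                # pdb_filter.dat.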
+ + # get the residue and the residue number + try: + res_num = int(atom_site.getValue("label_seq_id", atom_row)) + except ValueError: + continue + + if res_num != current_residue: + residue = atom_site.getValue('label_comp_id', atom_row) + + try: + residue = THREE2ONE[residue] + except KeyError: + residue = 'X' + + # try to get the chain identifier from alt_chain first, if this does not work use label_asym_id + if alt_chain is not None: + atom_seq[alt_chain] += residue + + # sometimes we find the right chain identifier not in the alt_chain + if not (atom_chain in atom_seq.keys()) and atom_chain is not None: + atom_seq[atom_chain] += residue + + current_residue = res_num + + for chain in entity_length.keys(): + if chain in atom_seq.keys(): + chain_ratios[chain] = len(atom_seq[chain]) / entity_length[chain] + else: + chain_ratios[chain] = 0 + + return chain_ratios + else: + return False def pdb_entry(self): """Extracts the PDB entry information of a cif file.""" try: - entry = self.block.getObj("entry") - entry_id = entry.getValue("id") - return entry_id.replace("\n", " ") - except AttributeError: - if DEBUG_MODE: - print "- Could not extract id from entry (" + str(self.cif_path.split("/")[-1]) + ")." - except: - print "! Unexpected error during pdb_entry (entry):", sys.exc_info()[0], in_file - return None - + entry = self.block.getObj('entry') + entry_id = entry.getValue('id') + return entry_id.replace('\n', ' ') + + except AttributeError: + if DEBUG_MODE > 0: + print ('! {pdb} Could not extract id from entry.'.format( + pdb = self.pdb_entry())) def protein_description(self): """Extracts the protein description annotated in struct.pdbx_descriptor of the cif file.""" try: # Get struct table which contains the protein description struct = self.block.getObj('struct') - # Get the pdbx description - protein_description = struct.getValue("pdbx_descriptor") - return protein_description.replace("\n", " ") - except AttributeError: - if DEBUG_MODE: - print "- Could not extract pdbx_descriptor from struct table (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during protein_description (struct):", sys.exc_info()[0], in_file - return None + # Get the pdbx description and make format it appropritaly + protein_description = struct.getValue('pdbx_descriptor') + protein_description = protein_description.replace('\n', ' ') + protein_description = protein_description.replace(';', ' ') # to prevent parsing errors + + if len(protein_description.split(' ')) >= 5: + protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header + + return protein_description.strip(',') + + except AttributeError: + if DEBUG_MODE > 1: + print ('! 
{pdb} Could not extract pdbx_descriptor from struct table.'.format( + pdb = self.pdb_entry())) + + return False def compounds(self): """ Extracts all compounds annotated in the HETATM section of the atom @@ -124,12 +278,13 @@ (HOH).""" atom_site = self.block.getObj('atom_site') - compounds = {} + for row in range(0, atom_site.getRowCount()): - if atom_site.getValue("group_PDB", row) == "HETATM": + if atom_site.getValue('group_PDB', row) == 'HETATM': - label_comp_id = atom_site.getValue("label_comp_id", row) + label_comp_id = atom_site.getValue('label_comp_id', row) + if label_comp_id not in compounds.keys(): compounds[label_comp_id] = 1 else: @@ -138,103 +293,199 @@ filtered_compounds = set() for compound in compounds.keys(): - if compounds[compound] >= 10 and compound != "HOH": + if compounds[compound] >= 10 and compound != 'HOH': filtered_compounds.add(compound) if len(filtered_compounds) == 0: - return None + return False else: - return ", ".join(filtered_compounds).replace("\n", " ") + return ', '.join(filtered_compounds).replace('\n', ' ') def resolution(self): """Extracts the resolution of the mmCIF.""" + + try: + refine = self.block.getObj('refine') + resolution = refine.getValue('ls_d_res_high') + + try: + resolution = float(resolution) + except ValueError: + return False + + return resolution + + except AttributeError: + if DEBUG_MODE > 1: + print ('! {pdb} Could not extract ls_d_res_high from refine table.'.format( + pdb = self.pdb_entry())) + try: reflns = self.block.getObj('reflns') # Extract the resolution of the crystal - resolution = reflns.getValue("d_resolution_high") - return resolution.replace("\n", " ") + resolution = reflns.getValue('d_resolution_high') + + try: + resolution = float(resolution) + except ValueError: + return False + + return resolution + + except AttributeError: + if DEBUG_MODE > 1: + print ('! {pdb} Could not extract d_resolution_high from reflns table.'.format( + pdb = self.pdb_entry())) + + # This is true for some Electron Microscopy structures + try: + em_3d = self.block.getObj('em_3d_reconstruction') + resolution = em_3d.getValue('resolution') + + try: + resolution = float(resolution) + except ValueError: + return False + + return resolution except AttributeError: - if DEBUG_MODE: - print "- Could not extract d_resolution_high from reflns table (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during resolution (reflns):", sys.exc_info()[0], in_file - return None + if DEBUG_MODE > 1: + print ('! {pdb} Could not extract resolution from em_3d_reconstruction table.'.format( + pdb = self.pdb_entry())) + + return False def experimental_method(self): """Extracts the experimental method of the mmCIF.""" try: reflns = self.block.getObj('exptl') - method = reflns.getValue("method") - return method.replace("\n", " ") + method = reflns.getValue('method') + return method.replace('\n', ' ') + except AttributeError: - if DEBUG_MODE: - print "- Could not extract text from exptl table (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during experimental_method (exptl):", sys.exc_info()[0], in_file - return None - + + if DEBUG_MODE > 1: + print ('! 
Could not extract text from exptl table.'.format( + pdb = self.pdb_entry())) + + return False def keywords(self): """Extracts the keywords of the mmCIF.""" try: - reflns = self.block.getObj("struct_keywords") - keywords = reflns.getValue("text") - return keywords.replace("\n", " ") - except AttributeError: - if DEBUG_MODE: - print "- Could not extract text from struct_keywords table (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during keywords (struct_keywords):", sys.exc_info()[0], in_file - return None + reflns = self.block.getObj('struct_keywords') + keywords = reflns.getValue('text') + # perform some string modifications + keywords = keywords.replace('\n', ' ') + keywords = keywords.replace(';', ' ') + + if len(keywords.split(' ')) >= 5: + keywords = ' '.join(keywords.split(' ')[0:5]) + + return keywords.rstrip(',') + + except AttributeError: + + if DEBUG_MODE > 1: + print ('! {pdb} Could not extract text from struct_keywords table.'.format( + pdb = self.pdb_entry())) + + return False def organism(self): """Extracts the organism of the mmCIF.""" try: entity_src_nat = self.block.getObj('entity_src_nat') - organsim_scientific = entity_src_nat.getValue("pdbx_organism_scientific") - return organsim_scientific.replace("\n", " ") + organsim_scientific = entity_src_nat.getValue('pdbx_organism_scientific') + return organsim_scientific.replace('\n', ' ') + except AttributeError: - if DEBUG_MODE: - print "- Could not extract from pdbx_organism_scientific from entity_src_gen table (" + str(self.cif_path.split("/")[-1]) + ")." + + if DEBUG_MODE > 1: + print ("! {pdb} Could not extract from pdbx_organism_scientific from entity_src_gen table (".format( + pdb = self.pdb_entry())) pass - except: - print "! Unexpected error during get_organsm (pdbx_organism_scientific):", sys.exc_info()[0], in_file - return None try: entity_src_gen = self.block.getObj("entity_src_gen") src_scientific = entity_src_gen.getValue("pdbx_gene_src_scientific_name") return src_scientific.replace("\n", " ") + except AttributeError: - if DEBUG_MODE: - print "- Could not extract from pdbx_gene_src_scientific_name from entity_src_gen table (" + str(self.cif_path.split("/")[-1]) + ")." - return None - except: - print "! Unexpected error during organism (pdbx_gene_src_scientific_name):", sys.exc_info()[0], in_file - return None + + if DEBUG_MODE > 1: + print ('! {pdb} Could not extract from pdbx_gene_src_scientific_name from entity_src_gen table'.format( + pdb = self.pdb_entry())) + + return False + + def r_free(self): + + try: + refine = self.block.getObj('refine') + r_free = refine.getValue('ls_R_factor_R_free') + + try: + r_free = float(r_free) + except ValueError: + return False + + except AttributeError: + if DEBUG_MODE > 2: + print ('! Could not extract R free factor ({pdb})'.format( + pdb = self.pdb_entry())) + + except ValueError: + if DEBUG_MODE > 2: + print ('! R free factor is not annotated ({pdb})'.format( + pdb = self.pdb_entry())) + + + return False + # Helper functions def parse_seq(orginal_seq): + """Parses the cif fasta sequence and replaces non-canonical residues with their canonical counterparts.""" seq = orginal_seq while seq.find('(') != -1: start_pos = seq.find('(') stop_pos = seq.find(')') + residue = seq[start_pos + 1:stop_pos] + + try: + canonical = THREE2ONE[residue] + except KeyError: + canonical = 'X' + + if DEBUG_MODE > 1: + print ('! 
Replaced non canonical residue {nc} with {c}'.format( + nc = residue, + c = canonical)) if start_pos == 0: - seq = 'X' + seq[stop_pos+1:] + seq = canonical + seq[stop_pos+1:] elif stop_pos == len(seq): - seq = seq[0:start_pos] + 'X' + seq = seq[0:start_pos] + canonical else: pre_seq = seq[0:start_pos] post_seq = seq[stop_pos+1:] - seq = pre_seq + 'X' + post_seq + seq = pre_seq + canonical + post_seq + + seq = seq.replace('\n', '') + seq_array = [] + + for c in seq: + if c in CANONICAL_RESIDUES: + seq_array.append(c) + else: + seq_array.append('X') + + seq = ''.join(seq_array) - return seq.replace('\n', '') + return seq def get_paths(in_folder, out_folder): @@ -263,27 +514,58 @@ resolution = cif2fasta.resolution() method = cif2fasta.experimental_method() organism = cif2fasta.organism() + r_free = cif2fasta.r_free() # construct the header with the data given - header_information = "" + header_information = '' if protein_description: - header_information += protein_description + "; " + protein_description = protein_description.replace(';', ' ') # to prevent parsing errors + protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header + header_information += 'DSC: ' + protein_description + '; ' + else: + header_information += 'DSC: N/A; ' - if keywords: - header_information += keywords + "; " + # if keywords: + # header_information += keywords + '; ' - if compounds: - header_information += "HET: " + compounds + "; " + if method: + header_information += 'MET: ' + method + '; ' + else: + header_information += 'MET: N/A; ' if resolution: - header_information += resolution +"A " - - elif method: - header_information += method + " " + header_information += 'RES: ' + resolution + '; ' + else: + header_information += 'RES: N/A; ' + + if r_free: + header_information += 'RFR: ' + r_free + '; ' + else: + header_information += 'RFR: N/A; ' if organism: - header_information += "{" + organism + "} " + header_information += 'ORG: ' + organism + '; ' + else: + header_information += 'ORG: N/A; ' + + if compounds: + header_information += 'HET: ' + compounds + '; ' + else: + header_information += 'HET: N/A; ' + + if SCOP_LIBRARY: + + try: + scop_idx = SCOP_LIBRARY[cif2fasta.pdb_entry()] + if len(scop_idx) != 0: + domains = ', '.join(scop_idx) + else: + domains = 'N/A' + except KeyError: + domains = 'N/A' + + header_information += 'SCOP: ' + domains + '; ' return header_information.strip() @@ -295,67 +577,209 @@ pdb_entry = cif2fasta.pdb_entry() header = construct_header(cif2fasta) chain_to_seq = cif2fasta.chain_to_seq() + chain_ratios = cif2fasta.chain_ratios(chain_to_seq) + #import pdb; pdb.set_trace() fasta_entry = "" if chain_to_seq and pdb_entry: - try: - for chain in chain_to_seq.keys(): - if len(chain_to_seq[chain]) != 0: - fasta_entry += ">" + pdb_entry + "_" + chain + " " + header + "\n" + "\n".join(textwrap.wrap(chain_to_seq[chain], 80)) + "\n" - return fasta_entry - except: - fasta_entry = None - print "Unexpected error during create_fasta_entry:", sys.exc_info()[0], in_file - return None + for chain in sorted(chain_to_seq.keys()): + if len(chain_to_seq[chain]) != 0: + + fasta_entry += '>{pdb}_{chain} {header} CMP: {rat:.2f}\n{seq}\n'.format( + pdb = pdb_entry, + chain = chain, + header = header, + rat = chain_ratios[chain], + seq = '\n'.join(textwrap.wrap(chain_to_seq[chain], 80))) + + # fasta_entry += '>' + pdb_entry + '_' + chain + ' ' + header + '\n' + '\n'.join(textwrap.wrap(chain_to_seq[chain], 80)) + '\n' + return fasta_entry else: return None + +def 
create_fasta_entry2(cif2fasta): + """" Creates a fasta entry.""" + + # Get all the information we need + pdb_entry = cif2fasta.pdb_entry() + protein_description = cif2fasta.protein_description() + keywords = cif2fasta.keywords() + compounds = cif2fasta.compounds() + resolution = cif2fasta.resolution() + method = cif2fasta.experimental_method() + organism = cif2fasta.organism() + r_free = cif2fasta.r_free() + + # get chains and sequences + chain_to_seq = cif2fasta.chain_to_seq() + chain_ratios = cif2fasta.chain_ratios(chain_to_seq) + + is_NMR = False + if 'NMR' in method: + is_NMR = True + + fasta_entry = '' + pdb_filter = '' # If needed create entries that are parseable by pdb filter + + if chain_to_seq and pdb_entry: + for chain, seq in chain_to_seq.items(): + + # check if we can find SCOP domains for the current chain + domains = False + + if SCOP_LIBRARY: + try: + scop_idx = SCOP_LIBRARY[pdb_entry + '_' + chain] + if len(scop_idx) != 0: + domains = ', '.join(scop_idx) + else: + domains = False + except KeyError: + domains = False + + if is_NMR: + fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format( + p = pdb_entry, + c = chain, + d = ' ' + protein_description + ';' if protein_description else '', + k = ' ' + keywords + ';' if keywords else '', + h = ' HET: ' + compounds + ';' if compounds else '', + r = ' NMR ', + o = organism if organism else 'N/A', + s = ' SCOP: ' + domains if domains else '', + seq = '\n'.join(textwrap.wrap(seq, 80))) + pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format( + p = pdb_entry, + c = chain, + m = method, + r = 'N/A', + f = round(r_free, 3) if r_free else 'N/A', + comp = round(chain_ratios[chain], 3)) + else: + fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format( + p = pdb_entry, + c = chain, + d = ' ' + protein_description + ';' if protein_description else '', + k = ' ' + keywords + ';' if keywords else '', + h = ' HET: ' + compounds + ';' if compounds else '', + r = ' ' + str(resolution) + 'A ' if resolution else '', + o = organism if organism else 'N/A', + s = ' SCOP: ' + domains if domains else '', + seq = '\n'.join(textwrap.wrap(seq, 80))) + # _\t\t\t + pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format( + p = pdb_entry, + c = chain, + m = method, + r = round(resolution, 3) if resolution else 'N/A', + f = round(r_free, 3) if r_free else 'N/A', + comp = round(chain_ratios[chain], 3)) + + return (fasta_entry, pdb_filter) + +def parse_scop(scop_file): + scop = defaultdict(set) + + with open(scop_file) as fh: + for line in fh: + if line.startswith('#'): + continue + scop_id, pdb_code, chain, scop_num = line.split('\t')[0:4] + chain = chain.split(':')[0] + entry = pdb_code.upper() + '_' + chain + + scop[entry].add(scop_num) + + return scop + def wrapper_function(paths): in_file = paths[0] out_file = paths[1] - try: - cif2fasta = CIF2FASTA(in_file) - fasta_entry = create_fasta_entry(cif2fasta) - except: + cif2fasta = CIF2FASTA(in_file) + if cif2fasta.is_valid(): + fasta_entry = create_fasta_entry2(cif2fasta) + else: + print("Warning: Could not read %".format(in_file), file=sys.stderr) fasta_entry = None - print "Unexpected error:", sys.exc_info()[0], in_file - pass - if fasta_entry: - with open(out_file, 'w') as ofile: - ofile.write(fasta_entry) - - + return fasta_entry + +def write_to_file(line_list, fname, pdb_filter): + """ + Input: A list containing all lines that have to be saved to the file (line_list). + A filename (str) is the name of the file that is created. 
+ Output: A file containing the lines in line_list. + """ + if pdb_filter: + fasta_file = open(fname, 'w') + pdb_filter = open(pdb_filter, 'w') + + pdb_filter.write('#pdb_chain\tresolution\tr_free\tcompleteness\tmethod\n') + + for line in line_list: + if line is not None: + fasta_file.write(line[0]) + pdb_filter.write(line[1]) + + fasta_file.close() + pdb_filter.close() + else: + fasta_file = open(fname, 'w') + + for line in line_list: + if line is not None: + fasta_file.write(line[0]) def opt(): # Initiate a OptionParser Class - parser = OptionParser() + usage = "usage: cif2fasta.py -i cif_folder -o *.fasta -c num_cores -v" + description = "cif2fasta.py takes a folder that contains cif files as input and outputs their sequences into fasta file." + parser = OptionParser(usage = usage, description = description) # Call add_options to the parser - parser.add_option("-i", dest="input_files", - help="Input mmCIF folder.", metavar="GLOB") - parser.add_option("-o", dest="output_files", - help="Output fasta file", metavar="FILE") - parser.add_option('-c', dest="cores", type=int, default=1, - help="How many cores should be used?") + parser.add_option("-i", help = "input cif folder.", dest = "input_files", metavar = "DIR") + parser.add_option("-o", help = "output fasta file.", dest = "output_files", metavar = "FILE") + parser.add_option("-p", help = "output PDB filter file (optional).", dest= "pdb_filter", default = False, metavar = "FILE") + parser.add_option("-s", help = "SCOP annotation.", dest = "scop", default = False, metavar = "FILE") + parser.add_option('-c', help = "number of cores (default = 1).", dest = "cores", type = int, default = 1, metavar = "INT") + parser.add_option('-l', help = "Remove chains with a length < X (default = 30).", dest = "seq_len", type = int, default = 30, metavar = "INT") + parser.add_option('-v', help = 'Verbose Mode (quiet = 0, full verbosity = 2).', dest = 'bool', default = 0, type = int, metavar = "INT") + return parser def main(): parser = opt() - # parse the the parser object and save the data into options and args - # opts contains all values received via command line - # argv contains a list of all commited argument (options, argv) = parser.parse_args() + global DEBUG_MODE + + if options.bool: + DEBUG_MODE = options.bool + + global SCOP_LIBRARY + + if options.scop: + SCOP_LIBRARY = parse_scop(options.scop) + + global MIN_SEQ_LEN + + MIN_SEQ_LEN = options.seq_len + paths = get_paths(options.input_files, options.output_files) - print "Found: " + str(len(paths)) + " files." 
+ print ("Found: " + str(len(paths)) + " files.") + + if options.cores > 1: + pool = Pool(options.cores) + fastas = pool.map(wrapper_function, paths) + else: + fastas = map(wrapper_function, paths) - pool = Pool(options.cores) - pool.map(wrapper_function, paths) + write_to_file(fastas, options.output_files, options.pdb_filter) if __name__ == "__main__": main() diff -Nru hhsuite-3.0~beta2+dfsg/scripts/CMakeLists.txt hhsuite-3.0~beta3+dfsg/scripts/CMakeLists.txt --- hhsuite-3.0~beta2+dfsg/scripts/CMakeLists.txt 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/CMakeLists.txt 2017-07-14 10:34:03.000000000 +0000 @@ -24,6 +24,9 @@ ffindex.py a3m.py get_a3m_size.py - is_huge_a3m.py + pdbfilter.py + cif2fasta.py + hhmakemodel.py + hh_reader.py DESTINATION scripts ) diff -Nru hhsuite-3.0~beta2+dfsg/scripts/create_profile_from_hmmer.pl hhsuite-3.0~beta3+dfsg/scripts/create_profile_from_hmmer.pl --- hhsuite-3.0~beta2+dfsg/scripts/create_profile_from_hmmer.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/create_profile_from_hmmer.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl # # create_profile_from_hmmer.pl # Create a profile (.prf) from a given HMMER/HMMER3 file diff -Nru hhsuite-3.0~beta2+dfsg/scripts/fasta_cut.py hhsuite-3.0~beta3+dfsg/scripts/fasta_cut.py --- hhsuite-3.0~beta2+dfsg/scripts/fasta_cut.py 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/fasta_cut.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,108 +0,0 @@ -#!/usr/bin/env python - -from optparse import OptionParser - - -def opt(): - parser = OptionParser() - parser.add_option("-i", dest="input_file", - help="Input fasta file", metavar="FILE") - parser.add_option("-o", dest="output_file", - help="Output fasta file", metavar="FILE") - parser.add_option("--max_length", dest="max_length", type=int, - help="Max length of fasta sequence", metavar="INT") - parser.add_option("--overlap", dest="overlap", type=int, - help="Length of overlap of cutted sequences", metavar="INT") - - parser.set_default(max_length=14999) - parser.set_default(overlap=100) - - return parser - - -def cut_sequence(header, sequence, max_length, overlap): - subsequences = [] - if len(sequence) == 0: - return subsequences - - if len(sequence) <= max_length: - subsequences.append((header, sequence)) - return subsequences - - total_length = len(sequence) - number = 0 - offset = 0 - while offset < total_length: - subsequence = sequence[max(0, offset - overlap) : min(total_length, offset - overlap + max_length)] - offset = offset - overlap + max_length - - number += 1 - new_header = adjust_header(header, number) - - subsequences.append((new_header, subsequence)) - - return subsequences - - -def adjust_header(header, number): - header_tokens = header.split() - name = header_tokens[0] - name = adjust_name(name, number) - header_tokens[0] = name - return " ".join(header_tokens) - - -def adjust_name(name, number): - short_name = name - - if name.find("|") != -1: - name_tokens = name.split("|") - short_name = name_tokens[1] - short_name = short_name+"_"+str(number) - name_tokens[1] = short_name - return "|".join(name_tokens) - else: - return short_name+"_"+str(number) - - -def write_parts(out, parts): - for part in parts: - out.write(part[0]) - out.write("\n") - out.write(part[1]) - out.write("\n") - - -def cut_sequences(input_file, max_length, overlap, output_file): - out = open(output_file, "w") - - with open(input_file, "r") as fh: - sequence = "" - header = "" - - 
for line in fh: - line = line.strip() - if line[0] == ">": - subsequences = cut_sequence(header, sequence, max_length, overlap) - write_parts(out, subsequences) - - sequence = "" - header = line - else: - sequence += line - - subsequences = cut_sequence(header, sequence, max_length, overlap) - write_parts(out, subsequences) - - out.close() - - -def main(): - parser = opt() - (options, argv) = parser.parse_args() - - cut_sequences(options.input_file, options.max_length, options.overlap, options.output_file) - - -if __name__ == "__main__": - main() diff -Nru hhsuite-3.0~beta2+dfsg/scripts/hhmakemodel.pl hhsuite-3.0~beta3+dfsg/scripts/hhmakemodel.pl --- hhsuite-3.0~beta2+dfsg/scripts/hhmakemodel.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/hhmakemodel.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! /usr/bin/perl # # hhmakemodel.pl # Generate a model from an output alignment of hhsearch. @@ -1158,7 +1158,7 @@ # return 1; # no SCOP/DALI/pdb sequence } - &FindPDBfile($pdbcode, $chain); + $pdbfile = &FindPDBfile($pdbcode, $chain); if ($pdbfile eq "") { if ($v>=2) {print("Warning: no pdb file found for sequence name '$name'\n");} diff -Nru hhsuite-3.0~beta2+dfsg/scripts/hhmakemodel.py hhsuite-3.0~beta3+dfsg/scripts/hhmakemodel.py --- hhsuite-3.0~beta2+dfsg/scripts/hhmakemodel.py 1970-01-01 00:00:00.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/hhmakemodel.py 2017-07-14 10:34:03.000000000 +0000 @@ -0,0 +1,2400 @@ +#!/usr/bin/env python + +from hh_reader import read_result +from copy import deepcopy +from pdbx.reader.PdbxReader import PdbxReader +from pdbx.writer.PdbxWriter import PdbxWriter +import re, os, sys, tempfile, glob + +from operator import itemgetter # hzhu +from itertools import groupby # hzhu + +EMPTY = '*' +GAP = '-' +DEBUG_MODE = False + +class Gap: + """ A gap is a continuous stretch of indels. 
+ It is defined by an opening position and a size/length + """ + def __init__(self, open_pos, size): + self.open_pos = open_pos # gap opening position + self.size = size # num of indels in the gap + + def __repr__(self): + return 'Gap opening pos = %d, size = %d' % (self.open_pos, self.size) + +class Grid: + """ + Implementation of 2D grid of cells + Includes boundary handling + """ + + def __init__(self, grid_height, grid_width): + """ + Initializes grid to be empty, takes height and width of grid as parameters + Indexed by rows (left to right), then by columns (top to bottom) + """ + + self._grid_height = grid_height + self._grid_width = grid_width + self._cells = [ [ EMPTY for dummy_col in range(self._grid_width) ] + for dummy_row in range(self._grid_height)] + + def __str__(self): + """ Return multi-line string representation for grid """ + + ans = '' + for row in range(self._grid_height): + ans += ''.join(self._cells[row]) + ans += '\n' + return ans + + def clear(self): + """ Clears grid to be empty """ + + self._cells = [[EMPTY for dummy_col in range(self._grid_width)] + for dummy_row in range(self._grid_height)] + + def get_grid_height(self): + """ Return the height of the grid """ + + return self._grid_height + + def get_grid_width(self): + """ Return the width of the grid """ + + return self._grid_width + + def get_cell(self, row, col): + return self._cells[row][col] + + def get_seq_start(self, row): + """ Returns the start position of the sequence """ + + index = 0 + for pos in self._cells[row]: + if pos != EMPTY: + return index + index += 1 + + return None + + def get_seq_end(self, row): + """ Returns the end position of the sequence """ + + index = 0 + for pos in reversed(self._cells[row]): + if pos != EMPTY: + return self.get_grid_width() - index + index += 1 + + return None + + def get_gaps(self, row): + """ Return the position of gaps in a row """ + + gaps = list() + + index = 0 + for pos in self._cells[row]: + if pos == GAP: + gaps.append(index) + index += 1 + + return gaps + + def get_gaps_ref_gapless(self, row): + """ Return the pos of gaps in a row. + The opening positions of the gaps are wrt. the gapless seq + """ + # get all the indels + indels = self.get_gaps(row) + gaps = [] + # combine continuous indels into a gap + for k,i in groupby( enumerate(indels), lambda x: x[0]-x[1] ): + g = list(map(itemgetter(1), i)) + gaps.append( Gap(g[0], len(g)) ) + + # offset the gap opening positions + for i in range(1, len(gaps)): + # offset by total gap number before + gaps[i].open_pos -= sum([gaps[j].size for j in range(i)]) + + return gaps # a list of Gap instances + + def get_seq_indeces(self, row): + + seq = list() + for pos, res in enumerate(self._cells[row]): + if res != EMPTY and res != GAP: + seq.append(pos) + + return seq + + ## def get_gap_list(self): # hzhu commented this out. wrote a new version + ## """ Returns a list of list of all gap positions in the sequence grid. 
""" + ## gap_pos = set() + + ## for row in range(self.get_grid_height()): + ## for gap in self.get_gaps(row): + ## gap_pos.add(gap) + + ## gap_pos = list(sorted(gap_pos)) + ## boundaries = [ (x + 1) for x, y in zip(gap_pos, gap_pos[1:]) if y - x != 1 ] + + ## gap_list = list() + ## prev = 0 + + ## for boundary in boundaries: + ## sub_list = [ pos for pos in gap_pos[prev:] if pos < boundary ] + ## gap_list.append(sub_list) + ## prev += len(sub_list) + + ## gap_list.append([ x for x in gap_pos[prev:]]) + + ## return gap_list + + def get_gap_list(self): + """ Returns a list of Gap instances for all rows in the grid + """ + gap_dict = dict() # each position should occur as gap at most once + # keys are gap openning positions + # values are Gap instances + gap_list = [] + for row in range(self.get_grid_height()): + gap_pos = [] + gaps = self.get_gaps_ref_gapless(row) + + for g in gaps: + if g.open_pos in gap_dict: # if there is already gaps at this open pos + if g.size > gap_dict[g.open_pos].size: # if new gap is bigger + gap_dict[g.open_pos] = g # keep the larger gap as they overlap + else: + gap_dict[g.open_pos] = g + + gap_list = sorted(list(gap_dict.values()), key=lambda x: x.open_pos) # sort according to start position + return gap_list # a list of Gap instances + + def set_gap(self, row, col): + """ Set cell with index (row, col) to be a gap """ + + self._cells[row][col] = GAP + + def set_empty(self, row, col): + """ Set cell with index (row, col) to be a gap """ + + self._cells[row][col] = EMPTY + + def set_cell(self, row, col, res): + """ Set cell with index (row, col) to be full """ + + self._cells[row][col] = res + + def is_empty(self, row, col): + """ Checks whether cell with index (row, col) is empty """ + + return self._cells[row][col] == EMPTY + + def is_gap(self, row, col): + """ Checks whetehr cell with indxex (row, col) is a gap """ + + return self._cells[row][col] == GAP + + def insert_gaps(self, cols): + """ Inserts a gaps into a column of the template grid """ + + for col in cols: + for row in range(self._grid_height): + if col >= self.get_seq_start(row) and col < self.get_seq_end(row): + self._cells[row].insert(col, GAP) + else: + self._cells[row].insert(col, EMPTY) + + self._grid_width += 1 + + def insert_gaps_row(self, cols, row): + """ Intert gaps into cols only for certain row""" + for col in cols: + if col >= self.get_seq_start(row) and col < self.get_seq_end(row): + self._cells[row].insert(col, GAP) + else: + self._cells[row].insert(col, EMPTY) + # NOTE: grid_with should not be changed after every row is updated. + #self._grid_width += 1 + + def clean_trail_empty(self): + """ Remove all trailing EMPTY and pad grid to same width""" + # first find out the max length (exluding trailing EMPTY) + max_width = 0 + for row in range(self._grid_height): + for i in range(len(self._cells[row])-1, -1, -1): + if self._cells[row][i] != EMPTY: + break + if i+1 > max_width: + max_width = i+1 + + # delete excessive EMPTY + for row in range(self._grid_height): + del self._cells[row][max_width:] + + # then pad all rows to the same length + [self._cells[row].append( EMPTY * (max_width-len(self._cells[row])) ) \ + for row in range(self._grid_height) if len(self._cells[row]) < max_width] + self._grid_width = max_width + return + + def remove_gaps(self, keep_width=True): # hzhu add keep_width option + """ Removes all gaps from the grid. 
""" + + for row in range(self.get_grid_height()): + not_gap = list() + for col in range(self.get_grid_width()): + if not self.is_gap(row, col): + not_gap.append(col) + + self._cells[row] = [ self._cells[row][col] for col in not_gap ] + + if keep_width: # hzhu only pad to original width if desired + for del_pos in range(self._grid_width - len(not_gap)): + self._cells[row].append(EMPTY) + + if not keep_width: # hzhu if width is not kept, make sure width is consistent + self.clean_trail_empty() + + return + + +class QueryGrid(Grid): + + def __init__(self, grid_height, grid_width): + Grid.__init__(self, grid_height, grid_width) + + def get_query_start(self, row): + """ Returns the query start position """ + return self.get_seq_start(row) + 1 + + def get_query_end(self, row): + """ Returns the query end postion """ + + return self.get_seq_end(row) - len(self.get_gaps(row)) + + def get_col_residue(self, col): + """ Tries to find a the query residue in a given column. Used by derive_global_seq() to + identify the global query sequence """ + + for row in range(self.get_grid_height()): + if not self.is_empty(row, col): + return self._cells[row][col] + + return GAP + +class TemplateGrid(Grid): + + def __init__(self, grid_height, grid_width): + Grid.__init__(self, grid_height, grid_width) + + self._start = list() + self._end = list() + self._pdb_code = list() + self._chain = list() + self._organism = list() + self._resolution = list() + + def display(self): + """ Return multi-line string represenation for grid """ + + ans = '' + for row in range(self._grid_height): + ans += '>P1;{p}\nstructure:{p}:{s}:{c}:{e}:{c}::{o}:{r}:\n{a}*\n'.format( + p = self._pdb_code[row], + s = add_white_space_end(self.get_template_start(row), 4), + e = add_white_space_end(self.get_template_end(row), 4), + c = self._chain[row], + o = self._organism[row], + r = self._resolution[row], + a = ''.join(self._cells[row]).replace(EMPTY, GAP).replace('#', GAP)) + + return ans + + def debug(self, row): + """ Return multi-line string represenation for grid, for debugging purposes """ + + ans = '{p}\nInternal: {s}, {e} Query: {qs}, {qe} Gaps ({g1}): {g2}\n{seq}\n'.format( + p = self._pdb_code[row], + s = self.get_seq_start(row), + e = self.get_seq_end(row), + qs = self.get_template_start(row), + qe = self.get_template_end(row), + g1 = len(self.get_gaps(row)), + g2 = ', '.join([str(gap) for gap in self.get_gaps(row)]), + seq = ''.join(self._cells[row])) + + return ans + + def set_metadata(self, row, start, end, pdb_code, chain, organism, resolution): + """ Used by create_template_grid() to setup metadata of pir template """ + + self._start.append(start) + self._end.append(end) + self._pdb_code.append(pdb_code) + self._chain.append(chain) + self._organism.append(organism) + self._resolution.append(resolution) + + def set_map(self, row, start, end): + + self._start[row] = start + self._end[row] = end + + def get_template_start(self, row): + """ Returns the template start position """ + + return self._start[row] + + def get_template_end(self, row): + """ Return sthe template end position """ + + return self._end[row] + + def del_row(self, row): + """ Removes a complete template entry from the grid """ + + del self._cells[row] + del self._start[row] + del self._end[row] + del self._pdb_code[row] + del self._chain[row] + del self._organism[row] + del self._resolution[row] + self._grid_height -= 1 + +# Helper functions + +def add_white_space_end(string, length): + """ Adds whitespaces to a string until it has the wished length""" + + 
edited_string = str(string) + + if len(edited_string) >= length: + return string + else: + while len(edited_string) != length: + edited_string += ' ' + + return edited_string + +def convert_aa_code(three_letter, convert): + """ + Assumes a string that contains a three letter aminoacid code and + returns the corresponding one letter code. + """ + + aa_code = { + 'CYS': 'C', + 'ASP': 'D', + 'SER': 'S', + 'GLN': 'Q', + 'LYS': 'K', + 'ILE': 'I', + 'PRO': 'P', + 'THR': 'T', + 'PHE': 'F', + 'ASN': 'N', + 'GLY': 'G', + 'HIS': 'H', + 'LEU': 'L', + 'ARG': 'R', + 'TRP': 'W', + 'ALA': 'A', + 'VAL': 'V', + 'GLU': 'E', + 'TYR': 'Y', + 'MET': 'M', + } + + non_canonical = { + 'MSE': 1, + 'HYP': 2, + 'MLY': 3, + 'SEP': 4, + 'TPO': 5, + 'CSO': 6, + 'PTR': 7, + 'KCX': 8, + 'CME': 9, + 'CSD': 10, + 'CAS': 11, + 'MLE': 12, + 'DAL': 13, + 'CGU': 14, + 'DLE': 15, + 'FME': 16, + 'DVA': 17, + 'OCS': 18, + 'DPR': 19, + 'MVA': 20, + 'TYS': 21, + 'M3L': 22, + 'SMC': 23, + 'ALY': 24, + 'CSX': 25, + 'DCY': 26, + 'NLE': 27, + 'DGL': 28, + 'DSN': 29, + 'CSS': 30, + 'DLY': 31, + 'MLZ': 32, + 'DPN': 33, + 'DAR': 34, + 'PHI': 35, + 'IAS': 36, + 'DAS': 37, + 'HIC': 38, + 'MP8': 39, + 'DTH': 40, + 'DIL': 41, + 'MEN': 42, + 'DTY': 43, + 'CXM': 44, + 'DGN': 45, + 'DTR': 46, + 'SAC': 47, + 'DSG': 48, + 'MME': 49, + 'MAA': 50, + 'YOF': 51, + 'FP9': 52, + 'FVA': 53, + 'MLU': 54, + 'OMY': 55, + 'FGA': 56, + 'MEA': 57, + 'CMH': 58, + 'DHI': 59, + 'SEC': 60, + 'OMZ': 61, + 'SCY': 62, + 'MHO': 63, + 'MED': 64, + 'CAF': 65, + 'NIY': 66, + 'OAS': 67, + 'SCH': 68, + 'MK8': 69, + 'SME': 70, + 'LYZ': 71 + } + + if three_letter in aa_code.keys(): + return aa_code[three_letter] + elif convert and (three_letter in non_canonical.keys()): + return non_canonical[three_letter] + else: + return '-' + + +def get_query_name(hhr_file): + + with open(hhr_file) as fh: + for line in fh: + if line.startswith('Query'): + # match the PDB Code + m = re.search('(\d[A-Z0-9]{3})_(\S)', line) + + if m: + pdb_code = m.group(1) + chain = m.group(2) + else: + pdb_code = 'UKNP' + chain = 'A' + # raise ValueError('Input HHR-File Does not seem to be a PDB-Structure') + + break + + return pdb_code, chain + +def get_cif_files(folder): + """ Gets all cif files located in folder. 
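+        (This helper appears to be unused; compare_with_cifs() calls
+        glob.glob() on the folder directly.)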
""" + + return glob(os.path.join(folder, '*.cif')) + +def open_cif(cif_file): + """ Assumes a mmCif file and returns a data block used for subsequent procedures """ + # The "usual" procedure to open a mmCIF with pdbX/mmCIF + with open(cif_file) as cif_fh: + data = [] + reader = PdbxReader(cif_fh) + reader.read(data) + + block = data[0] + + return block + +def get_pdb_entry_id(block): + """ Extracts the PDB entry information of a cif file and returns it as a string """ + + entry = block.getObj('entry') + entry_id = entry.getValue('id') + + return entry_id + + +def template_id_to_pdb(template_id): + """ + Extracts PDB ID and chain name from the provided template id + """ + # match PDBID without chain (8fab, 1a01) + m = re.match(r'/^(\d[A-Za-z0-9]{3})$', template_id) + if m: + return m.group(1).upper(), 'A' + + # PDB CODE with chain Identifier + m = re.match(r'^(\d[A-Za-z0-9]{3})_(\S)$', template_id) + if m: + return m.group(1).upper(), m.group(2).upper() + + # Match DALI ID + m = re.match(r'^(\d[A-Za-z0-9]{3})([A-Za-z0-9]?)_\d+$', template_id) + if m: + return m.group(1).upper(), m.group(2).upper() + + # No PDB code and chain identified + return None, None + + +def create_template_grid(hhr_data): + """ Creates a template grid """ + + total_seq = len(hhr_data) + templ_max = max( [ hhr.start[0] + len(to_seq(hhr.template_ali)) for hhr in hhr_data ] ) - 1 + + + template_grid = TemplateGrid(total_seq, templ_max) + + for row, template in enumerate(hhr_data): + seq_start = template.start[0] - 1 + templatealignment = to_seq(template.template_ali) + seq_end = seq_start + len(templatealignment) + + # Load Meta Data + start = template.start[1] + end = template.end[1] + + # Get pdb_code and chain identifier of template + pdb_code, chain = template_id_to_pdb(template.template_id) + + m = re.search("(\d+.\d+)A", template.template_info) # try to extract resolution of the structure + + if m: + resolution = m.group(1) + else: + resolution = "" + + m = re.search("\{(.*)\}", template.template_info) # try to extract the organism + if m: + organism = m.group(1).replace(":", " ") # make sure that no colons are in the organism + else: + organism = "" + + template_grid.set_metadata(row, start, end, pdb_code, chain, organism, resolution) + + # Write sequence into the grid + for pos, col in enumerate(range(seq_start, seq_end)): + template_grid.set_cell(row, col, templatealignment[pos]) + + return template_grid + + +def to_seq(ali): + if isinstance(ali, list): + return ''.join(ali) + else: + return ali + + +def create_query_grid(hhr_data): + """ Creates a Query Grid """ + + total_seq = len(hhr_data) + query_max = max( [ hhr.start[0] + len(to_seq(hhr.query_ali)) for hhr in hhr_data ] ) - 1 + + query_grid = QueryGrid(total_seq, query_max) + + for row, query in enumerate(hhr_data): + + queryalignment = to_seq(query.query_ali) + query_start = query.start[0] - 1 + query_end = query_start + len(queryalignment) + + for pos, col in enumerate(range(query_start, query_end)): + if queryalignment[pos] not in ['Z', 'U', 'O', 'J', 'X', 'B']: # CAUTION + + query_grid.set_cell(row, col, queryalignment[pos]) + + return query_grid + +def create_gapless_grid(grid): + """ Returns a gapless grid """ + + gapless = deepcopy(grid) + gapless.remove_gaps(keep_width=False) # hzhu: shrink grid + + return gapless + +def process_query_grid(query_grid, gapless_grid): + """ Processes a query grid sucht that it contains all gaps + """ + gaplist = query_grid.get_gap_list() + off_set = 0 + + for g in gaplist: + gapless_grid.insert_gaps([ p + off_set 
for p in range(g.open_pos, g.open_pos+g.size) ]) + off_set += g.size + + return gapless_grid + +def derive_global_seq(processed_query_grid, query_name, query_chain): + + global_seq = list() + + for col in range(processed_query_grid.get_grid_width()): + global_seq.append(processed_query_grid.get_col_residue(col)) + + # this is the query entry + header = '>P1;{q}\nsequence:{q}:1 :{c}:{l} :{c}::::\n'.format( + q = query_name, + l = len(global_seq), + c = query_chain) + + return header + ''.join(global_seq) + '*' + +def process_template_grid(query_grid, template_grid): + """ Insertes Gaps into the template grid + Only add gaps from **other** query_grids into template grid (NOT gapless) + """ + gaplist = query_grid.get_gap_list() # use this to keep the offset + + for row in range(template_grid.get_grid_height()): + # do NOT consider gaps in current query row + gaplist_row = query_grid.get_gaps_ref_gapless(row) + gapdict_row = dict(zip([g.open_pos for g in gaplist_row], + [g.size for g in gaplist_row])) + off_set = 0 + for g in gaplist: + # if there is a gap with same opening position in the current row, + # only consider g if it is larger than the on in the current row + if g.open_pos in gapdict_row: + if g.size > gapdict_row[g.open_pos]: + template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos, + g.open_pos+g.size-gapdict_row[g.open_pos]) ], row) + else: + template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos, g.open_pos+g.size) ], row) + + off_set += g.size # even if the gaps are not inserted, the offset should be adjusted + + template_grid.clean_trail_empty() # clean the redundant trailing EMPTY char + + return template_grid + +def compare_with_cifs(template_grid, folder, output_path, convert, threshold): + """ + Compare the PIR Alignment with Atomsection of a mmCIF file. To make the ATOM-Section of + a mmCIF file compatible with MODELLER, each residue has in the ATOM-Section has to match + corresponding positions in the PIR-Alignment + """ + + # glob the mmCif files from given directory and map the PDB identifier to the path + cif_files = glob.glob(os.path.join(folder, '*.cif')) + cif_paths = { path.split('/')[-1].split('.')[0].upper() : path for path in cif_files } + cif_edits = dict() + + + # create the path where renumbered cifs are saved to + if not os.path.exists(output_path): + os.mkdir(output_path) + + # if the cif does not contain any residue of the por alignment we delete it + del_row = list() + + for row in range(template_grid.get_grid_height()): + # get the pdb code and strand id from the current template + pdb_code = template_grid._pdb_code[row] + chain = template_grid._chain[row] # hhr users pdb chain ID + + + # load mmCif file accordingly + if pdb_code in cif_edits.keys(): + block = cif_edits[pdb_code] + else: + try: + block = open_cif(cif_paths[pdb_code]) + except KeyError: + del_row.append(row) + print ('! Did not find the mmCIF file for {pdb}. 
Removing it from the alignment.'.format( + pdb = pdb_code)) + continue + + # Create a mapping of the atom site + atom_site = block.getObj('atom_site') + + ######################################################################## + ## Get the mapping of the residues in the atom section ## + ######################################################################## + + cif_seq = dict() + # For the case that we have to rename a chain + cif_chains = set([]) + + # Iterate through the atomsection of the cif file + for atom_row in range(0, atom_site.getRowCount()): + + try: + if atom_site.getValue('label_comp_id', atom_row) == 'HOH': + continue + cif_chain = atom_site.getValue('label_asym_id', atom_row) + pdb_chain = atom_site.getValue('auth_asym_id', atom_row) # use PDB chain ID + except IndexError: + pass + + cif_chains.add(cif_chain) + + # We do not care about the residues apart from the chain + #if cif_chain != chain: # hzhu + if pdb_chain != chain: # hhr uses PDB chain, not the cif chain! hzhu + continue + # and update the chain id from pdb_chain to cif_chain + if atom_site.getValue('group_PDB', atom_row).startswith('ATOM'): # hzhu in case HETATM ruins ch id + template_grid._chain[row] = cif_chain + + # get the residue and the residue number + try: + res_num = int(atom_site.getValue("label_seq_id", atom_row)) + except ValueError: + continue + + residue = atom_site.getValue('label_comp_id', atom_row) + residue = convert_aa_code(residue, convert) + + + if res_num not in cif_seq.keys(): + cif_seq[res_num] = residue + elif res_num in cif_seq.keys() and cif_seq[res_num] == residue: + continue + elif res_num in cif_seq.keys() and cif_seq[res_num] != residue: + cif_seq[res_num] = '-' + + if DEBUG_MODE: + print ('! {p} {c}: mmCIF contains a residue position that is assigned {cr} to two residues. Removing it.'.format( + p = pdb_code, + c = chain, + cr = res_num)) + + ######################################################################## + ## Rename chain if necessary ## + ######################################################################## + + chain_idx = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + + if len(template_grid._chain[row]) != 1: + i = 0 + new_chain = 0 + + while i < len(chain_idx): + if chain_idx[i] in cif_chains: + if DEBUG_MODE: + print ('! {p} {c}: Chain identifier {i} is already taken.'.format( + p = pdb_code, + c = chain, + i = chain_idx[i])) + i += 1 + else: + new_chain = chain_idx[i] + break + + if new_chain == 0: + if DEBUG_MODE: + print ('! {p} {c}: Could not use {p}. 
The chain identifier {c} is not compatible with MODELLER (2 letters) and could not be renanmed.'.format( + p = pdb_code, + c = chain)) + + del_row.append(row) + continue + + if new_chain != 0: + print ('Selected new chain name {c}'.format(c = new_chain)) + + #TODO + + ######################################################################## + ## Compare cif positions with the atom positions ## + ######################################################################## + + del_pos = list() + mod_pos = dict() + mapping = dict() + + for pos_cif, pos_tem in zip(range(template_grid.get_template_start(row), + template_grid.get_template_end(row) + 1), template_grid.get_seq_indeces(row)): + + res_tem = template_grid.get_cell(row, pos_tem) + + try: + res_cif = cif_seq[pos_cif] + except KeyError: + res_cif = -1 + + + match = True if res_tem == res_cif else False + + if not match: + if res_cif == 1 and res_tem == 'M': + mod_pos[pos_cif] = 1 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 2 and res_tem == 'P': + + mod_pos[pos_cif] = 2 + mapping[(pos_tem, res_tem)] = (pos_cif, 'P') + + elif res_cif == 3 and res_tem == 'K': + + mod_pos[pos_cif] = 3 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 4 and res_tem == 'S': + + mod_pos[pos_cif] = 4 + mapping[(pos_tem, res_tem)] = (pos_cif, 'S') + + elif res_cif == 5 and res_tem == 'T': + + mod_pos[pos_cif] = 5 + mapping[(pos_tem, res_tem)] = (pos_cif, 'T') + + elif res_cif == 6 and res_tem == 'C': + + mod_pos[pos_cif] = 6 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 7 and res_tem == 'Y': + + mod_pos[pos_cif] = 7 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 8 and res_tem == 'K': + + mod_pos[pos_cif] = 8 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 9 and res_tem == 'C': + + mod_pos[pos_cif] = 9 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 10 and res_tem == 'A': + + mod_pos[pos_cif] = 10 + mapping[(pos_tem, res_tem)] = (pos_cif, 'A') + + elif res_cif == 11 and res_tem == 'C': + + mod_pos[pos_cif] = 11 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 12 and res_tem == 'L': + + mod_pos[pos_cif] = 12 + mapping[(pos_tem, res_tem)] = (pos_cif, 'L') + + elif res_cif == 13 and res_tem == 'A': + + mod_pos[pos_cif] = 13 + mapping[(pos_tem, res_tem)] = (pos_cif, 'A') + + elif res_cif == 14 and res_tem == 'E': + + mod_pos[pos_cif] = 14 + mapping[(pos_tem, res_tem)] = (pos_cif, 'E') + + elif res_cif == 15 and res_tem == 'L': + + mod_pos[pos_cif] = 15 + mapping[(pos_tem, res_tem)] = (pos_cif, 'L') + + elif res_cif == 16 and res_tem == 'M': + + mod_pos[pos_cif] = 16 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 17 and res_tem == 'V': + + mod_pos[pos_cif] = 17 + mapping[(pos_tem, res_tem)] = (pos_cif, 'V') + + elif res_cif == 18 and res_tem == 'C': + + mod_pos[pos_cif] = 18 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 19 and res_tem == 'P': + + mod_pos[pos_cif] = 19 + mapping[(pos_tem, res_tem)] = (pos_cif, 'P') + + elif res_cif == 20 and res_tem == 'V': + + mod_pos[pos_cif] = 20 + mapping[(pos_tem, res_tem)] = (pos_cif, 'V') + + elif res_cif == 21 and res_tem == 'Y': + + mod_pos[pos_cif] = 21 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 22 and res_tem == 'K': + + mod_pos[pos_cif] = 22 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 23 and res_tem == 'C': + + mod_pos[pos_cif] = 23 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 24 and 
res_tem == 'K': + + mod_pos[pos_cif] = 24 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 25 and res_tem == 'C': + + mod_pos[pos_cif] = 25 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 26 and res_tem == 'C': + + mod_pos[pos_cif] = 26 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 27 and res_tem == 'L': + + mod_pos[pos_cif] = 27 + mapping[(pos_tem, res_tem)] = (pos_cif, 'L') + + elif res_cif == 28 and res_tem == 'E': + + mod_pos[pos_cif] = 28 + mapping[(pos_tem, res_tem)] = (pos_cif, 'E') + + elif res_cif == 29 and res_tem == 'S': + + mod_pos[pos_cif] = 29 + mapping[(pos_tem, res_tem)] = (pos_cif, 'S') + + elif res_cif == 30 and res_tem == 'C': + + mod_pos[pos_cif] = 30 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 31 and res_tem == 'K': + + mod_pos[pos_cif] = 31 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 32 and res_tem == 'K': + + mod_pos[pos_cif] = 32 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + elif res_cif == 33 and res_tem == 'F': + + mod_pos[pos_cif] = 33 + mapping[(pos_tem, res_tem)] = (pos_cif, 'F') + + elif res_cif == 34 and res_tem == 'R': + + mod_pos[pos_cif] = 34 + mapping[(pos_tem, res_tem)] = (pos_cif, 'R') + + elif res_cif == 35 and res_tem == 'F': + + mod_pos[pos_cif] = 35 + mapping[(pos_tem, res_tem)] = (pos_cif, 'F') + + elif res_cif == 36 and res_tem == 'D': + + mod_pos[pos_cif] = 36 + mapping[(pos_tem, res_tem)] = (pos_cif, 'D') + + elif res_cif == 37 and res_tem == 'D': + + mod_pos[pos_cif] = 37 + mapping[(pos_tem, res_tem)] = (pos_cif, 'D') + + elif res_cif == 38 and res_tem == 'H': + + mod_pos[pos_cif] = 38 + mapping[(pos_tem, res_tem)] = (pos_cif, 'H') + + elif res_cif == 39 and res_tem == 'P': + + mod_pos[pos_cif] = 39 + mapping[(pos_tem, res_tem)] = (pos_cif, 'P') + + elif res_cif == 40 and res_tem == 'T': + + mod_pos[pos_cif] = 40 + mapping[(pos_tem, res_tem)] = (pos_cif, 'T') + + elif res_cif == 41 and res_tem == 'I': + + mod_pos[pos_cif] = 41 + mapping[(pos_tem, res_tem)] = (pos_cif, 'I') + + elif res_cif == 42 and res_tem == 'N': + + mod_pos[pos_cif] = 42 + mapping[(pos_tem, res_tem)] = (pos_cif, 'N') + + elif res_cif == 43 and res_tem == 'Y': + + mod_pos[pos_cif] = 43 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 44 and res_tem == 'M': + + mod_pos[pos_cif] = 44 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 45 and res_tem == 'G': + + mod_pos[pos_cif] = 45 + mapping[(pos_tem, res_tem)] = (pos_cif, 'G') + + elif res_cif == 46 and res_tem == 'W': + + mod_pos[pos_cif] = 46 + mapping[(pos_tem, res_tem)] = (pos_cif, 'W') + + elif res_cif == 47 and res_tem == 'S': + + mod_pos[pos_cif] = 47 + mapping[(pos_tem, res_tem)] = (pos_cif, 'S') + + elif res_cif == 48 and res_tem == 'N': + + mod_pos[pos_cif] = 48 + mapping[(pos_tem, res_tem)] = (pos_cif, 'N') + + elif res_cif == 49 and res_tem == 'M': + + mod_pos[pos_cif] = 49 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 50 and res_tem == 'A': + + mod_pos[pos_cif] = 50 + mapping[(pos_tem, res_tem)] = (pos_cif, 'A') + + elif res_cif == 51 and res_tem == 'Y': + + mod_pos[pos_cif] = 51 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 52 and res_tem == 'P': + + mod_pos[pos_cif] = 52 + mapping[(pos_tem, res_tem)] = (pos_cif, 'P') + + elif res_cif == 53 and res_tem == 'V': + + mod_pos[pos_cif] = 53 + mapping[(pos_tem, res_tem)] = (pos_cif, 'V') + + elif res_cif == 54 and res_tem == 'L': + + mod_pos[pos_cif] = 54 + mapping[(pos_tem, res_tem)] = 
(pos_cif, 'L') + + elif res_cif == 55 and res_tem == 'Y': + + mod_pos[pos_cif] = 55 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 56 and res_tem == 'E': + + mod_pos[pos_cif] = 56 + mapping[(pos_tem, res_tem)] = (pos_cif, 'E') + + elif res_cif == 57 and res_tem == 'F': + + mod_pos[pos_cif] = 57 + mapping[(pos_tem, res_tem)] = (pos_cif, 'F') + + elif res_cif == 58 and res_tem == 'C': + + mod_pos[pos_cif] = 58 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 59 and res_tem == 'H': + + mod_pos[pos_cif] = 59 + mapping[(pos_tem, res_tem)] = (pos_cif, 'H') + + elif res_cif == 60 and res_tem == 'C': + + mod_pos[pos_cif] = 60 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 61 and res_tem == 'Y': + + mod_pos[pos_cif] = 61 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 62 and res_tem == 'C': + + mod_pos[pos_cif] = 62 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 63 and res_tem == 'M': + + mod_pos[pos_cif] = 63 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 64 and res_tem == 'M': + + mod_pos[pos_cif] = 64 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 65 and res_tem == 'C': + + mod_pos[pos_cif] = 65 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 66 and res_tem == 'Y': + + mod_pos[pos_cif] = 66 + mapping[(pos_tem, res_tem)] = (pos_cif, 'Y') + + elif res_cif == 67 and res_tem == 'S': + + mod_pos[pos_cif] = 67 + mapping[(pos_tem, res_tem)] = (pos_cif, 'S') + + elif res_cif == 68 and res_tem == 'C': + + mod_pos[pos_cif] = 68 + mapping[(pos_tem, res_tem)] = (pos_cif, 'C') + + elif res_cif == 69 and res_tem == 'L': + + mod_pos[pos_cif] = 69 + mapping[(pos_tem, res_tem)] = (pos_cif, 'L') + + elif res_cif == 70 and res_tem == 'M': + + mod_pos[pos_cif] = 70 + mapping[(pos_tem, res_tem)] = (pos_cif, 'M') + + elif res_cif == 71 and res_tem == 'K': + + mod_pos[pos_cif] = 71 + mapping[(pos_tem, res_tem)] = (pos_cif, 'K') + + else: + # insert a gap + template_grid.set_empty(row, pos_tem) + mapping[(pos_tem, res_tem)] = (pos_cif, res_cif) + + if DEBUG_MODE: + print ('! {p} {c}: template pos {pt} ({rt}) does not match cif pos {pc} ({rc}). Replacing with gap.'.format( + p = pdb_code, + c = chain, + pt = pos_tem, + rt = res_tem, + pc = pos_cif, + rc = res_cif if res_cif != -1 else 'DNE')) + + if res_cif != -1: + del_pos.append(pos_cif) + else: + mapping[(pos_tem, res_tem)] = (pos_cif, res_cif) + + + # adjust template start and end positions + correct_mapping = { key:value for key, value in mapping.items() if key[1] == value[1] } + + try: + tstart = correct_mapping[sorted(correct_mapping.keys())[0]][0] + tend = correct_mapping[sorted(correct_mapping.keys())[-1]][0] + template_grid.set_map(row, tstart, tend) + except IndexError: + # This exception handles cases in which all residues were deleted + if DEBUG_MODE: + print ('! {p} {c}: Removing {p} from alignment. 
No residues matched the alignment sequence.'.format( + p = pdb_code, + c = chain)) + + del_row.append(row) + continue + + ######################################################################## + ## Delete rows from the PIR Alignment if the residue ratio is to low ## + ######################################################################## + + if threshold > 0: + + gaps = 0 + res = 0 + + for col in range(template_grid.get_grid_width()): + if template_grid.is_empty(row, col): + template_grid.set_gap(row, col) + + if template_grid.is_gap(row, col): + gaps += 1 + else: + res += 1 + + ratio = res/float(gaps + res) + + if ratio > threshold: + print ('! Template {p} successfully passed residue ratio ({r:.2f} / {t}).'.format( + p = pdb_code, + r = ratio, + t = threshold )) + else: + print ('! Template {p} did not passed residue ratio ({r:.2f} / {t}). Removing it from pir Alignment.'.format( + p = pdb_code, + r = ratio, + t = threshold )) + + if row not in del_row: + del_row.append(row) + continue + + ######################################################################## + ## Edit cif files ## + ######################################################################## + + rem_row = list() # verbosity: saves information about removed residues + mod_row = list() # verbosity: saves information about modified residues + cha_row = list() # verbosity: saves any other changes + + for atom_row in reversed(range(0, atom_site.getRowCount())): + + try: + cif_chain = atom_site.getValue('label_asym_id', atom_row) + except IndexError: + pass + + # We do not care about the residues apart from the chain + if cif_chain != chain: + continue + + # get the residue number + try: + res_num = int(atom_site.getValue("label_seq_id", atom_row)) + except ValueError: + continue + + # pdb_PDB_model_num has to be set to 1 + try: + model_num = int(atom_site.getValue('pdbx_PDB_model_num', atom_row)) + except IndexError: + model_num = 1 # if we cannot extract, assume that it is alright + + try: + ins_code = atom_site.getValue('pdbx_PDB_ins_code', atom_row) + except IndexError: + ins_code = '?' 
# assume it has no insertion code + + group_PDB = atom_site.getValue('group_PDB', atom_row) + residue = atom_site.getValue('label_comp_id', atom_row) + residue = convert_aa_code(residue, convert) + + # MODELLER accepts only structures if pdbx_PDB_model_num is set to 1 + if model_num != 1: + + if (res_num, residue, 'model_num') not in cha_row: + cha_row.append((res_num, residue, 'model_num')) + + atom_site.setValue(1, "pdbx_PDB_model_num", atom_row) + + if ins_code != '?': + + if (res_num, residue, 'ins_code') not in cha_row: + cha_row.append((res_num, residue, 'ins_code')) + + atom_site.setValue('?', "pdbx_PDB_ins_code", atom_row) + + if group_PDB != 'ATOM': + + if (res_num, residue, 'group_PDB') not in cha_row: + cha_row.append((res_num, residue, 'group_PDB')) + + atom_site.setValue('ATOM', 'group_PDB', atom_row) + + ######################################################################## + ## Delete residues ## + ######################################################################## + + if res_num in del_pos: + if (res_num, residue) not in rem_row: + rem_row.append((res_num, residue)) + + atom_site.removeRow(atom_row) + + ######################################################################## + ## Modify residues ## + ######################################################################## + + if res_num in mod_pos.keys(): + + # Get the data + type_symbol = atom_site.getValue('type_symbol', atom_row) + label_atom_id = atom_site.getValue('label_atom_id', atom_row) + auth_atom_id = atom_site.getValue('auth_atom_id', atom_row) + + if mod_pos[res_num] == 1: # try to convert MSE to M + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if type_symbol == 'SE': + atom_site.setValue('S', 'type_symbol', atom_row) + if label_atom_id == 'SE': + atom_site.setValue('S', 'label_atom_id', atom_row) + if auth_atom_id == 'SE': + atom_site.setValue('S', 'auth_atom_id', atom_row) + + if (res_num, residue, 'MSE -> MET') not in mod_row: + mod_row.append((res_num, residue, 'MSE -> MET')) + + elif mod_pos[res_num] == 2: # try to convert HYP to PRO + # apparently it is enough to rename the label_comp_id to PRO to get + # MODELLER working with Hydroxyprolines (HYP) + + atom_site.setValue('PRO', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PRO', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'HYP -> PRO') not in mod_row: + mod_row.append((res_num, residue, 'HYP -> PRO')) + + elif mod_pos[res_num] == 3: # try to convert MLY to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MLY -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'MLY -> LYS')) + + elif mod_pos[res_num] == 4: # converts Phosphoserine to Serine + + atom_site.setValue('SER', 'label_comp_id', atom_row) + + try: + atom_site.setValue('SER', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SEP -> SER') not in mod_row: + mod_row.append((res_num, residue, 'SEP -> SER')) + + elif mod_pos[res_num] == 5: # converts Phosphothreonine to Threonine + + atom_site.setValue('THR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('THR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'TPO -> THR') not in mod_row: + mod_row.append((res_num, residue, 'TPO -> THR')) + + elif mod_pos[res_num] == 6: # converts 
S-HYDROXYCYSTEINE to Cysteine + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CSO -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CSO -> CYS')) + + elif mod_pos[res_num] == 7: # converts O-PHOSPHOTYROSINE to Tyrosine + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'PTR -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'PTR -> TYR')) + + elif mod_pos[res_num] == 8: # converts LYSINE NZ-CARBOXYLIC ACID to Lysine + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'KCX -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'KCX -> LYS')) + + elif mod_pos[res_num] == 9: # converts S,S-(2-HYDROXYETHYL)THIOCYSTEINE to Cysteine + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CME -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CME -> CYS')) + + elif mod_pos[res_num] == 10: # converts 3-SULFINOALANINE to Alanine + + atom_site.setValue('ALA', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ALA', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CSD -> ALA') not in mod_row: + mod_row.append((res_num, residue, 'CSD -> ALA')) + + elif mod_pos[res_num] == 11: # converts S-(DIMETHYLARSENIC)CYSTEINE to Cysteine + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CAS -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CAS -> CYS')) + + elif mod_pos[res_num] == 12: # converts N-METHYLLEUCINE (MLE) to Leucine + + atom_site.setValue('LEU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LEU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MLE -> LEU') not in mod_row: + mod_row.append((res_num, residue, 'MLE -> LEU')) + + elif mod_pos[res_num] == 13: # converts D-ALANINE (DAL) to ALA + + atom_site.setValue('ALA', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ALA', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DAL -> ALA') not in mod_row: + mod_row.append((res_num, residue, 'DAL -> ALA')) + + elif mod_pos[res_num] == 14: # converts GAMMA-CARBOXY-GLUTAMIC ACID (CGU) to GLU + + atom_site.setValue('GLU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('GLU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CGU -> GLU') not in mod_row: + mod_row.append((res_num, residue, 'CGU -> GLU')) + + elif mod_pos[res_num] == 15: # converts D-LEUCINE (DLE) to LEU + + atom_site.setValue('LEU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LEU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DLE -> LEU') not in mod_row: + mod_row.append((res_num, residue, 'DLE -> LEU')) + + elif mod_pos[res_num] == 16: # converts N-FORMYLMETHIONINE (FME) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'FME -> MET') not in mod_row: + 
mod_row.append((res_num, residue, 'FME -> MET')) + + elif mod_pos[res_num] == 17: # converts D-VAL (DVA) to VAL + + atom_site.setValue('VAL', 'label_comp_id', atom_row) + + try: + atom_site.setValue('VAL', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DVA -> VAL') not in mod_row: + mod_row.append((res_num, residue, 'DVA -> VAL')) + + elif mod_pos[res_num] == 18: # converts CYSTEINESULFONIC ACID (OCS) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'OCS -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'OCS -> CYS')) + + elif mod_pos[res_num] == 19: # converts D-PROLINE (DPR) to PRO + + atom_site.setValue('PRO', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PRO', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DPR -> PRO') not in mod_row: + mod_row.append((res_num, residue, 'DPR -> PRO')) + + elif mod_pos[res_num] == 20: # converts N-METHYLVALINE (MVA) to VAL + + atom_site.setValue('VAL', 'label_comp_id', atom_row) + + try: + atom_site.setValue('VAL', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MVA -> VAL') not in mod_row: + mod_row.append((res_num, residue, 'MVA -> VAL')) + + elif mod_pos[res_num] == 21: # converts O-SULFO-L-TYROSINE (TYS) to VAL + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'TYS -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'TYS -> TYR')) + + elif mod_pos[res_num] == 22: # converts N-TRIMETHYLLYSINE (M3L) to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'M3L -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'M3L -> LYS')) + + elif mod_pos[res_num] == 23: # converts S-METHYLCYSTEINE (SMC) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SMC -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'SMC -> CYS')) + + elif mod_pos[res_num] == 24: # converts N(6)-ACETYLLYSINE (ALY) to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'ALY -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'ALY -> LYS')) + + elif mod_pos[res_num] == 25: # converts S-OXY CYSTEINE (CSX) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CSX -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CSX -> CYS')) + + elif mod_pos[res_num] == 26: # converts D-CYSTEINE (DCY) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DCY -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'DCY -> CYS')) + + elif mod_pos[res_num] == 27: # converts NORLEUCINE (NLE) to LEU + + atom_site.setValue('LEU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LEU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, 
residue, 'NLE -> LEU') not in mod_row: + mod_row.append((res_num, residue, 'NLE -> LEU')) + + elif mod_pos[res_num] == 28: # converts D-GLUTAMIC ACID (DGL) to GLU + + atom_site.setValue('GLU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('GLU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DGL -> GLU') not in mod_row: + mod_row.append((res_num, residue, 'DGL -> GLU')) + + elif mod_pos[res_num] == 29: # converts D-SERINE (DSN) to SER + + atom_site.setValue('SER', 'label_comp_id', atom_row) + + try: + atom_site.setValue('SER', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DSN -> SER') not in mod_row: + mod_row.append((res_num, residue, 'DSN -> SER')) + + elif mod_pos[res_num] == 30: # converts S-MERCAPTOCYSTEINE (CSS) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CSS -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CSS -> CYS')) + + elif mod_pos[res_num] == 31: # converts D-LYSINE (DLY) to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DLY -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'DLY -> LYS')) + + elif mod_pos[res_num] == 32: # converts N-METHYL-LYSINE (MLZ) to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MLZ -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'MLZ -> LYS')) + + elif mod_pos[res_num] == 33: # converts D-PHENYLALANINE (DPN) to PHE + + atom_site.setValue('PHE', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PHE', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DPN -> PHE') not in mod_row: + mod_row.append((res_num, residue, 'DPN -> PHE')) + + elif mod_pos[res_num] == 34: # converts D-ARGININE (DAR) to ARG + + atom_site.setValue('ARG', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ARG', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DAR -> ARG') not in mod_row: + mod_row.append((res_num, residue, 'DAR -> ARG')) + + elif mod_pos[res_num] == 35: # converts IODO-PHENYLALANINE (PHI) to PHE + + atom_site.setValue('PHE', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PHE', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'PHI -> PHE') not in mod_row: + mod_row.append((res_num, residue, 'PHI -> PHE')) + + elif mod_pos[res_num] == 36: # converts BETA-L-ASPARTIC ACID (IAS) to ASP + + atom_site.setValue('ASP', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ASP', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'IAS -> ASP') not in mod_row: + mod_row.append((res_num, residue, 'IAS -> ASP')) + + elif mod_pos[res_num] == 37: # converts D-ASPARTIC ACID (DAS) to ASP + + atom_site.setValue('ASP', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ASP', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DAS -> ASP') not in mod_row: + mod_row.append((res_num, residue, 'DAS -> ASP')) + + elif mod_pos[res_num] == 38: # converts 4-METHYL-HISTIDINE (HIC) to HIS + + atom_site.setValue('HIS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('HIS', 'auth_comp_id', 
atom_row) + except IndexError: + pass + + if (res_num, residue, 'HIC -> HIS') not in mod_row: + mod_row.append((res_num, residue, 'HIC -> HIS')) + + elif mod_pos[res_num] == 39: # converts (4R)-4-methyl-L-proline (MP8) to PRO + + atom_site.setValue('PRO', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PRO', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MP8 -> PRO') not in mod_row: + mod_row.append((res_num, residue, 'MP8 -> PRO')) + + elif mod_pos[res_num] == 40: # converts D-THREONINE (DTH) to THR + + atom_site.setValue('THR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('THR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DTH -> THR') not in mod_row: + mod_row.append((res_num, residue, 'DTH -> THR')) + + elif mod_pos[res_num] == 41: # converts D-ISOLEUCINE (DIL) to ILE + + atom_site.setValue('ILE', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ILE', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DIL -> ILE') not in mod_row: + mod_row.append((res_num, residue, 'DIL -> ILE')) + + elif mod_pos[res_num] == 42: # converts N-METHYL ASPARAGINE (MEN) to ASN + + atom_site.setValue('ASN', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ASN', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MEN -> ASN') not in mod_row: + mod_row.append((res_num, residue, 'MEN -> ASN')) + + elif mod_pos[res_num] == 43: # converts D-TYROSINE (DTY) to TYR + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DTY -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'DTY -> TYR')) + + elif mod_pos[res_num] == 44: # converts N-CARBOXYMETHIONINE (CXM) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CXM -> MET') not in mod_row: + mod_row.append((res_num, residue, 'CXM -> MET')) + + elif mod_pos[res_num] == 45: # converts D-GLUTAMINE (DGN) to MET + + atom_site.setValue('GLN', 'label_comp_id', atom_row) + + try: + atom_site.setValue('GLN', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DGN -> GLN') not in mod_row: + mod_row.append((res_num, residue, 'DGN -> GLN')) + + elif mod_pos[res_num] == 46: # converts D-TRYPTOPHAN (DTR) to TRP + + atom_site.setValue('TRP', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TRP', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DTR -> TRP') not in mod_row: + mod_row.append((res_num, residue, 'DTR -> TRP')) + + elif mod_pos[res_num] == 47: # converts N-ACETYL-SERINE (SAC) to SER + + atom_site.setValue('SER', 'label_comp_id', atom_row) + + try: + atom_site.setValue('SER', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SAC -> SER') not in mod_row: + mod_row.append((res_num, residue, 'SAC -> SER')) + + elif mod_pos[res_num] == 48: # converts D-ASPARAGINE (DSG) to ASN + + atom_site.setValue('ASN', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ASN', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DSG -> ASN') not in mod_row: + mod_row.append((res_num, residue, 'DSG -> ASN')) + + elif mod_pos[res_num] == 49: # converts N-METHYL METHIONINE (MME) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + 
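+                # presumably guarded because some mmCIF files lack the
+                # optional auth_comp_id column (getValue then raises
+                # IndexError), as in all of the other residue branches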
+ try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MME -> MET') not in mod_row: + mod_row.append((res_num, residue, 'MME -> MET')) + + elif mod_pos[res_num] == 50: # converts N-methyl-L-alanine (MAA) to ALA + + atom_site.setValue('ALA', 'label_comp_id', atom_row) + + try: + atom_site.setValue('ALA', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MAA -> ALA') not in mod_row: + mod_row.append((res_num, residue, 'MAA -> ALA')) + + elif mod_pos[res_num] == 51: # converts 3-FLUOROTYROSINE (YOF) to TYR + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'YOF -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'YOF -> TYR')) + + elif mod_pos[res_num] == 52: # converts (4R)-4-fluoro-L-proline (FP9) to PRO + + atom_site.setValue('PRO', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PRO', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'FP9 -> PRO') not in mod_row: + mod_row.append((res_num, residue, 'FP9 -> PRO')) + + elif mod_pos[res_num] == 53: # converts N-formyl-L-valine (FVA) to VAL + + atom_site.setValue('VAL', 'label_comp_id', atom_row) + + try: + atom_site.setValue('VAL', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'FVA -> VAL') not in mod_row: + mod_row.append((res_num, residue, 'FVA -> VAL')) + + elif mod_pos[res_num] == 54: # converts N-methyl-D-leucine (MLU) to LEU + + atom_site.setValue('LEU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LEU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MLU -> LEU') not in mod_row: + mod_row.append((res_num, residue, 'MLU -> LEU')) + + elif mod_pos[res_num] == 55: # converts (betaR)-3-chloro-beta-hydroxy-L-tyrosine (OMY) to TYR + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'OMY -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'OMY -> TYR')) + + elif mod_pos[res_num] == 56: # converts GAMMA-D-GLUTAMIC ACID (FGA) to GLU + + atom_site.setValue('GLU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('GLU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'FGA -> GLU') not in mod_row: + mod_row.append((res_num, residue, 'FGA -> GLU')) + + elif mod_pos[res_num] == 57: # converts N-METHYLPHENYLALANINE (MEA) to PHE + + atom_site.setValue('PHE', 'label_comp_id', atom_row) + + try: + atom_site.setValue('PHE', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MEA -> PHE') not in mod_row: + mod_row.append((res_num, residue, 'MEA -> PHE')) + + elif mod_pos[res_num] == 58: # converts S-(METHYLMERCURY)-L-CYSTEINE (CMH) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CMH -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CMH -> CYS')) + + elif mod_pos[res_num] == 59: # converts D-HISTIDINE (DHI) to HIS + + atom_site.setValue('HIS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('HIS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'DHI -> HIS') not in mod_row: + mod_row.append((res_num, residue, 'DHI -> HIS')) + + elif 
mod_pos[res_num] == 60: # converts SELENOCYSTEINE (SEC) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SEC -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'SEC -> CYS')) + + elif mod_pos[res_num] == 61: # converts (betaR)-3-CHLORO-BETA-HYDROXY-D-TYROSINE (OMZ) to TYR + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'OMZ -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'OMZ -> TYR')) + + elif mod_pos[res_num] == 62: # converts S-ACETYL-CYSTEINE (SCY) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SCY -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'SCY -> CYS')) + + elif mod_pos[res_num] == 63: # converts S-OXYMETHIONINE (MHO) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MHO -> MET') not in mod_row: + mod_row.append((res_num, residue, 'MHO -> MET')) + + elif mod_pos[res_num] == 64: # converts D-METHIONINE (MED) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MED -> MET') not in mod_row: + mod_row.append((res_num, residue, 'MED -> MET')) + + elif mod_pos[res_num] == 65: # converts S-DIMETHYLARSINOYL-CYSTEINE (CAF) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'CAF -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'CAF -> CYS')) + + elif mod_pos[res_num] == 66: # converts META-NITRO-TYROSINE (NIY) to TYR + + atom_site.setValue('TYR', 'label_comp_id', atom_row) + + try: + atom_site.setValue('TYR', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'NIY -> TYR') not in mod_row: + mod_row.append((res_num, residue, 'NIY -> TYR')) + + elif mod_pos[res_num] == 67: # converts O-ACETYLSERINE (OAS) to SER + + atom_site.setValue('SER', 'label_comp_id', atom_row) + + try: + atom_site.setValue('SER', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'OAS -> SER') not in mod_row: + mod_row.append((res_num, residue, 'OAS -> SER')) + + elif mod_pos[res_num] == 68: # converts S-METHYL-THIO-CYSTEINE (SCH) to CYS + + atom_site.setValue('CYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('CYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'SCH -> CYS') not in mod_row: + mod_row.append((res_num, residue, 'SCH -> CYS')) + + elif mod_pos[res_num] == 69: # converts 2-methyl-L-norleucine (MK8) to LEU + + atom_site.setValue('LEU', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LEU', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'MK8 -> LEU') not in mod_row: + mod_row.append((res_num, residue, 'MK8 -> LEU')) + + elif mod_pos[res_num] == 70: # converts METHIONINE SULFOXIDE (SME) to MET + + atom_site.setValue('MET', 'label_comp_id', atom_row) + + try: + atom_site.setValue('MET', 'auth_comp_id', atom_row) + except IndexError: + pass + + if 
(res_num, residue, 'SME -> MET') not in mod_row: + mod_row.append((res_num, residue, 'SME -> MET')) + + elif mod_pos[res_num] == 71: # converts 5-HYDROXYLYSINE (LYZ) to LYS + + atom_site.setValue('LYS', 'label_comp_id', atom_row) + + try: + atom_site.setValue('LYS', 'auth_comp_id', atom_row) + except IndexError: + pass + + if (res_num, residue, 'LYZ -> LYS') not in mod_row: + mod_row.append((res_num, residue, 'LYZ -> LYS')) + + ######################################################################## + ## Notify user about modification made to cif data ## + ######################################################################## + + if DEBUG_MODE: + mod_model_num = len([ msg for msg in cha_row if msg[2] == 'model_num' ]) + mod_ins_code = len([ msg for msg in cha_row if msg[2] == 'ins_code' ]) + mod_group_PDB = len([ msg for msg in cha_row if msg[2] == 'group_PDB' ]) + + if mod_model_num != 0: + print ('! {p} {c}: modified atom_site.pdbx_PDB_model_num for {cr} residues to 1.'.format( + p = pdb_code, + c = chain, + cr = mod_model_num)) + + if mod_ins_code != 0: + print ('! {p} {c}: modified atom_site.pdbx_PDB_ins_code for {cr} residues to "?".'.format( + p = pdb_code, + c = chain, + cr = mod_ins_code)) + + if mod_group_PDB != 0: + print ('! {p} {c}: modified atom_site.group_PDB for {cr} residues to "ATOM".'.format( + p = pdb_code, + c = chain, + cr = mod_group_PDB)) + + for residue in reversed(mod_row): + print ('! {p} {c}: modified cif pos {cr} ({nr}).'.format( + p = pdb_code, + c = chain, + cr = residue[0], + ca = residue[1], + nr = residue[2])) + + + for residue in reversed(rem_row): + print ('! {p} {c}: removed cif pos {cr} ({ca})'.format( + p = pdb_code, + c = chain, + cr = residue[0], + ca = residue[1])) + + cif_edits[pdb_code] = block + + # write modified pir to disk + for pdb_code in cif_edits: + out = open(os.path.join(output_path, pdb_code + '.cif'), 'w') + writer = PdbxWriter(out) + writer.writeContainer(cif_edits[pdb_code]) + + # Delete missing entries from the last template sequence to the first + for row in reversed(del_row): + template_grid.del_row(row) + + return template_grid + +def remove_self_alignment(template_grid, query_name): + """ Removes a self alignment from the final pir alignment to prevent clashes with MODELLER """ + + to_delete = list() + + for row in range(template_grid.get_grid_height()): + if template_grid._pdb_code[row] == query_name: + to_delete.append(row) + + for row in reversed(to_delete): + template_grid.del_row(row) + + return True + +def write_to_file(line_list, fname): + """ Writes the final pir file """ + + with open(fname, 'w+') as fout: + for line in line_list: + fout.write(line + "\n") + +def arg(): + import argparse + description = """Creates a MODELLER alignment (*.pir) from a HHSearch results file (*.hhr).""" + epilog= '2016 Harald Voehringer.' 
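+    # Hypothetical example invocation (file names are placeholders):
+    #   hhmakemodel.py results.hhr ./cifs model.pir ./cifs_out -m 2 5
+    # builds a PIR alignment from hits 2 and 5 of results.hhr and writes
+    # MODELLER-compatible copies of the used mmCIF files to ./cifs_out.
+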
+ # Instantiate an ArgumentParser + parser = argparse.ArgumentParser(description = description, epilog = epilog) + + # Add options to the parser + parser.add_argument('input', help = 'results file from HHsearch with hit list and alignment', metavar = 'FILE') + parser.add_argument('cifs', help = 'path to the folder containing cif files', metavar = 'DIR') + parser.add_argument('pir', help = 'output file (PIR-formatted multiple alignment)', metavar = 'FILE') + parser.add_argument('output', help = 'path to the folder where modified cif files should be written to', metavar = 'DIR') + + parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose mode') + parser.add_argument('-m', nargs = '+', help = 'pick hits with specified indices (e.g. -m 2 5)', metavar = 'INT') + parser.add_argument('-e', type = float, help = 'maximum E-value threshold (e.g. -e 0.001)', metavar = 'FLOAT') + parser.add_argument('-r', type = float, help = 'residue ratio (filter out alignments that contribute fewer residues than the specified ratio).', + default = 0, metavar = 'FLOAT') + + parser.add_argument('-c', help = 'convert non-canonical residues (default = True)', action = 'store_true', default = True) + + + return parser + + +def main(): + import sys + parser = arg() + args = parser.parse_args(sys.argv[1:]) + + global DEBUG_MODE + + if args.verbose: + DEBUG_MODE = True + + query_name, query_chain = get_query_name(args.input) + + data = read_result(args.input) + selected_templates = list() + + if args.m and not args.e: + selection = map(lambda x: int(x), args.m) + print ('Selected templates {st}.'.format(st = ', '.join(args.m))) + + for i in selection: + tmp_info = str(data[i - 1].template_info.split('>')[1]) + print ('{i}: {t}'.format( + i = i, + t = tmp_info[0:80])) + + selected_templates.append(data[i - 1]) + + elif args.e and not args.m: + print ('Selected templates satisfying E-val <= {e}'.format(e = args.e)) + + e_values = { float(j.evalue):i for i, j in enumerate(data) } + selection = sorted([ val for key, val in e_values.items() if key <= args.e ]) + + for i in selection: + tmp_info = str(data[i].template_info.split('>')[1]) + print ('{i}: {t}'.format( + i = i + 1, + t = tmp_info[0:80])) + + selected_templates.append(data[i]) + + elif args.m and args.e: + print ('! Please do not use options -m and -e at the same time! 
Exiting.') + sys.exit() + else: + selected_templates = data + + print ('Creating pir file using all templates ({n})'.format( + n = len(selected_templates))) + + query_grid = create_query_grid(selected_templates) # load query grid + print ('query_grid') + print(query_grid) + gapless_query_grid = create_gapless_grid(query_grid) # remove gaps + print ('gapless_query_grid') + print(gapless_query_grid) + processed_query_grid = process_query_grid(query_grid, gapless_query_grid) # insert gaps + ##processed_query_grid = process_query_grid(query_grid, query_grid) # insert gaps + print ('processed_query_grid') + print (processed_query_grid) + glob_seq = derive_global_seq(processed_query_grid, query_name, query_chain) # derive query sequence + template_grid = create_template_grid(selected_templates) # create template grid + print ('template_grid') + print (template_grid) + processed_template_grid = process_template_grid(query_grid, template_grid) # insert gaps into template sequences + print ('processed_query_grid') + print (processed_query_grid) + print ('hzhu processed_template_grid') + print (processed_template_grid) + final_grid = compare_with_cifs(processed_template_grid, args.cifs, args.output, args.c, args.r) # compare with atom section of cifs + remove_self_alignment(final_grid, query_name) # remove self alignment if any + write_to_file([glob_seq, final_grid.display()], args.pir) + + +if __name__ == "__main__": + main() diff -Nru hhsuite-3.0~beta2+dfsg/scripts/HHPaths.pm hhsuite-3.0~beta3+dfsg/scripts/HHPaths.pm --- hhsuite-3.0~beta2+dfsg/scripts/HHPaths.pm 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/HHPaths.pm 2017-07-14 10:34:03.000000000 +0000 @@ -28,9 +28,10 @@ use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); use Exporter; our $v; -our $VERSION = "version 2.0.16 (February 2013)"; +our $VERSION = "version 3.0.0 (15-03-2015)"; our @ISA = qw(Exporter); our @EXPORT = qw($VERSION $hhlib $hhdata $hhbin $hhscripts $execdir $datadir $ncbidir $dummydb $pdbdir $dsspdir $dssp $cs_lib $context_lib $v); +push @EXPORT, qw($hhshare $hhbdata); ############################################################################################## # PLEASE COMPLETE THE PATHS ... TO PSIPRED AND OLD-STYLE BLAST (NOT BLAST+) (NEEDED FOR PSIPRED) @@ -54,11 +55,13 @@ # The lines below probably do not need to be changed # Setting paths for hh-suite perl scripts -our $hhlib = $ENV{"HHLIB"}; # main hh-suite directory -our $hhdata = $hhlib."/data"; # path to data directory for hhblits, example files +our $hhlib = $ENV{"HHLIB"} || "/usr/lib/hhsuite"; # main hh-suite directory +our $hhshare = $ENV{"HHLIB"} || "/usr/share/hhsuite"; # arch independent hh-suite directory +our $hhdata = $hhshare."/data"; # path to arch indep data directory for hhblits, example files +our $hhbdata = $hhlib."/data"; # path to arch dep data directory for hhblits, example files our $hhbin = $hhlib."/bin"; # path to cstranslate (path to hhsearch, hhblits etc. should be in environment variable PATH) -our $hhscripts= $hhlib."/scripts"; # path to hh perl scripts (addss.pl, reformat.pl, hhblitsdb.pl etc.) -our $dummydb = $hhdata."/do_not_delete"; # Name of dummy blast db for PSIPRED (single sequence formatted with NCBI formatdb) +our $hhscripts= $hhshare."/scripts"; # path to hh perl scripts (addss.pl, reformat.pl, hhblitsdb.pl etc.) 
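This HHPaths.pm hunk, which continues below with the relocated dummy-database path, splits the former single HHLIB tree into an arch-dependent /usr/lib/hhsuite and an arch-independent /usr/share/hhsuite, both still overridable through the HHLIB environment variable. Transposed to Python purely as an illustration of the lookup order (the paths are the ones this Debian patch assumes):

    import os

    # HHLIB wins if set; otherwise fall back to the packaged FHS locations.
    hhlib = os.environ.get('HHLIB', '/usr/lib/hhsuite')      # arch-dependent files
    hhshare = os.environ.get('HHLIB', '/usr/share/hhsuite')  # arch-independent files

    hhdata = os.path.join(hhshare, 'data')   # example files and context data
    hhbdata = os.path.join(hhlib, 'data')    # dummy BLAST db, arch-dependent data

Note that, exactly as in the Perl, a single HHLIB setting overrides both trees at once.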
+our $dummydb = $hhbdata."/do_not_delete"; # Name of dummy blast db for PSIPRED (single sequence formatted with NCBI formatdb) # HHblits data files our $cs_lib = "$hhdata/cs219.lib"; @@ -72,7 +75,7 @@ ################################################################################################ sub System() { - if ($v>=2) {printf("\$ %s\n",$_[0]);} + if ($v>=2) {printf(STDERR "\$ %s\n",$_[0]);} system($_[0]); if ($? == -1) { die("\nError: failed to execute '$_[0]': $!\n\n"); diff -Nru hhsuite-3.0~beta2+dfsg/scripts/hhpred/README hhsuite-3.0~beta3+dfsg/scripts/hhpred/README --- hhsuite-3.0~beta2+dfsg/scripts/hhpred/README 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/hhpred/README 2017-07-14 10:34:03.000000000 +0000 @@ -5,8 +5,8 @@ 2) download hhpred databases: - hhpred needs hhm, a3m and pdb files for each template, all in the same folder - - goto ftp://toolkit.genzentrum.lmu.de/pub/HH-suite/databases/hhsearch_dbs/ - and download the latest + - goto http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsearch_dbs/ + and download the latest pdb70*.a3m.tar.gz, pdb70*.hhm.tar.gz, pdb70*.pdb.tar.gz and extract them into your hhpred database folder - enter the path to the database into the config.pm file in the "lib" folder (entry: _pdbdir) diff -Nru hhsuite-3.0~beta2+dfsg/scripts/hh_reader.py hhsuite-3.0~beta3+dfsg/scripts/hh_reader.py --- hhsuite-3.0~beta2+dfsg/scripts/hh_reader.py 1970-01-01 00:00:00.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/hh_reader.py 2017-07-14 10:34:03.000000000 +0000 @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +""" +Parser for hhr result files created with hhblits|hhsearch|hhalign -o +""" + + +import sys +from collections import namedtuple + + +__author__ = 'Markus Meier (markus.meier@mpibpc.mpg.de)' +__version__ = '1.0' +__license__ = "GPL-3" + + +hhr_alignment = namedtuple('hhr_alignment', ['query_id', 'query_length', 'query_neff', + 'template_id', 'template_length', 'template_info', + 'template_neff', 'query_ali', 'template_ali', + 'start', 'end', 'probability', 'evalue', 'score', + 'aligned_cols', 'identity', 'similarity', 'sum_probs']) + + +class HHRFormatError(Exception): + def __init__(self, value): + self.value = "ERROR: "+value + + def __str__(self): + return repr(self.value) + + +def get_sequence_name(header): + name = header.replace(">", "").split()[0] + return name + + +def parse_result(lines): + results = [] + + query_id = None + query_length = None + query_neff = None + query_seq = [] + template_id = None + template_length = None + template_seq = [] + template_info = None + query_start = None + query_end = None + template_start = None + template_end = None + probability = None + evalue = None + score = None + identity = None + similarity = None + template_neff = None + sum_probs = None + aligned_cols = None + + skipped_ali_tags = ["ss_dssp", "ss_pred", "Consensus"] + + is_alignment_section = False + + for line in lines: + if(line.startswith("Query")): + query_id = line.split()[1] + elif(line.startswith("Match_columns")): + query_length = int(line.split()[1]) + elif(line.startswith("Neff")): + query_neff = float(line.split()[1]) + elif(is_alignment_section and (line.startswith("No") or line.startswith("Done!"))): + if query_start is not None: + result = hhr_alignment(query_id, query_length, query_neff, + template_id, template_length, template_info, template_neff, + "".join(query_seq), "".join(template_seq), (query_start, template_start), + (query_end, template_end), probability, evalue, score, + aligned_cols, identity, similarity, 
sum_probs) + results.append(result) + template_id = None + template_info = None + query_seq = [] + template_seq = [] + + query_start = None + query_end = None + template_start = None + template_end = None + elif(line.startswith("Probab")): + tokens = line.split() + probability = float(tokens[0].split("=")[1]) + evalue = float(tokens[1].split("=")[1]) + score = float(tokens[2].split("=")[1]) + aligned_cols = int(tokens[3].split("=")[1]) + identity = float(tokens[4].split("=")[1].replace("%", "")) / 100.0 + similarity = float(tokens[5].split("=")[1]) + sum_probs = float(tokens[6].split("=")[1]) + if(len(tokens) > 7): + template_neff = float(tokens[7].split("=")[1]) + continue + elif(line.startswith(">")): + is_alignment_section = True + template_id = line[1:].split()[0] + template_info = line + elif(line.startswith("Q")): + tokens = line.split() + if(tokens[1] in skipped_ali_tags): + continue + + try: + token_2 = tokens[2].replace("(", "").replace(")", "") + token_2 = int(token_2) + except: + raise HHRFormatError(("Converting failure of start index ({}) " + "of query alignment").format(tokens[2])) + + if query_start is None: + query_start = token_2 + query_start = min(query_start, token_2) + + try: + token_4 = tokens[4].replace("(", "").replace(")", "") + token_4 = int(token_4) + except: + raise HHRFormatError(("Converting failure of end index ({}) " + "of query alignment").format(tokens[4])) + + if query_end is None: + query_end = token_4 + query_end = max(query_end, token_4) + query_seq.append(tokens[3]) + elif(line.startswith("T")): + tokens = line.split() + if(tokens[1] in skipped_ali_tags): + continue + template_seq.append(tokens[3]) + + try: + token_2 = tokens[2].replace("(", "").replace(")", "") + token_2 = int(token_2) + except: + raise HHRFormatError(("Converting failure of start index ({}) " + "of template alignment").format(tokens[2])) + + if template_start is None: + template_start = token_2 + template_start = min(template_start, token_2) + + try: + token_4 = tokens[4].replace("(", "").replace(")", "") + token_4 = int(token_4) + except: + raise HHRFormatError(("Converting failure of end index ({}) " + "of template alignment").format(tokens[4])) + + if template_end is None: + template_end = token_4 + template_end = max(template_end, token_4) + + try: + token_5 = tokens[5].replace("(", "").replace(")", "") + token_5 = int(token_5) + except: + raise HHRFormatError(("Converting failure of template length ({}) " + "in template alignment").format(tokens[5])) + template_length = token_5 + + + if(template_id is not None and query_start is not None): + result = hhr_alignment(query_id, query_length, query_neff, + template_id, template_length, template_info, template_neff, + "".join(query_seq), "".join(template_seq), (query_start, template_start), + (query_end, template_end), probability, evalue, score, + aligned_cols, identity, similarity, sum_probs) + results.append(result) + + return results + + +def read_result(input_file): + with open(input_file) as fh: + lines = fh.readlines() + return parse_result(lines) + + +def main(): + counter = 0 + for result in read_result(sys.argv[1]): + print("Alignment " + str(counter) + "\t evalue: " + str(result.evalue) + + "\t probability: " + str(result.probability)) + + print(result.query_id + "\t" + str(result.start[0]) + "\t" + + result.query_ali + "\t" + + str(result.end[0])) + + print(result.template_id + "\t" + str(result.start[1]) + "\t" + + result.template_ali + "\t" + + str(result.end[1])) + + counter += 1 + + +if __name__ == "__main__": 
main() diff -Nru hhsuite-3.0~beta2+dfsg/scripts/is_huge_a3m.py hhsuite-3.0~beta3+dfsg/scripts/is_huge_a3m.py --- hhsuite-3.0~beta2+dfsg/scripts/is_huge_a3m.py 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/is_huge_a3m.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -#!/usr/bin/env python - -from a3m import A3M_Container -from a3m import A3MFormatError -import sys - - -def main(): - filename = sys.argv[1] - - a3m = A3M_Container() - - if(filename.lower() == "stdin"): - fh = sys.stdin - else: - fh = open(filename, "r") - - threshold = int(sys.argv[2]) - - try: - a3m.read_a3m(fh) - size = a3m.get_number_sequences() - if size >= threshold: - exit(0) - else: - exit(1) - except A3MFormatError as e: - sys.stderr.write(e) - exit(1) - - -if __name__ == "__main__": - main() - diff -Nru hhsuite-3.0~beta2+dfsg/scripts/mergeali.pl hhsuite-3.0~beta3+dfsg/scripts/mergeali.pl --- hhsuite-3.0~beta2+dfsg/scripts/mergeali.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/mergeali.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! /usr/bin/perl # mergeali.pl- Merge multiple alignments in A3M format via a multiple alignment of their seed sequences. # Usage: mergeali.pl [-i] infile.fas [-o] outfile.a3m [options] diff -Nru hhsuite-3.0~beta2+dfsg/scripts/multithread.pl hhsuite-3.0~beta3+dfsg/scripts/multithread.pl --- hhsuite-3.0~beta2+dfsg/scripts/multithread.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/multithread.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl # # multithread.pl: # Run a command with different file names as arguments on multiple threads in parallel diff -Nru hhsuite-3.0~beta2+dfsg/scripts/pdb2fasta.pl hhsuite-3.0~beta3+dfsg/scripts/pdb2fasta.pl --- hhsuite-3.0~beta2+dfsg/scripts/pdb2fasta.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/pdb2fasta.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! /usr/bin/perl # # pdb2fasta.pl - generate FASTA nonredundant sequence file from SEQRES records of globbed pdb files. diff -Nru hhsuite-3.0~beta2+dfsg/scripts/pdbfilter.pl hhsuite-3.0~beta3+dfsg/scripts/pdbfilter.pl --- hhsuite-3.0~beta2+dfsg/scripts/pdbfilter.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/pdbfilter.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! 
/usr/bin/perl # pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile # diff -Nru hhsuite-3.0~beta2+dfsg/scripts/pdbfilter.py hhsuite-3.0~beta3+dfsg/scripts/pdbfilter.py --- hhsuite-3.0~beta2+dfsg/scripts/pdbfilter.py 1970-01-01 00:00:00.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/pdbfilter.py 2017-07-14 10:34:03.000000000 +0000 @@ -0,0 +1,277 @@ +#!/usr/bin/env python + +""" +Created in Jun 2016 + +@author: Harald Voehringer +""" + + +from itertools import groupby +from collections import namedtuple, defaultdict +import textwrap + +DEBUG = False +FASTA = namedtuple('FASTA', ['header', 'seq']) +PDBDATA = namedtuple('PDBDATA', ['entry', 'res', 'rfr', 'comp', 'met']) + +def as_pairs(fasta): + """ Reads in FASTA records as (identifier, record) pairs. """ + + for header, group in groupby(fasta, lambda x: x[0] == '>'): + if header: + line = next(group) + identifier = line.split(' ')[0].split('>')[1] + header_line = str(line.strip()) + + else: + sequence = ''.join(line.strip() for line in group).upper() + data = FASTA(header_line, sequence) + + yield identifier, data + +def read_fasta(fasta_file): + """ Reads in fasta sequences with help of the as_pairs function.""" + fasta_dic = dict() + duplicate = 0 + + with open(fasta_file) as fasta: + for title, sequence in as_pairs(fasta): + if title not in fasta_dic: + fasta_dic[title] = sequence + else: + duplicate +=1 + + if duplicate != 0: + print ('! Warning: found {num} duplicate fasta entries.'.format( + num = duplicate)) + + return fasta_dic + +def read_fasta_annotations(fname): + """ Reads the information about resolution, R-free and completeness that is + output by cif2fasta.""" + + annotations = dict() + + with open(fname) as fh: + for line in fh: + if len(line) > 0 and line[0] == '#': + continue + + identifier, res, r_free, comp, method = line.strip().split('\t') + + try: + res = float(res) + except ValueError: + res = None + + try: + r_free = float(r_free) + except ValueError: + r_free = None + + try: + comp = float(comp) + except ValueError: + comp = None + + annotations[identifier] = PDBDATA(identifier, res, r_free, comp, method) + + return annotations + +def read_cluster(cluster_file): + """ Reads in the clusters (this is the output of MMseqs). """ + cluster = defaultdict(set) + + with open(cluster_file) as fh: + for line in fh: + exemplar, node = line.split() + + if node in cluster[exemplar]: + raise Exception('! Please check the clustering procedure: {n} was found twice in cluster {e}.'.format( + n = node, + e = exemplar)) + else: + cluster[exemplar].add(node) + + return cluster + +def read_pdblist(in_file): + + pdb_list = set() + + with open(in_file) as fh: + for line in fh: + if line.startswith('#'): + continue + + strip_line = line.strip() + pdb_code = strip_line.split('_')[0] + + # this is a very very basic check + if len(pdb_code) != 4: + print ('! Warning: {line} seems to be an incorrect identifier. 
Skipping it.'.format( + line = strip_line)) + continue + + pdb = strip_line.upper() + pdb_list.add(pdb) + + return pdb_list + +def select_sequences(clusters, annotations): + + selected_sequences = set() + + for idx, cluster in enumerate(clusters): + + nodes = [ annotations[entry] for entry in clusters[cluster] ] + + if DEBUG: + print ('Processing Cluster {c} ({i}): {m}'.format( + c = cluster, + i = idx, + m = ', '.join([x.entry for x in nodes]))) + + # select the best entries of the cluster + best_res = float('inf') + best_entry_res = None + + best_rfr = float('inf') + best_entry_rfr = None + + best_comp = -float('inf') + best_entry_comp = None + + # iterate through each entry in nodes while selecting the representative sequence + for node in nodes: + + if (node.res is not None) and (node.res < best_res): + best_res = node.res + best_entry_res = node.entry + + if (node.rfr is not None) and (node.rfr < best_rfr): + best_rfr = node.rfr + best_entry_rfr = node.entry + + if (node.comp is not None) and (node.comp > best_comp): + best_comp = node.comp + best_entry_comp = node.entry + + if best_entry_res is not None: + selected_sequences.add(best_entry_res) + + if DEBUG: + print (' - Selected {n} (best resolution = {r}).'.format( + n = best_entry_res, + r = best_res)) + + if best_entry_rfr is not None: + selected_sequences.add(best_entry_rfr) + + if DEBUG: + print (' - Selected {n} (best R-free = {r}).'.format( + n = best_entry_rfr, + r = best_rfr)) + + if best_entry_comp is not None: + selected_sequences.add(best_entry_comp) + + if DEBUG: + print (' - Selected {n} (best completeness = {r}).'.format( + n = best_entry_comp, + r = best_comp)) + + if best_entry_res is None and best_entry_rfr is None and best_entry_comp is None: + print ('! Warning: Did not find any representative entry for cluster {c}.'.format( + c = cluster)) + + return selected_sequences + +def write_sequences(out_file, fasta_db, selected_sequences): + """ Writes selected sequences to a fasta file.""" + with open(out_file, 'w') as fh: + for seq in selected_sequences: + + fasta_entry = '{h}\n{seq}\n'.format( + h = fasta_db[seq].header, + seq = '\n'.join(textwrap.wrap(fasta_db[seq].seq, 80))) + + fh.write(fasta_entry) + +def arg(): + import argparse + description = """ + pdbfilter.py selects, from the sequence clusters determined by MMseqs, the sequences + which have the best resolution, R-free factor and/or completeness, and writes them to a fasta file. 
+ """.replace('\n', '') + + epilog = '2016 Harald Voehringer' + # Initiate a ArgumentParser Class + parser = argparse.ArgumentParser(description = description, epilog = epilog) + + # Call add_options to the parser + parser.add_argument('fasta', help = 'input fasta file (created by cif2fasta.py)', metavar = 'FILE') + parser.add_argument('cluster', help = 'sequence clusters (MMseqs)', metavar = 'FILE') + parser.add_argument('annotations', help = 'annotations file (created by cif2fasta using the -p flag, contains information about resolution, R-free and completness of sequences).', metavar = 'FILE') + parser.add_argument('out_file', help = 'output fasta file', metavar = 'FILE') + parser.add_argument('-i', '--include', help = 'include PDB chains', metavar = 'FILE') + parser.add_argument('-r', '--remove', help = 'exclude PDB chains', metavar = 'FILE') + + parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose mode') + + + return parser + + +def main(): + import sys + parser = arg() + args = parser.parse_args(sys.argv[1:]) + + global DEBUG + + if args.verbose: + DEBUG = True + + fasta = read_fasta(args.fasta) + clu70 = read_cluster(args.cluster) + annot = read_fasta_annotations(args.annotations) + + print ("Found {i} clusters.".format( + i = len(clu70.keys()))) + + # choose representative sequences from clusters + selection = select_sequences(clu70, annot) + + # make sure that pdbs specified in the argument are included + if args.include: + to_include = read_pdblist(args.include) + for pdb in to_include: + if pdb in fasta.keys(): + if pdb not in selection: + if DEBUG: + print ('Adding {p}.'.format( + p = pdb)) + selection.add(pdb) + else: + print ('! Warning: {p} was not found in input fasta.'.format( + p = pdb)) + + # removes entries + if args.remove: + to_remove = read_pdblist(args.remove) + for pdb in to_remove: + if pdb in selection: + if DEBUG: + print ('Removing {p}.'.format( + p = pdb)) + selection.remove(pdb) + + # write them to file + write_sequences(args.out_file, fasta, selection) + + +if __name__ == "__main__": + main() diff -Nru hhsuite-3.0~beta2+dfsg/scripts/reformat.pl hhsuite-3.0~beta3+dfsg/scripts/reformat.pl --- hhsuite-3.0~beta2+dfsg/scripts/reformat.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/reformat.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! 
/usr/bin/perl # reformat.pl # Reformat a multiple alignment file diff -Nru hhsuite-3.0~beta2+dfsg/scripts/renumberpdb.pl hhsuite-3.0~beta3+dfsg/scripts/renumberpdb.pl --- hhsuite-3.0~beta2+dfsg/scripts/renumberpdb.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/renumberpdb.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl # renumberpdb.pl - generate a PDB file with renumbered indices that match residue indices in input sequence # Usage: renumberpdb.pl [options] infile [outfile] # Example: renumberpdb.pl d1hz4a_.a3m d1hz4a_.pdb @@ -379,7 +379,7 @@ if (-e $pdbfile."pdb$pdbcode.ent") {$pdbfile.="pdb$pdbcode.ent";} elsif (-e $pdbfile."pdb$pdbcode.ent.gz") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.gz |";} elsif (-e $pdbfile."pdb$pdbcode.ent.Z") {$pdbfile="gunzip -c $pdbfile"."pdb$pdbcode.ent.Z |";} - elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile."$pdbcode.pdb";} + elsif (-e $pdbfile."$pdbcode.pdb") {$pdbfile.="$pdbcode.pdb";} else { printf(STDERR "Error in $program: Cannot find pdb file $pdbfile"."pdb$pdbcode.ent!\n"); return ""; diff -Nru hhsuite-3.0~beta2+dfsg/scripts/splitfasta.pl hhsuite-3.0~beta3+dfsg/scripts/splitfasta.pl --- hhsuite-3.0~beta2+dfsg/scripts/splitfasta.pl 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/scripts/splitfasta.pl 2017-07-14 10:34:03.000000000 +0000 @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#! /usr/bin/perl # splitfasta.pl # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files # diff -Nru hhsuite-3.0~beta2+dfsg/src/a3m_compress.cpp hhsuite-3.0~beta3+dfsg/src/a3m_compress.cpp --- hhsuite-3.0~beta2+dfsg/src/a3m_compress.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/a3m_compress.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -161,7 +161,8 @@ nr_consensus++; } else { - if(compressed_a3m::compress_sequence(id, sequence, + std::string short_id = getShortIdFromHeader(id); + if(compressed_a3m::compress_sequence(short_id, sequence, ffindex_sequence_database_index, ffindex_sequence_database_data, output)) { nr_sequences++; @@ -215,7 +216,8 @@ //TODO: Warning } else { - if(compressed_a3m::compress_sequence(id, sequence, + std::string short_id = getShortIdFromHeader(id); + if(compressed_a3m::compress_sequence(short_id, sequence, ffindex_sequence_database_index, ffindex_sequence_database_data, output)) { nr_sequences++; @@ -528,11 +530,12 @@ } } + std::string short_id = id; if(first_pipe_index != id.length() && second_pipe_index != id.length()) { - return id.substr(first_pipe_index + 1, second_pipe_index - first_pipe_index - 1); + short_id = id.substr(first_pipe_index + 1, second_pipe_index - first_pipe_index - 1); } - return id; + return short_id; } bool isConsensus(std::string &id) { diff -Nru hhsuite-3.0~beta2+dfsg/src/a3m_database_extract.cpp hhsuite-3.0~beta3+dfsg/src/a3m_database_extract.cpp --- hhsuite-3.0~beta2+dfsg/src/a3m_database_extract.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/a3m_database_extract.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -191,5 +191,8 @@ } fclose(a3m_data_fh); + fclose(a3m_index_fh); + + ffsort_index(a3mIndexFile.c_str()); } diff -Nru hhsuite-3.0~beta2+dfsg/src/a3m_database_filter.cpp hhsuite-3.0~beta3+dfsg/src/a3m_database_filter.cpp --- hhsuite-3.0~beta2+dfsg/src/a3m_database_filter.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/a3m_database_filter.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -200,6 +200,10 @@ delete out_buffer; } + 
fclose(oa3m_index_fh); fclose(oa3m_data_fh); + + ffsort_index(oa3mIndexFile.c_str()); + } diff -Nru hhsuite-3.0~beta2+dfsg/src/a3m_database_reduce.cpp hhsuite-3.0~beta3+dfsg/src/a3m_database_reduce.cpp --- hhsuite-3.0~beta2+dfsg/src/a3m_database_reduce.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/a3m_database_reduce.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -161,6 +161,9 @@ delete out_buffer; } + fclose(ca3m_index_fh); fclose(ca3m_data_fh); + + ffsort_index(ca3mIndexFile.c_str()); } diff -Nru hhsuite-3.0~beta2+dfsg/src/CMakeLists.txt hhsuite-3.0~beta3+dfsg/src/CMakeLists.txt --- hhsuite-3.0~beta2+dfsg/src/CMakeLists.txt 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/CMakeLists.txt 2017-07-14 10:34:03.000000000 +0000 @@ -8,6 +8,9 @@ include(FindTSan) endif() + +if(X86 OR X86_64) + #SSE if(${HAVE_AVX2}) ADD_DEFINITIONS("-DAVX2") @@ -27,6 +30,12 @@ endif(${HAVE_AVX2_EXTENSIONS}) endif() +else() # + +message("!! NOT X86 OR X86_64 !!") + +endif() # NOT X86 OR X86_64 + find_package(OpenMP) @@ -127,26 +136,26 @@ # add the executable add_executable(hhblits - hhblits_app.cpp $ $ - $ - $ - $ + hhblits_app.cpp $ $ + $ + $ + $ ) target_link_libraries (hhblits ffindex) -if (OPENMP_FOUND) -add_executable(hhblits_omp - hhblits_omp.cpp $ $ - $ - $ - $ -) -target_link_libraries (hhblits_omp ffindex) -INSTALL(PROGRAMS - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/hhblits_omp - DESTINATION bin -) +if (OPENMP_FOUND) + add_executable(hhblits_omp + hhblits_omp.cpp $ $ + $ + $ + $ + ) + target_link_libraries (hhblits_omp ffindex) + INSTALL(PROGRAMS + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/hhblits_omp + DESTINATION bin + ) endif() add_executable(hhmake @@ -229,9 +238,25 @@ target_link_libraries (cstranslate ffindex) find_package(MPI QUIET) + if(MPI_CXX_FOUND) include_directories(${MPI_CXX_INCLUDE_PATH}) + #hhblits_mpi + add_executable(hhblits_mpi + hhblits_mpi.cpp $ $ + $ + $ + $ + ) + target_link_libraries(hhblits_mpi ffindex mpq) + target_link_libraries(hhblits_mpi ${MPI_CXX_LIBRARIES}) + set_target_properties(hhblits_mpi PROPERTIES COMPILE_FLAGS "${MPI_CXX_COMPILE_FLAGS}") + set_target_properties(hhblits_mpi PROPERTIES LINK_FLAGS "${MPI_CXX_LINK_FLAGS}") + + install(PROGRAMS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/hhblits_mpi DESTINATION bin) + + #cstranslate_mpi add_executable(cstranslate_mpi cs/cstranslate_mpi_app.cc $ @@ -264,9 +289,3 @@ DESTINATION bin ) -if (OPENMP_FOUND) -INSTALL(PROGRAMS - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/hhblits_omp - DESTINATION bin -) -endif() diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/alignment.h hhsuite-3.0~beta3+dfsg/src/cs/alignment.h --- hhsuite-3.0~beta2+dfsg/src/cs/alignment.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/alignment.h 2017-07-14 10:34:03.000000000 +0000 @@ -76,6 +76,9 @@ // Fills match_idx__ with the indices of all match columns. void SetMatchIndices(); + // Removes sequences with headers indicating non-protein sequences (secondary structure predictions) + void FilterSequencesByHeaders(std::vector& headers, std::vector& seqs); + // Reads an alignment in FASTA format. 
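The FilterSequencesByHeaders hook declared just above removes secondary-structure annotation tracks (ss_pred, ss_conf, ss_dssp) before an A3M alignment is interpreted as protein sequences; its implementation appears in the alignment-inl.h hunk that follows. The same prefix filtering in Python, as an illustrative sketch only (drop_annotation_tracks is a hypothetical name):

    IGNORED_PREFIXES = ('ss_pred', 'ss_conf', 'ss_dssp')

    def drop_annotation_tracks(headers, seqs):
        """Remove annotation rows, which are not real sequences, from an A3M alignment."""
        kept = [(h, s) for h, s in zip(headers, seqs)
                if not h.startswith(IGNORED_PREFIXES)]
        return [h for h, _ in kept], [s for _, s in kept]

Filtering by header prefix suffices here because these reserved track names are exactly how hh-suite labels non-sequence rows in A3M files.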
void ReadFasta(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs); diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/alignment-inl.h hhsuite-3.0~beta3+dfsg/src/cs/alignment-inl.h --- hhsuite-3.0~beta2+dfsg/src/cs/alignment-inl.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/alignment-inl.h 2017-07-14 10:34:03.000000000 +0000 @@ -153,12 +153,36 @@ default: throw Exception("Unsupported alignment input format %i!", format); } + Init(headers, seqs); LOG(DEBUG4) << *this; } template<class Abc> +void Alignment<Abc>::FilterSequencesByHeaders(std::vector<std::string>& headers, std::vector<std::string>& seqs) { + std::vector<std::string> ignored_headers; + ignored_headers.push_back("ss_pred"); + ignored_headers.push_back("ss_conf"); + ignored_headers.push_back("ss_dssp"); + + for (int i = headers.size() - 1; i >= 0; i--) { + bool remove_header = false; + for (size_t j = 0; j < ignored_headers.size(); j++) { + if (!headers[i].compare(0, ignored_headers[j].size(), ignored_headers[j])) { + remove_header = true; + break; + } + } + //TODO: also delete consensus sequence if included??? + if(remove_header) { + headers.erase(headers.begin() + i); + seqs.erase(seqs.begin() + i); + } + } +} + +template<class Abc> void Alignment<Abc>::ReadFastaFlavors(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs) { headers.clear(); seqs.clear(); @@ -172,7 +196,7 @@ if (buffer[0] == '#') { name_ = std::string(buffer + 1); } else if (buffer[0] == '>') { - if (headers.empty() && + if (headers.empty() && (strstr(buffer, ">ss_") == buffer || strstr(buffer, ">sa_") == buffer)) { while (!feof(fin)) { c = getc(fin); @@ -282,6 +306,8 @@ void Alignment<Abc>::ReadA3M(FILE* fin, std::vector<std::string>& headers, std::vector<std::string>& seqs) { ReadFastaFlavors(fin, headers, seqs); + FilterSequencesByHeaders(headers, seqs); + // Check number of match states const size_t nseqs = seqs.size(); const size_t nmatch_cols = count_if(seqs[0].begin(), seqs[0].end(), match_chr); @@ -670,7 +696,7 @@ void ReadAll(FILE* fin, AlignmentFormat format, std::vector< Alignment<Abc> >& v) { while (!feof(fin)) { v.push_back(Alignment<Abc>(fin, format)); - uint8_t c = fgetc(fin); + int c = fgetc(fin); if (c == EOF) break; ungetc(c, fin); } @@ -839,12 +865,14 @@ } // for j over ncols Normalize(&wi[0], nseqs); - if (ncoli < kMinCols) // number of columns in subalignment insufficient? - for (size_t k = 0; k < nseqs; ++k) + if (ncoli < kMinCols) { // number of columns in subalignment insufficient? 
+ for (size_t k = 0; k < nseqs; ++k) { if (ali[i][k] < any) wi[k] = wg[k]; else wi[k] = 0.0f; + } + } neff[i] = 0.0f; for (size_t j = 0; j < ncols; ++j) { diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/context_profile-inl.h hhsuite-3.0~beta3+dfsg/src/cs/context_profile-inl.h --- hhsuite-3.0~beta2+dfsg/src/cs/context_profile-inl.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/context_profile-inl.h 2017-07-14 10:34:03.000000000 +0000 @@ -106,7 +106,7 @@ assert(len & 1); if (nalph != Abc::kSize) throw Exception("Alphabet size of serialized context profile should be %d" - "but is acutally %d!", Abc::kSize, nalph); + "but is actually %d!", Abc::kSize, nalph); // If everything went fine we can resize our data memmbers probs.Resize(len); diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/crf_state-inl.h hhsuite-3.0~beta3+dfsg/src/cs/crf_state-inl.h --- hhsuite-3.0~beta2+dfsg/src/cs/crf_state-inl.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/crf_state-inl.h 2017-07-14 10:34:03.000000000 +0000 @@ -44,7 +44,7 @@ assert(len & 1); if (nalph != Abc::kSize) throw Exception("Alphabet size of serialized CRF state should be %d" - "but is acutally %d!", Abc::kSize, nalph); + "but is actually %d!", Abc::kSize, nalph); // If everything went fine we can resize our data memmbers context_weights.Resize(len); diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/cstranslate_app.h hhsuite-3.0~beta3+dfsg/src/cs/cstranslate_app.h --- hhsuite-3.0~beta2+dfsg/src/cs/cstranslate_app.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/cstranslate_app.h 2017-07-14 10:34:03.000000000 +0000 @@ -353,8 +353,15 @@ // FIXME: we are leaking inf, but if we fclose we get weird crashes //fclose(inf); + } + fclose(output_index_fh); + fclose(output_data_fh); + + + ffsort_index(output_index_file.c_str()); + if (isCa3m) { delete sequence_db; delete header_db; diff -Nru hhsuite-3.0~beta2+dfsg/src/cs/cstranslate_mpi_app.h hhsuite-3.0~beta3+dfsg/src/cs/cstranslate_mpi_app.h --- hhsuite-3.0~beta2+dfsg/src/cs/cstranslate_mpi_app.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/cs/cstranslate_mpi_app.h 2017-07-14 10:34:03.000000000 +0000 @@ -142,9 +142,9 @@ MPI_Barrier(MPI_COMM_WORLD); if (MPQ_rank == MPQ_MASTER) { - ffmerge_splits(data_filename_out[0].c_str(), index_filename_out[0].c_str(), MPQ_size, 1); + ffmerge_splits(data_filename_out[0].c_str(), index_filename_out[0].c_str(), 1, MPQ_size - 1, true); if (this->opts_.both) { - ffmerge_splits(data_filename_out[1].c_str(), index_filename_out[1].c_str(), MPQ_size, 1); + ffmerge_splits(data_filename_out[1].c_str(), index_filename_out[1].c_str(), 1, MPQ_size - 1, true); } } } else { diff -Nru hhsuite-3.0~beta2+dfsg/src/ffindex_from_fasta_with_split.c hhsuite-3.0~beta3+dfsg/src/ffindex_from_fasta_with_split.c --- hhsuite-3.0~beta2+dfsg/src/ffindex_from_fasta_with_split.c 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/ffindex_from_fasta_with_split.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -/* - * FFindex - * written by Andy Hauser . - * Please add your name here if you distribute modified versions. - * - * FFindex is provided under the Create Commons license "Attribution-ShareAlike - * 3.0", which basically captures the spirit of the Gnu Public License (GPL). 
- * - * See: - * http://creativecommons.org/licenses/by-sa/3.0/ -*/ - -#define _GNU_SOURCE 1 -#define _LARGEFILE64_SOURCE 1 -#define _FILE_OFFSET_BITS 64 - -#include -#include -#include - -#include -#include -#include -#include - - -#include "ffindex.h" -#include "ffutil.h" - -#define MAX_FILENAME_LIST_FILES 4096 -#define MAX_ENTRY_LENGTH 100000 - - -void usage(char *program_name) -{ - fprintf(stderr, "USAGE: %s -v | [-s] data_header_filename index_header_filename data_sequence_filename index_sequence_filename fasta_filename\n" - "\t-s\tsort index file\n" - "\nBases on a Design and Implementation of Andreas W. Hauser .\n", program_name); -} - -int main(int argn, char **argv) -{ - int sort = 0, version = 0; - int opt, err = EXIT_SUCCESS; - while ((opt = getopt(argn, argv, "sv")) != -1) - { - switch (opt) - { - case 's': - sort = 1; - break; - case 'v': - version = 1; - break; - default: - usage(argv[0]); - return EXIT_FAILURE; - } - } - - if(version == 1) - { - /* Don't you dare running it on a platform where byte != 8 bits */ - printf("%s version %.2f, off_t = %zd bits\n", argv[0], FFINDEX_VERSION, sizeof(off_t) * 8); - return EXIT_SUCCESS; - } - - if(argn - optind < 3) - { - usage(argv[0]); - return EXIT_FAILURE; - } - - - char *data_header_filename = argv[optind++]; - char *index_header_filename = argv[optind++]; - char *data_sequence_filename = argv[optind++]; - char *index_sequence_filename = argv[optind++]; - - char *fasta_filename = argv[optind++]; - - printf("data header file: %s\n", data_header_filename); - printf("index header file: %s\n", index_header_filename); - printf("data sequence file: %s\n", data_sequence_filename); - printf("index sequence file: %s\n", index_sequence_filename); - printf("fasta file: %s\n", fasta_filename); - - - FILE *data_header_file, *index_header_file, *data_sequence_file, *index_sequence_file, *fasta_file; - size_t header_offset = 0; - size_t sequence_offset = 0; - - struct stat st; - - // open header ffindex - if(stat(data_header_filename, &st) == 0) { errno = EEXIST; perror(data_header_filename); return EXIT_FAILURE; } - data_header_file = fopen(data_header_filename, "w"); - if( data_header_file == NULL) { perror(data_header_filename); return EXIT_FAILURE; } - - if(stat(index_header_filename, &st) == 0) { errno = EEXIST; perror(index_header_filename); return EXIT_FAILURE; } - index_header_file = fopen(index_header_filename, "w+"); - if(index_header_file == NULL) { perror(index_header_filename); return EXIT_FAILURE; } - - //open sequence ffindex - if(stat(data_sequence_filename, &st) == 0) { errno = EEXIST; perror(data_sequence_filename); return EXIT_FAILURE; } - data_sequence_file = fopen(data_sequence_filename, "w"); - if( data_sequence_file == NULL) { perror(data_sequence_filename); return EXIT_FAILURE; } - - if(stat(index_sequence_filename, &st) == 0) { errno = EEXIST; perror(index_sequence_filename); return EXIT_FAILURE; } - index_sequence_file = fopen(index_sequence_filename, "w+"); - if(index_sequence_file == NULL) { perror(index_sequence_filename); return EXIT_FAILURE; } - - fasta_file = fopen(fasta_filename, "r"); - if(fasta_file == NULL) { perror(fasta_filename); return EXIT_FAILURE; } - - size_t fasta_size; - char *fasta_data = ffindex_mmap_data(fasta_file, &fasta_size); -// size_t from_length = 0; - - char name[FFINDEX_MAX_ENTRY_NAME_LENTH]; - int seq_id = 1; - size_t seq_id_length = 0; - size_t count_ws = 0; - - char header[MAX_ENTRY_LENGTH]; - header[0] = '>'; - size_t header_length = 1; - char is_header = 1; - - char 
sequence[MAX_ENTRY_LENGTH]; - size_t sequence_length = 0; - - for(size_t fasta_offset = 1; fasta_offset < fasta_size; fasta_offset++) // position after first ">" - { - seq_id_length = 0; - count_ws = 0; - - is_header = 1; - header_length = 1; - - sequence_length = 0; - - while(fasta_offset < fasta_size && !(*(fasta_data + fasta_offset) == '>' && *(fasta_data + fasta_offset - 1) == '\n')) - { - char input = *(fasta_data + fasta_offset); - - //get fasta name - if(isspace(input)) - { - count_ws++; - name[seq_id_length] = '\0'; - } - else if(count_ws == 0) - { - name[seq_id_length++] = *(fasta_data + fasta_offset); - } - - if(input == '\n') { - is_header = 0; - header[header_length] = '\0'; - sequence[sequence_length] = '\0'; - } - else { - if(is_header == 1) { - header[header_length++] = input; - } - else { - sequence[sequence_length++] = input; - } - } - - fasta_offset++; - } - - if(seq_id_length == 0) { - sprintf(name, "%d", seq_id); - } - seq_id++; - - ffindex_insert_memory(data_header_file, index_header_file, &header_offset, header, header_length, name); - ffindex_insert_memory(data_sequence_file, index_sequence_file, &sequence_offset, sequence, sequence_length, name); - } - fclose(data_header_file); - fclose(data_sequence_file); - - /* Sort the index entries and write back */ - if(sort) - { - rewind(index_header_file); - ffindex_index_t* index = ffindex_index_parse(index_header_file, 0); - if(index == NULL) - { - perror("ffindex_index_parse failed"); - exit(EXIT_FAILURE); - } - fclose(index_header_file); - ffindex_sort_index_file(index); - index_header_file = fopen(index_header_filename, "w"); - if(index_header_file == NULL) { perror(index_header_filename); return EXIT_FAILURE; } - err += ffindex_write(index, index_header_file); - - rewind(index_sequence_file); - index = ffindex_index_parse(index_sequence_file, 0); - if(index == NULL) - { - perror("ffindex_index_parse failed"); - exit(EXIT_FAILURE); - } - fclose(index_sequence_file); - ffindex_sort_index_file(index); - index_sequence_file = fopen(index_sequence_filename, "w"); - if(index_sequence_file == NULL) { perror(index_sequence_filename); return EXIT_FAILURE; } - err += ffindex_write(index, index_sequence_file); - } - - return err; -} - -/* vim: ts=2 sw=2 et: */ diff -Nru hhsuite-3.0~beta2+dfsg/src/hhalign.cpp hhsuite-3.0~beta3+dfsg/src/hhalign.cpp --- hhsuite-3.0~beta2+dfsg/src/hhalign.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhalign.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -1,5 +1,5 @@ -// hhalign.C: -// Align a multiple alignment to an alignment or HMM +// hhalign.C: +// Align a multiple alignment to an alignment or HMM // Print out aligned input sequences in a3m format // Error codes: 0: ok 1: file format error 2: file access error 3: memory error 4: internal numeric error 5: command line error @@ -50,7 +50,7 @@ printf("%s", COPYRIGHT); printf("\n"); - printf("Usage: hhalign -i query [-t template] [options] \n"); + printf("Usage: hhalign -i query -t template [options] \n"); printf(" -i input/query: single sequence or multiple sequence alignment (MSA)\n"); printf(" in a3m, a2m, or FASTA format, or HMM in hhm format\n"); printf(" -t input/template: single sequence or multiple sequence alignment (MSA)\n"); @@ -528,7 +528,7 @@ par.realign = 1; else if (!strcmp(argv[i], "-norealign")) par.realign = 0; - else if (!strcmp(argv[i], "-M") && (i < argc - 1)) + else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { //TODO: M a3m not defined in the help if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], 
"a3m")){ par.M = 1; @@ -545,6 +545,7 @@ } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; + } else if (!strcmp(argv[i], "-shift") && (i < argc - 1)) par.shift = atof(argv[++i]); else if (!strcmp(argv[i], "-mact") && (i < argc - 1)) { diff -Nru hhsuite-3.0~beta2+dfsg/src/hhalignment.cpp hhsuite-3.0~beta3+dfsg/src/hhalignment.cpp --- hhsuite-3.0~beta2+dfsg/src/hhalignment.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhalignment.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -215,17 +215,15 @@ ///////////////////////////////////////////////////////////////////////// // Read infile line by line - while (firstline || (fgetline(line, LINELEN, inf) && k < MAXSEQ)) { + while (firstline || (fgetline(line, LINELEN, inf))) { linenr++; firstline = NULL; if (line[0] == '>') //line contains sequence name { if (k >= MAXSEQ - 1) { - if (k >= MAXSEQ) { - HH_LOG(WARNING) << "Maximum number " << MAXSEQ - << " of sequences exceeded in file " - << infile << std::endl; - } + HH_LOG(WARNING) << "Maximum number " << MAXSEQ + << " of sequences exceeded in file " + << infile << std::endl; break; } @@ -496,7 +494,6 @@ } cur_seq[l] = '\0'; //Ensure that cur_seq ends with a '\0' character } //end else - } ///////////////////////////////////////////////////////////////////////// @@ -902,7 +899,7 @@ case 1: default: - // Warn if alignment is ment to be -M first or -M <%> instead of A2M/A3M + // Warn if alignment is meant to be -M first or -M <%> instead of A2M/A3M // Seed/query sequence contains a gap ... if (strchr(seq[kfirst], '-')) { unsigned int len = strlen(seq[kfirst]) - 1; @@ -1082,13 +1079,14 @@ for (l = 1; l <= L; l++) { float res = 0; float gap = 0; - for (k = 0; k < N_in; ++k) + for (k = 0; k < N_in; ++k) { if (keep[k]) { if (X[k][l] < GAP) res += wg[k]; // AA or ANY; Changed from = maxres - 2) { HH_LOG(WARNING) @@ -1115,6 +1115,13 @@ seq[k][h[k]++] = MatchChr(seq[k][l]); X[k][i] = X[k][l]; I[k][i] = 0; + + //kfirst might not be copied in the case of consensus sequences + //so it will be deleted and kfirst will be set to 0 + //kfirst needs to be set to the next following sequence + if (kfirst == -1) { + kfirst = k; + } } else if (k == kss_dssp || k == kss_pred) { seq[k][h[k]++] = MatchChr(seq[k][l]); X[k][i] = ss2i(seq[k][l]); @@ -1125,6 +1132,10 @@ seq[k][h[k]++] = seq[k][l]; X[k][i] = cf2i(seq[k][l]); } + //consensus sequence, keep[kfirst] == 0 + else if (k == kfirst) { + kfirst = -1; + } } } else { for (k = 0; k < N_in; ++k) @@ -1267,12 +1278,12 @@ exit(1); } - // Avert user about -cons option? + // Avert user about -add_cons option? 
if (!cons) { for (i = 1; i <= L; ++i) if (X[kfirst][i] == GAP) { HH_LOG(INFO) - << "NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n"; + << "NOTE: Use the '-add_cons' option to calculate a consensus sequence as first sequence of the alignment with hhconsensus or hhmake.\n"; break; } } @@ -1295,13 +1306,17 @@ for (i = 1; i <= L; ++i) HH_LOG(DEBUG1) << char(i2aa(X[k][i])); HH_LOG(DEBUG1) << "\n"; - for (i = 1; i <= L; ++i) - if (I[k][i] == 0) + for (i = 1; i <= L; ++i) { + if (I[k][i] == 0) { HH_LOG(DEBUG1) << "-"; - else if (I[k][i] > 9) + } + else if (I[k][i] > 9) { HH_LOG(DEBUG1) << "X"; - else - HH_LOG(DEBUG1) << I[k][i]; + } + else { + HH_LOG(DEBUG1) << I[k][i]; + } + } } HH_LOG(DEBUG1) << "\n"; } @@ -1443,6 +1458,7 @@ n_display++; } delete[] dummy; + return n_display; } @@ -1473,7 +1489,7 @@ int new_N_in = N_in; int new_k = 0; for(int k = 0; k < N_in; k++) { - if(keep[k] == 0 && k != new_kss_dssp && k != new_ksa_dssp && k != new_kss_pred && k != new_kss_conf && k != new_kfirst) { + if(keep[k] == 0 && k != kss_dssp && k != ksa_dssp && k != kss_pred && k != kss_conf && k != kfirst) { free(X[k]); delete[] I[k]; delete[] sname[k]; @@ -3164,13 +3180,15 @@ } // Check whether number of columns in subalignment is sufficient - if (ncol >= NCOLMIN) + if (ncol >= NCOLMIN) { // Take global weights - for (k = 0; k < N_in; ++k) + for (k = 0; k < N_in; ++k) { if (in[k] && I[k][i] > 0) wi[k] = wg[k]; else wi[k] = 0.0; + } + } // Calculate Neff[i] Neff[i] = 0.0; @@ -3369,13 +3387,15 @@ } // Check whether number of columns in subalignment is sufficient - if (ncol < NCOLMIN) + if (ncol < NCOLMIN) { // Take global weights - for (k = 0; k < N_in; ++k) + for (k = 0; k < N_in; ++k) { if (in[k] && X[k][i] == GAP) wi[k] = wg[k]; else wi[k] = 0.0; + } + } // Calculate Neff[i] Neff[i] = 0.0; diff -Nru hhsuite-3.0~beta2+dfsg/src/hhblits.cpp hhsuite-3.0~beta3+dfsg/src/hhblits.cpp --- hhsuite-3.0~beta2+dfsg/src/hhblits.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhblits.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -182,7 +182,7 @@ par.num_rounds = 1; else if (par.num_rounds > 8) { if (v >= 1) { - HH_LOG(WARNING) << "Number of iterations (" << par.num_rounds << ") to large => Set to 8 iterations\n"; + HH_LOG(WARNING) << "Number of iterations (" << par.num_rounds << ") too large => Set to 8 iterations\n"; } par.num_rounds = 8; } @@ -330,7 +330,7 @@ printf(" -maxfilt max number of hits allowed to pass 2nd prefilter (default=%i) \n", par.maxnumdb); printf(" -min_prefilter_hits min number of hits to pass prefilter (default=%i) \n", par.min_prefilter_hits); printf(" -prepre_smax_thresh min score threshold of ungapped prefilter (default=%i) \n", par.preprefilter_smax_thresh); - printf(" -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=%i)\n", par.prefilter_evalue_thresh); + printf(" -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=%.1f)\n", par.prefilter_evalue_thresh); printf(" -pre_bitfactor prefilter scores are in units of 1 bit / pre_bitfactor (default=%i)\n", par.prefilter_bit_factor); printf(" -pre_gap_open gap open penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_open); printf(" -pre_gap_extend gap extend penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_extend); @@ -457,7 +457,7 @@ printf(" becomes larger than neffmax (default=%.1f)\n", par.neffmax); printf(" -cpu number of CPUs to use (for shared memory 
SMPs) (default=%i) \n", par.threads); if (all) { - printf(" -scores write scores for all pairwise comparisions to file \n"); + printf(" -scores write scores for all pairwise comparisons to file \n"); printf(" -filter_matrices filter matrices for similarity to output at most 100 matrices\n"); printf(" -atab write all alignments in tabular layout to file \n"); printf(" -maxres max number of HMM columns (def=%5i) \n", par.maxres); @@ -469,7 +469,12 @@ printf("An extended list of options can be obtained by calling 'hhblits -h all'\n"); printf("\n"); } - printf("Example: %s -i query.fas -oa3m query.a3m -n 1 \n", program_name); + printf("Examples:\n"); + printf("hhblits -i query.fas -o query.hhr -d ./uniclust30\n"); + printf("\n"); + printf("hhblits -i query.fas -o query.hhr -oa3m query.a3m -n 1 -d ./uniclust30\n"); + printf("\n"); + printf("Download databases from .\n"); } @@ -494,6 +499,10 @@ exit(4); } else { std::string db(argv[i]); + if (HHDatabase::checkDatabaseConflicts(argv[i])) { + HH_LOG(ERROR) << "Ambiguous database basename. Choose either an A3M or a CA3M database." << std::endl; + exit(4); + } par.db_bases.push_back(db); } } else if (!strcmp(argv[i], "-contxt") @@ -640,7 +649,7 @@ par.matrix = 80; else HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl; - } else if (!strcmp(argv[i], "-M") && (i < argc - 1)) + } else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) @@ -650,6 +659,7 @@ par.M = 2; } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; + } else if (!strcmp(argv[i], "-p") && (i < argc - 1)) par.p = atof(argv[++i]); else if (!strcmp(argv[i], "-E") && (i < argc - 1)) @@ -1336,6 +1346,34 @@ new_hits++; } + if (new_hits == 0 || round == par.num_rounds) { + if (round < par.num_rounds) { + HH_LOG(INFO) << "No new hits found in iteration " << round + << " => Stop searching" << std::endl; + } + + if (old_entries.size() > 0 && par.realign_old_hits) { + HH_LOG(INFO) + << "Rescoring previously found HMMs with Viterbi algorithm" + << std::endl; + + ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads); + std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec, + old_entries, + par.qsc_db, pb, + S, Sim, R, par.ssm, S73, S33, S37); + add_hits_to_hitlist(hits_to_add, hitlist); + // Add dbfiles_old to dbfiles_new for realign + new_entries.insert(new_entries.end(), old_entries.begin(), + old_entries.end()); + } else if (!par.realign_old_hits && previous_hits->Size() > 0) { + HH_LOG(INFO) + << "Rescoring previously found HMMs with Viterbi algorithm" + << std::endl; + RescoreWithViterbiKeepAlignment(q_vec, previous_hits); + } + } + // Realign hits with MAC algorithm if (par.realign) perform_realign(q_vec, input_format, new_entries); @@ -1418,32 +1456,32 @@ } if (new_hits == 0 || round == par.num_rounds || q->Neff_HMM > par.neffmax || Qali->N_in >= MAXSEQ) { - if (new_hits == 0 && round < par.num_rounds) { - HH_LOG(INFO) << "No new hits found in iteration " << round - << " => Stop searching" << std::endl; - } - - if (old_entries.size() > 0 && par.realign_old_hits) { - HH_LOG(INFO) - << "Recalculating previously found HMMs with Viterbi algorithm" - << std::endl; - - ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads); - std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec, - old_entries, - par.qsc_db, pb, - S, Sim, R, par.ssm, S73, S33, S37); - - add_hits_to_hitlist(hits_to_add, hitlist); - - if (par.realign) - 
perform_realign(q_vec, input_format, old_entries); - } else if (!par.realign_old_hits && previous_hits->Size() > 0) { - HH_LOG(INFO) - << "Rescoring previously found HMMs with Viterbi algorithm" - << std::endl; - RescoreWithViterbiKeepAlignment(q_vec, previous_hits); - } +// if (new_hits == 0 && round < par.num_rounds) { +// HH_LOG(INFO) << "No new hits found in iteration " << round +// << " => Stop searching" << std::endl; +// } +// +// if (old_entries.size() > 0 && par.realign_old_hits) { +// HH_LOG(INFO) +// << "Recalculating previously found HMMs with Viterbi algorithm" +// << std::endl; +// +// ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads); +// std::vector hits_to_add = viterbirunner.alignment(par, &q_vec, +// old_entries, +// par.qsc_db, pb, +// S, Sim, R, par.ssm, S73, S33, S37); +// +// add_hits_to_hitlist(hits_to_add, hitlist); +// +// if (par.realign) +// perform_realign(q_vec, input_format, old_entries); +// } else if (!par.realign_old_hits && previous_hits->Size() > 0) { +// HH_LOG(INFO) +// << "Rescoring previously found HMMs with Viterbi algorithm" +// << std::endl; +// RescoreWithViterbiKeepAlignment(q_vec, previous_hits); +// } delete q_rescore; break; diff -Nru hhsuite-3.0~beta2+dfsg/src/hhblits.h hhsuite-3.0~beta3+dfsg/src/hhblits.h --- hhsuite-3.0~beta2+dfsg/src/hhblits.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhblits.h 2017-07-14 10:34:03.000000000 +0000 @@ -28,7 +28,10 @@ #ifdef OPENMP #include #endif + +#ifdef SSE #include +#endif using std::cout; using std::cerr; @@ -89,16 +92,16 @@ class HHblits { public: - HHblits(Parameters& parameters, std::vector& databases); - virtual ~HHblits(); + HHblits(Parameters& parameters, std::vector& databases); + virtual ~HHblits(); - void Reset(); + void Reset(); - static void ProcessAllArguments(int argc, char** argv, Parameters& par); + static void ProcessAllArguments(int argc, char** argv, Parameters& par); - //print methods for hhalign and hhblits - void printHitList(); - void printHHRFile(); + //print methods for hhalign and hhblits + void printHitList(); + void printHHRFile(); //writer for non-mpi version void writeHHRFile(char* hhrFile); diff -Nru hhsuite-3.0~beta2+dfsg/src/hhblits_mpi.cpp hhsuite-3.0~beta3+dfsg/src/hhblits_mpi.cpp --- hhsuite-3.0~beta2+dfsg/src/hhblits_mpi.cpp 1970-01-01 00:00:00.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhblits_mpi.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -0,0 +1,246 @@ +/* + * hhblits_mpi.cpp + * + * Created on: Apr 1, 2014 + * Author: Markus Meier (markus.meier@mpibpc.mpg.de) + */ + +#include +#include +#include +#include "hhdecl.h" +#include "hhblits.h" + +extern "C" { + #include + #include +} +#ifdef OPENMP + #include +#endif + + + +struct OutputFFIndex { + char base[NAMELEN]; + FILE* data_fh; + FILE* index_fh; + size_t offset; + void (*print)(HHblits&, std::stringstream&); + + void close() { + fclose(data_fh); + fclose(index_fh); + } + + void saveOutput(HHblits& hhblits, char* name) { + std::stringstream out; + print(hhblits, out); + + std::string tmp = out.str(); + ffindex_insert_memory(data_fh, index_fh, &offset, + const_cast(tmp.c_str()), tmp.size(), name); + + fflush(data_fh); + fflush(index_fh); + } +}; + +void makeOutputFFIndex(char* par, const int mpi_rank, + void (*print)(HHblits&, std::stringstream&), + std::vector& outDatabases) { + if (*par) { + OutputFFIndex db; + + strcpy(db.base, par); + db.offset = 0; + db.print = print; + + char data_filename_out_rank[NAMELEN]; + char index_filename_out_rank[NAMELEN]; + + 
+    snprintf(data_filename_out_rank, FILENAME_MAX, "%s.ffdata.%d", par,
+        mpi_rank);
+    snprintf(index_filename_out_rank, FILENAME_MAX, "%s.ffindex.%d", par,
+        mpi_rank);
+
+    db.data_fh = fopen(data_filename_out_rank, "w+");
+    db.index_fh = fopen(index_filename_out_rank, "w+");
+
+    if (db.data_fh == NULL) {
+      HH_LOG(WARNING) << "Could not open datafile " << data_filename_out_rank << std::endl;
+      return;
+    }
+
+    if (db.index_fh == NULL) {
+      HH_LOG(WARNING) << "Could not open indexfile " << index_filename_out_rank << std::endl;
+      return;
+    }
+
+    outDatabases.push_back(db);
+  }
+}
+
+void merge_splits(const char* prefix) {
+  if (*prefix) {
+    char data_filename[FILENAME_MAX];
+    char index_filename[FILENAME_MAX];
+
+    snprintf(data_filename, FILENAME_MAX, "%s.ffdata", prefix);
+    snprintf(index_filename, FILENAME_MAX, "%s.ffindex", prefix);
+
+    ffmerge_splits(data_filename, index_filename, 1, MPQ_size - 1, true);
+  }
+}
+
+struct HHblits_MPQ_Wrapper {
+  char *data;
+  ffindex_index_t* index;
+  HHblits* hhblits;
+  std::vector<OutputFFIndex>* outputDatabases;
+
+  HHblits_MPQ_Wrapper(char* data, ffindex_index_t* index, HHblits& hhblits, std::vector<OutputFFIndex>& outputDatabases) {
+    this->data = data;
+    this->index = index;
+    this->hhblits = &hhblits;
+    this->outputDatabases = &outputDatabases;
+  }
+
+  void Payload(const size_t start, const size_t end) {
+    // Foreach entry in the input file
+    for (size_t entry_index = start; entry_index < end; entry_index++) {
+      ffindex_entry_t* entry = ffindex_get_entry_by_index(index, entry_index);
+      if (entry == NULL) {
+        continue;
+      }
+
+      hhblits->Reset();
+
+      FILE* inf = ffindex_fopen_by_entry(data, entry);
+      hhblits->run(inf, entry->name);
+      fclose(inf);
+
+      for (size_t i = 0; i < outputDatabases->size(); i++) {
+        outputDatabases->operator [](i).saveOutput(*hhblits, entry->name);
+      }
+    }
+  }
+};
+
+void static payload(void* env, const size_t start, const size_t end) {
+  HHblits_MPQ_Wrapper* hhblits_wrapper = (HHblits_MPQ_Wrapper*)env;
+  hhblits_wrapper->Payload(start, end);
+}
+
+int main(int argc, char **argv) {
+  Parameters par;
+  HHblits::ProcessAllArguments(argc, argv, par);
+
+  //hhblits_mpi will be parallelized with openmpi, no other parallelization
+  par.threads = 1;
+  #ifdef OPENMP
+    omp_set_num_threads(par.threads);
+  #endif
+
+  char data_filename[NAMELEN];
+  char index_filename[NAMELEN];
+
+  strcpy(data_filename, par.infile);
+  strcat(data_filename, ".ffdata");
+
+  strcpy(index_filename, par.infile);
+  strcat(index_filename, ".ffindex");
+
+  FILE *data_file = fopen(data_filename, "r");
+  FILE *index_file = fopen(index_filename, "r");
+
+  if (data_file == NULL) {
+    HH_LOG(ERROR) << "Input data file " << data_filename << " does not exist!" << std::endl;
+    MPI_Finalize();
+    exit(EXIT_FAILURE);
+  }
+  if (index_file == NULL) {
+    HH_LOG(ERROR) << "Input index file " << index_filename << " does not exist!"
+        << std::endl;
+    MPI_Finalize();
+    exit(EXIT_FAILURE);
+  }
+
+  //init input ffindex
+  size_t data_size;
+  char *data = ffindex_mmap_data(data_file, &data_size);
+
+  size_t number_input_index_lines = CountLinesInFile(index_filename);
+  ffindex_index_t* index = ffindex_index_parse(index_file, number_input_index_lines);
+  if (index == NULL) {
+    HH_LOG(ERROR) << "Could not parse index from " << index_filename << std::endl;
+    MPI_Finalize();
+    exit(EXIT_FAILURE);
+  }
+
+  int mpq_status = MPQ_Init(argc, argv, index->n_entries);
+
+  if (mpq_status == MPQ_SUCCESS) {
+    if (MPQ_rank == MPQ_MASTER) {
+      MPQ_Master(1);
+    } else {
+      std::vector<OutputFFIndex> outputDatabases;
+      makeOutputFFIndex(par.outfile, MPQ_rank, &HHblits::writeHHRFile,
+          outputDatabases);
+      makeOutputFFIndex(par.scorefile, MPQ_rank, &HHblits::writeScoresFile,
+          outputDatabases);
+      makeOutputFFIndex(par.pairwisealisfile, MPQ_rank,
+          &HHblits::writePairwiseAlisFile, outputDatabases);
+      makeOutputFFIndex(par.alitabfile, MPQ_rank, &HHblits::writeAlitabFile,
+          outputDatabases);
+      makeOutputFFIndex(par.psifile, MPQ_rank, &HHblits::writePsiFile,
+          outputDatabases);
+      makeOutputFFIndex(par.hhmfile, MPQ_rank, &HHblits::writeHMMFile,
+          outputDatabases);
+      makeOutputFFIndex(par.alnfile, MPQ_rank, &HHblits::writeA3MFile,
+          outputDatabases);
+      makeOutputFFIndex(par.matrices_output_file, MPQ_rank, &HHblits::writeMatricesFile,
+          outputDatabases);
+      makeOutputFFIndex(par.m8file, MPQ_rank, &HHblits::writeM8,
+          outputDatabases);
+
+      std::vector<HHblitsDatabase*> databases;
+      HHblits::prepareDatabases(par, databases);
+
+      HHblits hhblits(par, databases);
+
+      HHblits_MPQ_Wrapper* wrapper = new HHblits_MPQ_Wrapper(data, index, hhblits, outputDatabases);
+      MPQ_Worker(payload, wrapper);
+      delete wrapper;
+
+      fclose(data_file);
+      fclose(index_file);
+
+      for (size_t i = 0; i < outputDatabases.size(); i++) {
+        outputDatabases[i].close();
+      }
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (MPQ_rank == MPQ_MASTER) {
+      merge_splits(par.outfile);
+      merge_splits(par.scorefile);
+      merge_splits(par.pairwisealisfile);
+      merge_splits(par.alitabfile);
+      merge_splits(par.psifile);
+      merge_splits(par.hhmfile);
+      merge_splits(par.alnfile);
+      merge_splits(par.matrices_output_file);
+      merge_splits(par.m8file);
+    }
+  } else {
+    if (mpq_status == MPQ_ERROR_NO_WORKERS) {
+      fprintf(stderr, "MPQ_Init: Needs at least one worker process.\n");
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  MPI_Finalize();
+  return EXIT_SUCCESS;
+}
+
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhblits_omp.cpp hhsuite-3.0~beta3+dfsg/src/hhblits_omp.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhblits_omp.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhblits_omp.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -28,8 +28,13 @@
   void (*print)(HHblits&, std::stringstream&);
 
   void close() {
-    fclose(data_fh);
+    char index_filename[NAMELEN];
+    snprintf(index_filename, FILENAME_MAX, "%s.ffindex", base);
+
     fclose(index_fh);
+    fclose(data_fh);
+
+    ffsort_index(index_filename);
   }
 
   void saveOutput(HHblits& hhblits, char* name) {
@@ -44,33 +49,6 @@
     fflush(index_fh);
     number_entries++;
   }
-
-  void sort() {
-    /* Sort the index entries and write back */
-    char index_filename[NAMELEN];
-    snprintf(index_filename, FILENAME_MAX, "%s.ffindex", base);
-
-    rewind(index_fh);
-    ffindex_index_t* index = ffindex_index_parse(index_fh, number_entries);
-    fclose(index_fh);
-
-    if (index == NULL) {
-      HH_LOG(ERROR) << "Could not read index from " << index_filename << " for sorting!"
<< std::endl; - return; - } - - ffindex_sort_index_file(index); - - index_fh = fopen(index_filename, "w"); - - if (index_fh == NULL) { - HH_LOG(ERROR) << "Could not open " << index_filename << " for sorting!" << std::endl; - return; - } - - ffindex_write(index, index_fh); - free(index); - } }; @@ -236,7 +214,6 @@ databases.clear(); for (size_t i = 0; i < outputDatabases.size(); i++) { - outputDatabases[i].sort(); outputDatabases[i].close(); } } diff -Nru hhsuite-3.0~beta2+dfsg/src/hhconsensus.cpp hhsuite-3.0~beta3+dfsg/src/hhconsensus.cpp --- hhsuite-3.0~beta2+dfsg/src/hhconsensus.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhconsensus.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -33,7 +33,7 @@ #include "hhhalfalignment.h" // class HalfAlignment #include "hhfunc.h" // some functions common to hh programs ///////////////////////////////////////////////////////////////////////////////////// -// Global variables +// Global variables ///////////////////////////////////////////////////////////////////////////////////// Parameters par; @@ -206,7 +206,7 @@ par.coverage = atoi(argv[++i]); else if (!strcmp(argv[i], "-diff") && (i < argc - 1)) par.Ndiff = atoi(argv[++i]); - else if (!strcmp(argv[i], "-M") && (i < argc - 1)) + else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) @@ -217,6 +217,7 @@ } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; + } else if (!strcmp(argv[i], "-Gonnet")) par.matrix = 0; else if (!strncmp(argv[i], "-BLOSUM", 7) @@ -300,7 +301,7 @@ strcpy(par.infile, ""); strcpy(par.outfile, ""); strcpy(par.alnfile, ""); - + //Default parameter settings par.nseqdis = MAXSEQ - 1; // maximum number of sequences to be written par.showcons = 0; @@ -316,7 +317,7 @@ par.argv = argv; par.argc = argc; RemovePathAndExtension(program_name, argv[0]); - + // Enable changing verbose mode before defaults file and command line are processed int v = 2; for (int i = 1; i < argc; i++) { @@ -408,13 +409,13 @@ q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } - + q->CalculateAminoAcidBackground(pb); - + if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs( par.half_window_size_local_aa_bg_freqs, pb); - + // Write consensus sequence to sequence file // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions() if (*par.outfile) { @@ -463,4 +464,3 @@ pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); } - diff -Nru hhsuite-3.0~beta2+dfsg/src/hhdatabase.cpp hhsuite-3.0~beta3+dfsg/src/hhdatabase.cpp --- hhsuite-3.0~beta2+dfsg/src/hhdatabase.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhdatabase.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -78,7 +78,28 @@ strcat(databaseName, suffix); } -HHblitsDatabase::HHblitsDatabase(const char* base) { +bool HHDatabase::checkDatabaseConflicts(const char* base) { + + char a3m_index_filename[NAMELEN]; + char a3m_data_filename[NAMELEN]; + + char ca3m_index_filename[NAMELEN]; + char ca3m_data_filename[NAMELEN]; + + buildDatabaseName(base, "a3m", ".ffdata", a3m_data_filename); + buildDatabaseName(base, "a3m", ".ffindex", a3m_index_filename); + + buildDatabaseName(base, "ca3m", ".ffdata", ca3m_data_filename); + buildDatabaseName(base, "ca3m", ".ffindex", ca3m_index_filename); + + if (file_exists(ca3m_index_filename) && 
file_exists(ca3m_data_filename) + && file_exists(a3m_index_filename) && file_exists(a3m_data_filename)) { + return true; + } + return false; +} + +HHblitsDatabase::HHblitsDatabase(const char* base, bool initCs219) { cs219_database = NULL; a3m_database = NULL; @@ -93,32 +114,53 @@ basename = new char[strlen(base) + 1]; strcpy(basename, base); - char cs219_index_filename[NAMELEN]; - char cs219_data_filename[NAMELEN]; + if (initCs219) { + char cs219_index_filename[NAMELEN]; + char cs219_data_filename[NAMELEN]; - buildDatabaseName(base, "cs219", ".ffdata", cs219_data_filename); - buildDatabaseName(base, "cs219", ".ffindex", cs219_index_filename); + buildDatabaseName(base, "cs219", ".ffdata", cs219_data_filename); + buildDatabaseName(base, "cs219", ".ffindex", cs219_index_filename); - cs219_database = new FFindexDatabase(cs219_data_filename, - cs219_index_filename, use_compressed); + cs219_database = new FFindexDatabase(cs219_data_filename, + cs219_index_filename, use_compressed); + } if (!checkAndBuildCompressedDatabase(base)) { char a3m_index_filename[NAMELEN]; char a3m_data_filename[NAMELEN]; - char hhm_index_filename[NAMELEN]; - char hhm_data_filename[NAMELEN]; - buildDatabaseName(base, "a3m", ".ffdata", a3m_data_filename); buildDatabaseName(base, "a3m", ".ffindex", a3m_index_filename); + if (file_exists(a3m_data_filename) && file_exists(a3m_index_filename)) { + a3m_database = new FFindexDatabase(a3m_data_filename, a3m_index_filename, + use_compressed); + } + + char hhm_index_filename[NAMELEN]; + char hhm_data_filename[NAMELEN]; + buildDatabaseName(base, "hhm", ".ffdata", hhm_data_filename); buildDatabaseName(base, "hhm", ".ffindex", hhm_index_filename); - a3m_database = new FFindexDatabase(a3m_data_filename, a3m_index_filename, - use_compressed); - hhm_database = new FFindexDatabase(hhm_data_filename, hhm_index_filename, - use_compressed); + if (file_exists(hhm_data_filename) && file_exists(hhm_index_filename)) { + hhm_database = new FFindexDatabase(hhm_data_filename, hhm_index_filename, + use_compressed); + } + + if (a3m_database == NULL && hhm_database == NULL) { + HH_LOG(ERROR) << "Could find neither hhm_db nor a3m_db!" 
<< std::endl;
+      exit(1);
+    } else if (a3m_database != NULL && hhm_database == NULL) {
+      query_database = a3m_database;
+    } else if (a3m_database == NULL && hhm_database != NULL) {
+      query_database = hhm_database;
+    } else {
+      // both exist, use the a3m
+      query_database = a3m_database;
+    }
+  } else {
+    query_database = cs219_database;
   }
 
   prefilter = NULL;
@@ -148,7 +190,7 @@
 void HHblitsDatabase::initNoPrefilter(std::vector<HHEntry*>& new_entries) {
   std::vector<std::pair<int, std::string> > new_entry_names;
-  hh::Prefilter::init_no_prefiltering(cs219_database, new_entry_names);
+  hh::Prefilter::init_no_prefiltering(query_database, new_entry_names);
 
   getEntriesFromNames(new_entry_names, new_entries);
 }
@@ -214,14 +256,16 @@
         entry);
     entries.push_back(hhentry);
   } else {
-    entry = ffindex_get_entry_by_name(
-        hhm_database->db_index, const_cast<char*>(hits[i].second.c_str()));
-
-    if (entry != NULL) {
-      HHEntry* hhentry = new HHDatabaseEntry(hits[i].first, this,
-          hhm_database, entry);
-      entries.push_back(hhentry);
-      continue;
+    if (hhm_database != NULL) {
+      entry = ffindex_get_entry_by_name(
+          hhm_database->db_index, const_cast<char*>(hits[i].second.c_str()));
+
+      if (entry != NULL) {
+        HHEntry* hhentry = new HHDatabaseEntry(hits[i].first, this,
+            hhm_database, entry);
+        entries.push_back(hhentry);
+        continue;
+      }
     }
 
     entry = ffindex_get_entry_by_name(
@@ -400,6 +444,11 @@
   tali.Compress(entry->name, par.cons, par.maxres, par.maxcol, par.M_template,
       par.Mgaps);
+
+  if(tali.L > sequence_length) {
+    HH_LOG(ERROR) << "sequence length (" << sequence_length << ") does not fit to read MSA (match states: "<< tali.L << ") of file " << getName() << "!" << std::endl;
+    HH_LOG(ERROR) << "\tYour cs219 states might not fit your multiple sequence alignments." << std::endl;
+  }
 }
 
 void HHEntry::getTemplateHMM(FILE* dbf, char* name, Parameters& par,
@@ -413,10 +462,10 @@
     HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
     HH_LOG(ERROR) << "\tThis should not happen!" << std::endl;
   }
-
-  while (strscn(line) == NULL)
-    fgetline(line, LINELEN, dbf);  // skip lines that contain only white space
-
+  while (strscn(line) == NULL) {
+
+    if (!fgetline(line, LINELEN, dbf)) break;  // skip lines that contain only white space
+  }
   // read HMMER3 format
   if (!strncmp(line, "HMMER3", 6)) {
     format = 1;
@@ -461,6 +510,11 @@
       exit(1);
     }
   }
+
+  if(t->L > sequence_length) {
+    HH_LOG(ERROR) << "sequence length (" << sequence_length << ") does not fit to read MSA (match states: "<< t->L << ") of file " << getName() << "!" << std::endl;
+    HH_LOG(ERROR) << "\tYour cs219 states might not fit your multiple sequence alignments."
+        << std::endl;
+  }
 }
 
 char* HHDatabaseEntry::getName() {
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhdatabase.h hhsuite-3.0~beta3+dfsg/src/hhdatabase.h
--- hhsuite-3.0~beta2+dfsg/src/hhdatabase.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhdatabase.h	2017-07-14 10:34:03.000000000 +0000
@@ -14,7 +14,9 @@
 
 class FFindexDatabase;
 
+
 extern "C" {
+#include
 #include <ffindex.h>
 }
 
@@ -25,6 +27,7 @@
 #include "log.h"
 #include "hhalignment.h"
 
+
 class FFindexDatabase {
   public:
     FFindexDatabase(char* data_filename, char* index_filename, bool isCompressed);
@@ -46,14 +49,16 @@
     HHDatabase();
     virtual ~HHDatabase();
 
-  protected:
-    void buildDatabaseName(const char* base, const char* extension,
-        const char* suffix, char* databaseName);
+    static bool checkDatabaseConflicts(const char* base);
+
+protected:
+    static void buildDatabaseName(const char* base, const char* extension,
+        const char* suffix, char* databaseName);
 };
 
 class HHblitsDatabase: HHDatabase {
   public:
-    HHblitsDatabase(const char* base);
+    HHblitsDatabase(const char* base, bool initCs219 = true);
     ~HHblitsDatabase();
 
     void initPrefilter(const char* cs_library);
@@ -70,7 +75,6 @@
         const float R[20][20], std::vector<HHEntry*>& new_entries,
         std::vector<HHEntry*>& old_entries);
 
-
     char* basename;
 
     FFindexDatabase* cs219_database;
@@ -78,6 +82,8 @@
     FFindexDatabase* a3m_database;
     FFindexDatabase* hhm_database;
 
+    FFindexDatabase* query_database;
+
     bool use_compressed;
    FFindexDatabase* ca3m_database;
     FFindexDatabase* sequence_database;
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhdecl.cpp hhsuite-3.0~beta3+dfsg/src/hhdecl.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhdecl.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhdecl.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -13,7 +13,7 @@
   if (getenv("HHLIB"))
     strcpy(hhlib, getenv("HHLIB"));
   else
-    strcpy(hhlib, "/usr/lib");
+    strcpy(hhlib, "/usr/lib/hhsuite");
 
   strcat(strcpy(hhdata, hhlib), "/data");
   strcat(strcpy(clusterfile, hhdata), "/context_data.crf");
@@ -29,27 +29,25 @@
     Pathname(program_path, argv[0]);
     /* we did not find HHLIB, if called with full path or in dist dir, we can try relative to program path */
-    if (program_path != NULL) {
-      strcat(strcpy(hhlib, program_path), "../lib/hh");
+      strcat(strcpy(hhlib, program_path), "../lib/hhsuite");
+      strcat(strcpy(hhdata, hhlib), "/data");
+      strcat(strcpy(clusterfile, hhdata), "/context_data.crf");
+      strcat(strcpy(cs_library, hhdata), "/cs219.lib");
+      testf = fopen(cs_library, "r");
+      if (testf)
+        fclose(testf);
+      else {
+        HH_LOG(DEBUG) << "WARNING in HHsuite: Could not open " << cs_library << "\n";
+
+        strcat(strcpy(hhlib, program_path), "..");
         strcat(strcpy(hhdata, hhlib), "/data");
         strcat(strcpy(clusterfile, hhdata), "/context_data.crf");
         strcat(strcpy(cs_library, hhdata), "/cs219.lib");
         testf = fopen(cs_library, "r");
         if (testf)
           fclose(testf);
-        else {
+        else
           HH_LOG(DEBUG) << "WARNING in HHsuite: Could not open " << cs_library << "\n";
-
-          strcat(strcpy(hhlib, program_path), "..");
-          strcat(strcpy(hhdata, hhlib), "/data");
-          strcat(strcpy(clusterfile, hhdata), "/context_data.crf");
-          strcat(strcpy(cs_library, hhdata), "/cs219.lib");
-          testf = fopen(cs_library, "r");
-          if (testf)
-            fclose(testf);
-          else
-            HH_LOG(DEBUG) << "WARNING in HHsuite: Could not open " << cs_library << "\n";
-      }
     }
   }
   if (!testf) {
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhdecl.h hhsuite-3.0~beta3+dfsg/src/hhdecl.h
--- hhsuite-3.0~beta2+dfsg/src/hhdecl.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhdecl.h	2017-07-14 10:34:03.000000000 +0000
@@ -8,7 +8,6 @@
 #ifndef HHDECL_H_
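The hhdecl.cpp hunk above changes the library lookup to a chain of probes: $HHLIB if set, the new Debian default /usr/lib/hhsuite, then ../lib/hhsuite and .. relative to the program path, each candidate verified by opening data/cs219.lib. A condensed sketch of that probe order, assuming a plain fopen() test is all that is needed (probe_hhlib and bindir are illustrative names, not from the patch):

    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <vector>

    // Returns the first candidate directory whose data/cs219.lib opens,
    // mirroring (in condensed form) the fallback chain in hhdecl.cpp.
    static std::string probe_hhlib(const std::string& bindir) {
      std::vector<std::string> candidates;
      if (const char* env = std::getenv("HHLIB"))
        candidates.push_back(env);
      candidates.push_back("/usr/lib/hhsuite");
      candidates.push_back(bindir + "/../lib/hhsuite");
      candidates.push_back(bindir + "/..");

      for (const std::string& dir : candidates) {
        const std::string probe = dir + "/data/cs219.lib";
        if (std::FILE* f = std::fopen(probe.c_str(), "r")) {
          std::fclose(f);
          return dir;  // this directory becomes the effective HHLIB
        }
      }
      return "";  // nothing found; the caller reports the error
    }
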
#define HHDECL_H_ -#include #include #include #include diff -Nru hhsuite-3.0~beta2+dfsg/src/hhfilter.cpp hhsuite-3.0~beta3+dfsg/src/hhfilter.cpp --- hhsuite-3.0~beta2+dfsg/src/hhfilter.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhfilter.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -1,5 +1,5 @@ // hhfilterC: filter alignment in a2m format with maximum sequence identity of match states and minimum coverage -// +// // (C) Johannes Soeding 2012 // This program is free software: you can redistribute it and/or modify @@ -17,7 +17,7 @@ // We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de -// Reference: +// Reference: // Remmert M., Biegert A., Hauser A., and Soding J. // HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. // Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). @@ -182,7 +182,7 @@ par.Neff = atof(argv[++i]); else if (!strcmp(argv[i], "-Neff") && (i < argc - 1)) par.Neff = atof(argv[++i]); - else if (!strcmp(argv[i], "-M") && (i < argc - 1)) + else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) @@ -193,7 +193,7 @@ } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; - + } else if (!strcmp(argv[i], "-def")) par.readdefaultsfile = 1; else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { @@ -212,11 +212,11 @@ //// MAIN PROGRAM ///////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - Alignment qali; //Create an alignment + Alignment qali; //Create an alignment char* argv_conf[MAXOPT]; // Input arguments from .hhconfig file (first=1: argv_conf[0] is not used) - int argc_conf; // Number of arguments in argv_conf - + int argc_conf; // Number of arguments in argv_conf + strcpy(par.infile, ""); strcpy(par.outfile, ""); @@ -227,7 +227,7 @@ par.argv = argv; par.argc = argc; RemovePathAndExtension(program_name, argv[0]); - + // Enable changing verbose mode before defaults file and command line are processed int v = 2; for (int i = 1; i < argc; i++) { @@ -282,7 +282,7 @@ qali.Read(inf, par.infile, par.mark, par.maxcol, par.nseqdis); fclose(inf); - // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] + // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] qali.Compress(par.infile, par.cons, par.maxres, par.maxcol, par.M, par.Mgaps); @@ -294,7 +294,7 @@ // Remove sequences with seq. 
identity larger than seqid percent (remove the shorter of two)
   qali.N_filtered = qali.Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc, par.Ndiff);
-  
+
   // Atune alignment diversity q.Neff with qsc to value Neff_goal
   if (par.Neff >= 1.0)
     qali.FilterNeff(par.wg, par.mark, par.cons, par.showcons, par.maxres, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
@@ -316,4 +316,3 @@
     printf("Done\n");
   }
 }
-
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhfunc.cpp hhsuite-3.0~beta3+dfsg/src/hhfunc.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhfunc.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhfunc.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -62,6 +62,8 @@
     // and store marked sequences in name[k] and seq[k]
     ali_tmp.Compress(infile, par.cons, par.maxres, par.maxcol, par.M, par.Mgaps);
 
+    ali_tmp.Shrink();
+
     // Sort out the nseqdis most dissimilar sequences for display in the output alignments
     ali_tmp.FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
@@ -76,6 +78,8 @@
     ali_tmp.FrequenciesAndTransitions(q, use_global_weights, par.mark, par.cons, par.showcons, par.maxres, pb, Sim);
+
+    *qali = ali_tmp;
 
     input_format = 0;
   }
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhhit.cpp hhsuite-3.0~beta3+dfsg/src/hhhit.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhhit.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhhit.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -169,7 +169,7 @@
     char state = states[step];
     if (state == MM) {
       char qc = query_mapping[this->i[step]];
-      char tc = template_mapping[j[step]];
+      char tc = template_mapping[this->j[step]];
       alignment_similarity += S[aa2i(qc)][aa2i(tc)];
     }
   }
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhhitlist.cpp hhsuite-3.0~beta3+dfsg/src/hhhitlist.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhhitlist.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhhitlist.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -570,6 +570,11 @@
     const bool filter_matrices, const size_t max_number_matrices, const float S[20][20]) {
   //limit matrices to par.max_number_matrices
   std::vector<Hit> hits;
+  int protein_max_length = 4000;
+
+  if(q->L >= protein_max_length) {
+    return;
+  }
 
   //remove invalid alignments
   const float tolerance = 0.10;
@@ -614,7 +619,7 @@
   for (int index1 = hits.size() - 1; index1 >= 0; index1--) {
     Hit it = hits[index1];
-    if (it.Probab < matix_probability_threshold) {
+    if (it.Probab < matix_probability_threshold || it.L >= protein_max_length) {
       picked_alignments[index1] = false;
       chosen--;
     }
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhhmm.cpp hhsuite-3.0~beta3+dfsg/src/hhhmm.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhhmm.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhhmm.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -1964,9 +1964,9 @@
 int HMM::computeScoreSSMode( HMM *q, HMM *t){
   int returnMode = HMM::NO_SS_INFORMATION;
-  if (q->nss_pred>=0 && t->nss_dssp>=0) returnMode=HMM::PRED_DSSP;
-  else if (q->nss_dssp>=0 && t->nss_pred>=0) returnMode=HMM::DSSP_PRED;
-  else if (q->nss_pred>=0 && t->nss_pred>=0) returnMode=HMM::PRED_PRED;
+  returnMode |= (q->nss_pred>=0 && t->nss_dssp>=0) ? HMM::PRED_DSSP : 0;
+  returnMode |= (q->nss_dssp>=0 && t->nss_pred>=0) ? HMM::DSSP_PRED : 0;
+  returnMode |= (q->nss_pred>=0 && t->nss_pred>=0) ?
HMM::PRED_PRED : 0; return returnMode; } diff -Nru hhsuite-3.0~beta2+dfsg/src/hhhmm.h hhsuite-3.0~beta3+dfsg/src/hhhmm.h --- hhsuite-3.0~beta2+dfsg/src/hhhmm.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhhmm.h 2017-07-14 10:34:03.000000000 +0000 @@ -61,7 +61,7 @@ const static int NO_SS_INFORMATION = 0; const static int PRED_DSSP = 1; const static int DSSP_PRED = 2; - const static int PRED_PRED = 3; + const static int PRED_PRED = 4; // Make a flat copy of q void FlatCopyTo(HMM* t); diff -Nru hhsuite-3.0~beta2+dfsg/src/hhhmmsimd.h hhsuite-3.0~beta3+dfsg/src/hhhmmsimd.h --- hhsuite-3.0~beta2+dfsg/src/hhhmmsimd.h 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhhmmsimd.h 2017-07-14 10:34:03.000000000 +0000 @@ -5,8 +5,9 @@ #include "hhhmm.h" #include "simd.h" +#ifdef SSE #include - +#endif class HMMSimd { diff -Nru hhsuite-3.0~beta2+dfsg/src/hhmake.cpp hhsuite-3.0~beta3+dfsg/src/hhmake.cpp --- hhsuite-3.0~beta2+dfsg/src/hhmake.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhmake.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -17,7 +17,7 @@ // We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de -// Reference: +// Reference: // Remmert M., Biegert A., Hauser A., and Soding J. // HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. // Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). @@ -116,7 +116,7 @@ printf( " Beware of overflows! All these sequences are stored in memory.\n"); printf( - " -cons make consensus sequence master sequence of query MSA \n"); + " -add_cons make consensus sequence master sequence of query MSA \n"); printf( " -name use this name for HMM (default: use name of first sequence) \n"); printf("\n"); @@ -239,7 +239,7 @@ } else if (!strcmp(argv[i], "-seq") && (i < argc - 1)) par.nseqdis = atoi(argv[++i]); - else if (!strncmp(argv[i], "-cons", 5)) + else if (!strncmp(argv[i], "-add_cons", 5)) par.cons = 1; else if (!strncmp(argv[i], "-mark", 5)) par.mark = 1; @@ -261,7 +261,7 @@ par.Neff = atof(argv[++i]); else if (!strcmp(argv[i], "-Neff") && (i < argc - 1)) par.Neff = atof(argv[++i]); - else if (!strcmp(argv[i], "-M") && (i < argc - 1)) + else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) @@ -272,6 +272,7 @@ } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; + } else if (!strcmp(argv[i], "-Gonnet")) par.matrix = 0; else if (!strncmp(argv[i], "-BLOSUM", 7) @@ -465,4 +466,3 @@ delete Qali; DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); } - diff -Nru hhsuite-3.0~beta2+dfsg/src/hhprefilter.cpp hhsuite-3.0~beta3+dfsg/src/hhprefilter.cpp --- hhsuite-3.0~beta2+dfsg/src/hhprefilter.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhprefilter.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -255,9 +255,9 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// // Pull out all names from prefilter db file and copy into dbfiles_new for full HMM-HMM comparison /////////////////////////////////////////////////////////////////////////////////////////////////// - void Prefilter::init_no_prefiltering(FFindexDatabase* cs219_database, + void Prefilter::init_no_prefiltering(FFindexDatabase* query_database, std::vector >& prefiltered_entries) { - ffindex_index_t* db_index 
= cs219_database->db_index;
+    ffindex_index_t* db_index = query_database->db_index;
 
     for (size_t n = 0; n < db_index->n_entries; n++) {
       ffindex_entry_t* entry = ffindex_get_entry_by_index(db_index, n);
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhsearch_app.cpp hhsuite-3.0~beta3+dfsg/src/hhsearch_app.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhsearch_app.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhsearch_app.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -16,7 +16,7 @@
   HHsearch::ProcessAllArguments(argc, argv, par);
 
   std::vector<HHblitsDatabase*> databases;
-  HHblits::prepareDatabases(par, databases);
+  HHsearch::prepareDatabases(par, databases);
 
 #ifdef OPENMP
   omp_set_num_threads(par.threads);
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhsearch.cpp hhsuite-3.0~beta3+dfsg/src/hhsearch.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhsearch.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhsearch.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -3,18 +3,25 @@
 #include "hhsearch.h"
 #include "hhsuite_config.h"
 
-HHsearch::HHsearch(Parameters& par, std::vector<HHblitsDatabase*>& databases) :
-    HHblits(par, databases) {
-}
-
-HHsearch::~HHsearch() {
+void HHsearch::prepareDatabases(Parameters& par,
+    std::vector<HHblitsDatabase*>& databases) {
+  for (size_t i = 0; i < par.db_bases.size(); i++) {
+    HHblitsDatabase* db = new HHblitsDatabase(par.db_bases[i].c_str(), false);
+    databases.push_back(db);
+  }
+
+  par.dbsize = 0;
+  for (size_t i = 0; i < databases.size(); i++) {
+    par.dbsize += databases[i]->query_database->db_index->n_entries;
+  }
 }
 
 void HHsearch::ProcessAllArguments(int argc, char** argv, Parameters& par) {
   par.argv = argv;
   par.argc = argc;
 
+  par.prefilter = false;
+  par.num_rounds = 1;
 
   // Enable changing verbose mode before command line are processed
@@ -238,10 +245,10 @@
   }
   printf("Other options: \n");
-  printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=%i)\n", par.v);
+  printf(" -v verbose mode: 0:no screen output 1:only warnings 2: verbose (def=%i)\n", par.v);
   printf(" -cpu number of CPUs to use (for shared memory SMPs) (default=%i) \n", par.threads);
   if (all) {
-    printf(" -scores write scores for all pairwise comparisions to file \n");
+    printf(" -scores write scores for all pairwise comparisons to file \n");
     printf(" -atab write all alignments in tabular layout to file \n");
     printf(" -maxres max number of HMM columns (def=%5i) \n", par.maxres);
     printf(" -maxmem [1,inf[ limit memory for realignment (in GB) (def=%.1f) \n", par.maxmem);
@@ -254,6 +261,8 @@
   }
   printf("Example: hhsearch -i a.1.1.1.a3m -d scop70_1.71\n");
+  printf("\n");
+  printf("Download databases from .\n");
 }
 
@@ -280,6 +289,10 @@
         exit(4);
       } else {
         std::string db(argv[i]);
+        if (HHDatabase::checkDatabaseConflicts(argv[i])) {
+          HH_LOG(ERROR) << "Ambiguous database basename. Choose either an A3M or CA3M database."
+              << std::endl;
+          exit(4);
+        }
         par.db_bases.push_back(db);
       }
     } else if (!strcmp(argv[i], "-o")) {
@@ -503,7 +516,7 @@
       par.altali = atoi(argv[++i]);
     else if (!strncmp(argv[i], "-smin", 4) && (i < argc - 1))
       par.smin = atof(argv[++i]);
-    else if (!strcmp(argv[i], "-M") && (i < argc - 1))
+    else if (!strcmp(argv[i], "-M") && (i < argc - 1)) {
       if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
         par.M = 1;
       else if (!strcmp(argv[i], "first"))
@@ -513,6 +526,7 @@
         par.M = 2;
       } else
         HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
+    }
     else if (!strcmp(argv[i], "-shift") && (i < argc - 1))
       par.shift = atof(argv[++i]);
     else if ((!strcmp(argv[i], "-mact"))
@@ -572,142 +586,3 @@
     HH_LOG(DEBUG1) << i << " " << argv[i] << endl;
   }  // end of for-loop for command line input
 }
-
-void HHsearch::run(FILE* query_fh, char* query_path) {
-  int cluster_found = 0;
-  int seqs_found = 0;
-
-  Hit hit_cur;
-  Hash<Hit>* previous_hits = new Hash<Hit>(1631, hit_cur);
-
-  Qali = new Alignment();
-  Qali_allseqs = new Alignment();
-
-  q = new HMM(MAXSEQDIS, par.maxres);
-  HMMSimd q_vec(par.maxres);
-  q_tmp = new HMM(MAXSEQDIS, par.maxres);
-
-  // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
-  Qali->N_in = 0;
-  char input_format = 0;
-  ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb,
-      S, Sim);
-  PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine,
-      pc_hhm_context_mode, pb, R);
-  q_vec.MapOneHMM(q);
-  *q_tmp = *q;
-
-  // Set query columns in His-tags etc to Null model distribution
-  if (par.notags)
-    q->NeutralizeTags(pb);
-
-  // Search databases
-
-  std::vector<HHEntry*> new_entries;
-  if (!par.prefilter) {
-    for (size_t i = 0; i < dbs.size(); i++) {
-      dbs[i]->initNoPrefilter(new_entries);
-    }
-  }
-
-  int max_template_length = getMaxTemplateLength(new_entries);
-  if(max_template_length > par.maxres){
-    HH_LOG(WARNING) << "database contains sequnces that exceeds maximum allowed size (maxres = "
-        << par.maxres << "). Maxres can be increased with parameter -maxres." << std::endl;
-  }
-
-  for (int i = 0; i < par.threads; i++) {
-    viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length);
-  }
-
-  ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
-  std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec, new_entries, par.qsc_db, pb, S, Sim, R, par.ssm, S73, S33, S37);
-
-  hitlist.N_searched = new_entries.size();
-  add_hits_to_hitlist(hits_to_add, hitlist);
-
-//TODO
-//  if (v1 >= 2)
-//    cout << "\n";
-//  v = v1;
-//
-//  // Sort list according to sortscore
-//  if (v >= 3)
-//    printf("Sorting hit list ...\n");
-//  hitlist.SortList();
-//
-//  // Fit EVD (with lamda, mu) to score distribution?
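For reference, the disabled block that follows fitted an extreme-value distribution to the list of raw Viterbi scores. Once lamda and mu have been fitted, the conversion from raw score to P-value is the standard Gumbel survival function, and the E-value is the P-value scaled by the number of database entries searched. A minimal sketch of that arithmetic (function names and the numbers in main() are illustrative, not from the source):

    #include <cmath>
    #include <cstdio>

    // Gumbel (EVD) survival function: P(S' >= S) = 1 - exp(-exp(-lamda*(S - mu)))
    static double evd_pvalue(double score, double lamda, double mu) {
      return 1.0 - std::exp(-std::exp(-lamda * (score - mu)));
    }

    // E-value = P-value scaled by the number of entries searched.
    static double evd_evalue(double score, double lamda, double mu, double n_searched) {
      return evd_pvalue(score, lamda, mu) * n_searched;
    }

    int main() {
      std::printf("E = %g\n", evd_evalue(35.0, 0.2, 10.0, 1e5));
      return 0;
    }
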
-//  if (par.calm == 3) {
-//    hitlist.CalculatePvalues(q, par.loc, par.ssm, par.ssw);  // Use NN prediction of lamda and mu
-//  }
-//  else if ((par.calm != 1 && q->lamda == 0) || par.calibrate > 0) {
-//    if (v >= 2 && par.loc)
-//      printf("Fitting scores with EVD (first round) ...\n");
-//    hitlist.MaxLikelihoodEVD(q, 3, par.loc, par.ssm, par.ssw); // first ML fit: exclude 3 best superfamilies from fit
-//
-//    if (v >= 3)
-//      printf("Number of families present in database: %i\n", hitlist.fams); // DEBUG
-//    if (hitlist.fams >= 100) {
-//      if (par.loc) {
-//        if (v >= 2)
-//          printf("Fitting scores with EVD (second round) ...\n");
-//        hitlist.MaxLikelihoodEVD(q, 0, par.loc, par.ssm, par.ssw); // second ML fit: exclude superfamilies with E-value<0.01
-//      } else {
-//        if (v >= 2)
-//          fprintf(stderr,
-//              "E-values for global alignment option may be unreliable.\n");
-//        hitlist.ResortList();
-//      }
-//    }
-//    else {
-//      if (v) {
-//        fprintf(stderr, "\n no E-values could be calculated.\n");
-//        fprintf(stderr, "To calculate E-values you have two options:\n");
-//        fprintf(stderr,
-//            "1. Calibrate your query profile HMM with a calibration database:\n");
-//        fprintf(stderr, "   > hhsearch -i yourHMM.hhm -d cal.hhm -cal\n");
-//        fprintf(stderr,
-//            "   This will insert a line in yourHMM.hhm with lamda and mu of the score distribution.\n");
-//        fprintf(stderr,
-//            "   cal.hhm contains 1220 HMMs from different SCOP superfamilies and is supplied with HHsearch.\n");
-//        fprintf(stderr,
-//            "   Instead of cal.hhm you may also use any SCOP database file, e.g. scop70_1.69\n");
-//        fprintf(stderr,
-//            "   Note that your HMM needs to be recalibrated when changing HMM-HMM alignment options.\n");
-//        fprintf(stderr, "2. Append cal.hhm to your own database:\n");
-//        fprintf(stderr, "   > cat cal.hhm >> yourDB.hhm\n");
-//        fprintf(stderr,
-//            "   But note that HMMs contained in cal.hmm will pop up among your hits.\n");
-//      }
-//    }
-//    if (par.calm == 2)
-//      hitlist.GetPvalsFromCalibration(q, par.loc, par.calm, par.ssm, par.ssw);
-//  }
-//  else
-//    hitlist.GetPvalsFromCalibration(q, par.loc, par.calm, par.ssm, par.ssw);
-
-  // Set new ss weight for realign
-  par.ssw = par.ssw_realign;
-
-  // Realign hits with MAC algorithm
-  if (par.realign) {
-    perform_realign(q_vec, input_format, new_entries);
-  }
-
-  mergeHitsToQuery(previous_hits, seqs_found, cluster_found);
-
-  // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
-  Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons, par.showcons,
-      par.maxres, pb, Sim, NULL, true);
-
-  if (par.notags)
-    q->NeutralizeTags(pb);
-
-  for(size_t i = 0; i < new_entries.size(); i++) {
-    delete new_entries[i];
-  }
-  new_entries.clear();
-}
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhsearch.h hhsuite-3.0~beta3+dfsg/src/hhsearch.h
--- hhsuite-3.0~beta2+dfsg/src/hhsearch.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhsearch.h	2017-07-14 10:34:03.000000000 +0000
@@ -36,11 +36,9 @@
 const char HHSEARCH_REFERENCE[] =
     "Soding, J. Protein homology detection by HMM-HMM comparison. Bioinformatics 21:951-960 (2005).\n";
 
-class HHsearch: public HHblits {
+class HHsearch {
   public:
-    HHsearch(Parameters& par, std::vector<HHblitsDatabase*>& databases);
-    virtual ~HHsearch();
-    void run(FILE* query_fh, char* query_path);
+    static void prepareDatabases(Parameters& par, std::vector<HHblitsDatabase*>& databases);
 
     static void ProcessAllArguments(int argc, char** argv, Parameters& par);
   private:
     static void help(Parameters& par, char all = 0);
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhviterbialgorithm.cpp hhsuite-3.0~beta3+dfsg/src/hhviterbialgorithm.cpp
--- hhsuite-3.0~beta2+dfsg/src/hhviterbialgorithm.cpp	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhviterbialgorithm.cpp	2017-07-14 10:34:03.000000000 +0000
@@ -104,8 +104,8 @@
       -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
 #endif
 #ifdef VITERBI_CELLOFF
-  const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000
 #ifdef AVX2
+  const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000
   const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1);
   const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX);
   const simd_int shuffle_mask_celloff = _mm256_set_epi8(
@@ -118,6 +118,7 @@
       3, 2, 1, 0,
       3, 2, 1, 0);
 #else // SSE case
+  const simd_int tmp_vec = simdi32_set4(0x40000000,0x00400000,0x00004000,0x00000040);
   const simd_int co_vec = tmp_vec;
   const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX);
 #endif
@@ -408,7 +409,8 @@
   /* efgh 0000 0000 HGFE 0000 */
   const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1);
   _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh));
-#else
+#elif defined(SSE)
+
   byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec);
   byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec);
   int int_result = _mm_cvtsi128_si32(byte_result_vec);
diff -Nru hhsuite-3.0~beta2+dfsg/src/hhviterbi.h hhsuite-3.0~beta3+dfsg/src/hhviterbi.h
--- hhsuite-3.0~beta2+dfsg/src/hhviterbi.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/hhviterbi.h	2017-07-14 10:34:03.000000000 +0000
@@ -122,6 +122,7 @@
       int elem, bool self, int par_min_overlap);
 
+#if defined(SSE) // X86* arch only for _mm_prefetch
   static inline simd_float ScalarProd20Vec(simd_float* qi, simd_float* tj) {
     _mm_prefetch((char * ) &qi[4], _MM_HINT_T0);
     _mm_prefetch((char * ) &tj[4], _MM_HINT_T0);
@@ -158,6 +159,35 @@
     return simdf32_add(res0, res2);
   }
 
+#else
+  static inline simd_float ScalarProd20Vec(simd_float* qi, simd_float* tj) {
+
+    simd_float res0 = simdf32_mul(tj[0], qi[0]);
+    simd_float res1 = simdf32_mul(tj[1], qi[1]);
+    simd_float res2 = simdf32_mul(tj[2], qi[2]);
+    simd_float res3 = simdf32_mul(tj[3], qi[3]);
+    res0 = simdf32_add(simdf32_mul(tj[ 4],qi[ 4]), res0);
+    res1 = simdf32_add(simdf32_mul(tj[ 5],qi[ 5]), res1);
+    res2 = simdf32_add(simdf32_mul(tj[ 6],qi[ 6]), res2);
+    res3 = simdf32_add(simdf32_mul(tj[ 7],qi[ 7]), res3);
+    res0 = simdf32_add(simdf32_mul(tj[ 8],qi[ 8]), res0);
+    res1 = simdf32_add(simdf32_mul(tj[ 9],qi[ 9]), res1);
+    res2 = simdf32_add(simdf32_mul(tj[10],qi[10]), res2);
+    res3 = simdf32_add(simdf32_mul(tj[11],qi[11]), res3);
+    res0 = simdf32_add(simdf32_mul(tj[12],qi[12]), res0);
+    res1 = simdf32_add(simdf32_mul(tj[13],qi[13]), res1);
+    res2 = simdf32_add(simdf32_mul(tj[14],qi[14]), res2);
+    res3 = simdf32_add(simdf32_mul(tj[15],qi[15]), res3);
+    res0 = simdf32_add(simdf32_mul(tj[16],qi[16]), res0);
+    res1
= simdf32_add(simdf32_mul(tj[17],qi[17]), res1); + res2 = simdf32_add(simdf32_mul(tj[18],qi[18]), res2); + res3 = simdf32_add(simdf32_mul(tj[19],qi[19]), res3); + res0 = simdf32_add(res0, res1); + res2 = simdf32_add(res2, res3); + return simdf32_add(res0, res2); + + } +#endif // Calculate secondary structure score between columns i and j of two HMMs (query and template) static inline float ScoreSS(const HMM* q, const HMM* t, const int i, diff -Nru hhsuite-3.0~beta2+dfsg/src/hhviterbirunner.cpp hhsuite-3.0~beta3+dfsg/src/hhviterbirunner.cpp --- hhsuite-3.0~beta2+dfsg/src/hhviterbirunner.cpp 2016-07-21 16:04:12.000000000 +0000 +++ hhsuite-3.0~beta3+dfsg/src/hhviterbirunner.cpp 2017-07-14 10:34:03.000000000 +0000 @@ -10,12 +10,16 @@ void ViterbiConsumerThread::align(int maxres, int nseqdis, const float smin) { - - int ss_hmm_mode = HMM::computeScoreSSMode(q_simd->GetHMM(0), t_hmm_simd->GetHMM(0)); - for(size_t i = 1; i < maxres; i++){ - ss_hmm_mode = std::min(ss_hmm_mode, - HMM::computeScoreSSMode(q_simd->GetHMM(0), t_hmm_simd->GetHMM(i))); + int consensus_ss_hmm_mode = 0xFF; + for(size_t i = 0; i < maxres; i++){ + consensus_ss_hmm_mode &= HMM::computeScoreSSMode(q_simd->GetHMM(0), t_hmm_simd->GetHMM(i)); } + // The following code solves the problem if more than 1 bit is set in "consensus_ss_hmm_mode". + // It will pick the best possible mode + int ss_hmm_mode = (consensus_ss_hmm_mode & HMM::PRED_DSSP); + ss_hmm_mode = (ss_hmm_mode == 0) ? consensus_ss_hmm_mode & HMM::DSSP_PRED : 0; + ss_hmm_mode = (ss_hmm_mode == 0) ? consensus_ss_hmm_mode & HMM::PRED_PRED : 0; + Viterbi::ViterbiResult* viterbiResult = viterbiAlgo->Align(q_simd, t_hmm_simd, viterbiMatrix, maxres, ss_hmm_mode); for (int elem = 0; elem < maxres; elem++) { HMM * curr_t_hmm = t_hmm_simd->GetHMM(elem); @@ -55,9 +59,9 @@ hit_cur.j2 = viterbiResult->j[elem]; hit_cur.entry = curr_t_hmm->entry; - + // std::cout << "Thread: " << thread_id << std::endl; - // printf ("%-12.12s %-12.12s irep=%-2i score=%6.2f i=%d j=%d\n",hit_cur->name,hit_cur->fam,hit_cur->irep,hit_cur->score,viterbiResult.i[elem], viterbiResult.j[elem]); +// HH_LOG(INFO) << string_format ("%d %-12.12s %-12.12s irep=%-2i score=%6.2f ss_scor=%6.2f i=%d j=%d nstep=%d ssm_mode=%d t_ss_pred=%d t_ss_dssp=%d",elem, hit_cur.name,hit_cur.fam,hit_cur.irep,hit_cur.score, hit_cur.score_ss,viterbiResult->i[elem], viterbiResult->j[elem], hit_cur.nsteps, ss_hmm_mode, t_hmm_simd->GetHMM(elem)->nss_pred, t_hmm_simd->GetHMM(elem)->nss_dssp) << std::endl; // printf ("%-12.12s %-12.12s irep=%-2i score=%6.2f\n",hit_cur.file,hit_cur.fam,hit_cur.irep,backtraceScore.score); hits.push_back(hit_cur); // insert hit at beginning of list (last repeats first!) 
Deep Copy of hit_cur
     }
diff -Nru hhsuite-3.0~beta2+dfsg/src/log.h hhsuite-3.0~beta3+dfsg/src/log.h
--- hhsuite-3.0~beta2+dfsg/src/log.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/log.h	2017-07-14 10:34:03.000000000 +0000
@@ -112,8 +112,8 @@
 typedef Log FILELog;
 
 #define HH_LOG(level) \
-  if (level > Log::reporting_level()) ; \
-  else Log().Get(level)
+  if (level <= Log::reporting_level()) \
+    Log().Get(level)
 
 inline std::string NowTime() {
diff -Nru hhsuite-3.0~beta2+dfsg/src/simd.h hhsuite-3.0~beta3+dfsg/src/simd.h
--- hhsuite-3.0~beta2+dfsg/src/simd.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/simd.h	2017-07-14 10:34:03.000000000 +0000
@@ -321,10 +321,156 @@
 #define simdi32_srli(x,y) _mm_srli_epi32(x,y) // shift integers in a right by y
 #define simdi32_i2f(x) _mm_cvtepi32_ps(x)     // convert integer to s.p. float
 #define simdi_i2fcast(x) _mm_castsi128_ps(x)
+
+#define simdi32_set4(x,y,z,t) _mm_set_epi32(x,y,z,t) // Added with Power8, hhviterbialgorithm needs _set4
+
 #endif //SIMD_INT
 #endif //SSE
 
+/*
+ * Power8/LE Altivec/VSX SIMD
+ */
+#ifdef __ALTIVEC__ // || __VSX__
+
+#include <altivec.h>
+
+// double support
+#ifndef SIMD_DOUBLE
+#define SIMD_DOUBLE
+#define ALIGN_DOUBLE 16
+#define VECSIZE_DOUBLE 2
+typedef __vector double simd_double;
+#define simdf64_add(x,y) vec_add(x,y)
+#define simdf64_sub(x,y) vec_sub(x,y)
+#define simdf64_mul(x,y) vec_mul(x,y)
+#define simdf64_div(x,y) vec_div(x,y)
+#define simdf64_max(x,y) vec_max(x,y)
+#define simdf64_load(x) vec_vsx_ld(0,x) // vec_ld
+#define simdf64_store(x,y) vec_vsx_st(y,0,x) // vec_st
+#define simdf64_set(x) vec_splats(x)
+//#define simdf64_set2(x,y) _mm_set_ps(x,y)
+#define simdf64_setzero(x) vec_splats(0)
+#define simdf64_gt(x,y) vec_cmpgt(x,y)
+#define simdf64_lt(x,y) vec_cmplt(x,y)
+#define simdf64_or(x,y) vec_or(x,y)
+#define simdf64_and(x,y) vec_and(x,y)
+#define simdf64_andnot(x,y) vec_nand(x,y)
+#define simdf64_xor(x,y) vec_xor(x,y)
+#endif // SIMD_DOUBLE
+
+// float support
+#ifndef SIMD_FLOAT
+#define SIMD_FLOAT
+#define ALIGN_FLOAT 16
+#define VECSIZE_FLOAT 4
+typedef __vector float simd_float;
+#define simdf32_add(x,y) x + y //vec_add(x,y)
+#define simdf32_sub(x,y) x - y //vec_sub(x,y)
+#define simdf32_mul(x,y) x * y //vec_mul(x,y)
+#define simdf32_div(x,y) x / y //vec_div(x,y)
+#define simdf32_max(x,y) vec_max(x,y)
+#define simdf32_min(x,y) vec_min(x,y)
+#define simdf32_rcp(x) vec_re(x)
+#define simdf32_load(x) vec_vsx_ld(0,x) // vec_ld
+#define simdf32_store(x,y) vec_vsx_st(y,0,x) // vec_st
+#define simdf32_set(x) vec_splats((float)x)
+//#define simdf32_set2(x,y) _mm_set_ps(x,y)
+//#define simdf32_set4(x,y,z,t)
+#define simdf32_setzero(x) vec_splats(0)
+#define simdf32_gt(x,y) (simd_float)vec_cmpgt(x,y)
+#define simdf32_eq(x,y) (simd_float)vec_cmpeq(x,y)
+#define simdf32_lt(x,y) (simd_float)vec_cmplt(x,y)
+#define simdf32_or(x,y) vec_or(x,y)
+#define simdf32_and(x,y) vec_and(x,y)
+#define simdf32_andnot(x,y) vec_nand(x,y)
+#define simdf32_xor(x,y) vec_xor(x,y)
+#define simdf32_extract(x,imm) vec_extract(x,imm)
+
+#define simdf32_f2i(x) vec_cts(x,0)    // convert s.p. float to integer
+#define simdf_f2icast(x) (simd_int)(x) // compile time cast
+#endif // SIMD_FLOAT
+
+// integer support
+#ifndef SIMD_INT
+#define SIMD_INT
+#define ALIGN_INT 16
+#define VECSIZE_INT 4
+typedef __vector int simd_int;
+typedef __vector signed char simd_s8;
+typedef __vector unsigned char simd_u8;
+
+#define simdi32_add(x,y) vec_add(x,y)
+#define simdi32_sub(x,y) vec_sub(x,y)
+#define simdi32_mul(x,y) vec_mul(x,y)
+#define simdi32_max(x,y) vec_max(x,y)
+#define simdi_load(x) vec_vsx_ld(0,x)
+#define simdi_store(x,y) vec_vsx_st(y,0,x)
+#define simdi32_set(x) vec_splats((signed int)x)
+#define simdi32_set4(x,y,z,t) (simd_int){x,y,z,t}
+#define simdi_setzero(x) vec_splats(0)
+#define simdi32_gt(x,y) (simd_int)vec_cmpgt(x,y)
+#define simdi32_lt(x,y) (simd_int)vec_cmplt(x,y)
+#define simdi_or(x,y) vec_or(x,y)
+#define simdi_and(x,y) vec_and(x,y)
+#define simdi_andnot(x,y) vec_nand(x,y)
+#define simdi_xor(x,y) vec_xor(x,y)
+#define simdi32_slli(x,y) vec_sll(x,vec_splats((unsigned)y)) // shift integers in a left by y
+#define simdi32_srli(x,y) vec_srl(x,vec_splats((unsigned)y)) // shift integers in a right by y
+#define simdi32_i2f(x) vec_ctf(x,0) // convert integer to s.p. float
+#define simdi_i2fcast(x) (simd_float)(x)
+#define simdi8_set(x) (simd_int)vec_splats((unsigned char)x)
+#define simdi8_gt(x,y) (simd_int)vec_cmpgt((simd_s8)x,(simd_s8)y)
+#define simdi8_eq(x,y) (simd_int)vec_cmpeq((simd_s8)x,(simd_s8)y)
+#define simdui8_max(x,y) (simd_int)vec_max((vector unsigned char)x, (vector unsigned char)y)
+#define simdui8_adds(x,y) (simd_int)vec_adds((simd_u8)x,(simd_u8)y)
+#define simdui8_subs(x,y) (simd_int)vec_subs((simd_u8)x,(simd_u8)y)
+#define simdi8_shiftl(x,y) (simd_int)vec_sll(x,vec_splats((char)y)) // shift integers in a left by y
+#define simdi8_shiftr(x,y) (simd_int)vec_srl(x,vec_splats((char)y)) // shift integers in a right by y
+#define simdi8_movemask(x) v_movemask(x)
+
+
+// There is no altivec/vsx equivalent, C version
+//
+inline int v_movemask(simd_int x)
+{
+  unsigned int result=0;
+
+  union {
+    simd_int si;
+    char as_char[16];
+  } t;
+
+  t.si = x;
+  result |= (t.as_char[15] & 0x80) << (15-7);
+  result |= (t.as_char[14] & 0x80) << (14-7);
+  result |= (t.as_char[13] & 0x80) << (13-7);
+  result |= (t.as_char[12] & 0x80) << (12-7);
+  result |= (t.as_char[11] & 0x80) << (11-7);
+  result |= (t.as_char[10] & 0x80) << (10-7);
+  result |= (t.as_char[9] & 0x80) << (9-7);
+  result |= (t.as_char[8] & 0x80) << (8-7);
+  result |= (t.as_char[7] & 0x80);
+  result |= (t.as_char[6] & 0x80) >> (7-6);
+  result |= (t.as_char[5] & 0x80) >> (7-5);
+  result |= (t.as_char[4] & 0x80) >> (7-4);
+  result |= (t.as_char[3] & 0x80) >> (7-3);
+  result |= (t.as_char[2] & 0x80) >> (7-2);
+  result |= (t.as_char[1] & 0x80) >> (7-1);
+  result |= (t.as_char[0] & 0x80) >> 7;
+
+  return result;
+}
+
+
+#endif // SIMD_INT
+
+#endif // __ALTIVEC__ || __VSX__
+
+
+
+
+
 /* horizontal max */
 template <typename F>
 inline F simd_hmax(const F * in, unsigned int n)
diff -Nru hhsuite-3.0~beta2+dfsg/src/util.h hhsuite-3.0~beta3+dfsg/src/util.h
--- hhsuite-3.0~beta2+dfsg/src/util.h	2016-07-21 16:04:12.000000000 +0000
+++ hhsuite-3.0~beta3+dfsg/src/util.h	2017-07-14 10:34:03.000000000 +0000
@@ -45,14 +45,16 @@
   // Compute mem sizes rounded up to nearest multiple of ALIGN_FLOAT
   size_t size_pointer_array = ICEIL(dim1*sizeof(T*), ALIGN_FLOAT);
   size_t dim2_padded = ICEIL(dim2*sizeof(T), ALIGN_FLOAT)/sizeof(T);
-  
+
   T** matrix = (T**) mem_align( ALIGN_FLOAT, size_pointer_array + dim1*dim2_padded*sizeof(T) );
   if (matrix == NULL)
     return matrix;
-  
+
   T* ptr = (T*) (matrix + (size_pointer_array/sizeof(T*)) );
   for (int i=0; i<dim1; i++, ptr+=dim2_padded)
     matrix[i] = ptr;
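A closing note on the util.h allocator above: malloc_matrix() makes a single aligned allocation holding the row-pointer table followed by dim1 rows padded up to ALIGN_FLOAT, so every row starts on an alignment boundary for the SIMD loads in simd.h and one free(matrix) releases the whole structure. A self-contained sketch of the same layout, substituting posix_memalign() and inline rounding for the project's mem_align()/ICEIL() helpers (assumes a POSIX libc; names are illustrative):

    #include <cstdio>
    #include <cstdlib>

    // One allocation: [dim1 row pointers][row 0][row 1]...  Rows are padded
    // to whole alignment blocks, so free(matrix) releases everything at once.
    template <typename T>
    T** sketch_malloc_matrix(int dim1, int dim2) {
      const std::size_t align = 16;  // stand-in for ALIGN_FLOAT
      const std::size_t ptr_bytes = ((dim1 * sizeof(T*) + align - 1) / align) * align;
      const std::size_t row_elems = (((dim2 * sizeof(T) + align - 1) / align) * align) / sizeof(T);

      void* block = NULL;
      if (posix_memalign(&block, align, ptr_bytes + dim1 * row_elems * sizeof(T)) != 0)
        return NULL;

      T** matrix = (T**) block;
      T* ptr = (T*) ((char*) block + ptr_bytes);
      for (int i = 0; i < dim1; i++, ptr += row_elems)
        matrix[i] = ptr;  // row i points into the single padded payload
      return matrix;
    }

    int main() {
      float** m = sketch_malloc_matrix<float>(3, 5);
      if (m) {
        m[2][4] = 1.0f;
        std::printf("%g\n", m[2][4]);
        std::free(m);  // single free for pointers and payload
      }
      return 0;
    }

The row padding mirrors dim2_padded above: each row is rounded up to a whole number of alignment blocks, which is what allows aligned vector access on every row regardless of dim2.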