diff -Nru vsearch-2.18.0/configure.ac vsearch-2.21.1/configure.ac --- vsearch-2.18.0/configure.ac 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/configure.ac 2022-01-18 17:26:39.000000000 +0000 @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) -AC_INIT([vsearch], [2.18.0], [torognes@ifi.uio.no], [vsearch], [https://github.com/torognes/vsearch]) +AC_INIT([vsearch], [2.21.1], [torognes@ifi.uio.no], [vsearch], [https://github.com/torognes/vsearch]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) @@ -12,11 +12,6 @@ AC_SUBST(MACOSX_DEPLOYMENT_TARGET) MACOSX_DEPLOYMENT_TARGET="10.9" -# Set default gcc and g++ options - -CFLAGS='-g' -CXXFLAGS='-g -std=c++11' - # Checks for programs. AC_PROG_CXX AC_PROG_RANLIB diff -Nru vsearch-2.18.0/debian/changelog vsearch-2.21.1/debian/changelog --- vsearch-2.18.0/debian/changelog 2021-08-28 14:25:48.000000000 +0000 +++ vsearch-2.21.1/debian/changelog 2022-01-22 11:11:02.000000000 +0000 @@ -1,3 +1,17 @@ +vsearch (2.21.1-1) unstable; urgency=medium + + * New upstream version 2.21.1 + + -- Nilesh Patra Sat, 22 Jan 2022 16:41:02 +0530 + +vsearch (2.21.0-1) unstable; urgency=medium + + * Fix watchfile to detect new versions on github + * New upstream version + * Standards-Version: 4.6.0 (routine-update) + + -- Andreas Tille Mon, 17 Jan 2022 15:44:41 +0100 + vsearch (2.18.0-1) unstable; urgency=medium * New upstream version 2.18.0 diff -Nru vsearch-2.18.0/debian/control vsearch-2.21.1/debian/control --- vsearch-2.18.0/debian/control 2021-04-23 18:33:41.000000000 +0000 +++ vsearch-2.21.1/debian/control 2022-01-22 11:10:04.000000000 +0000 @@ -10,8 +10,9 @@ libbz2-dev, markdown, ghostscript, - time -Standards-Version: 4.5.1 + time, + debhelper +Standards-Version: 4.6.0 Vcs-Browser: https://salsa.debian.org/med-team/vsearch Vcs-Git: https://salsa.debian.org/med-team/vsearch.git Homepage: https://github.com/torognes/vsearch/ diff -Nru 
vsearch-2.18.0/debian/patches/compilerflags.patch vsearch-2.21.1/debian/patches/compilerflags.patch --- vsearch-2.18.0/debian/patches/compilerflags.patch 2021-01-30 10:55:24.000000000 +0000 +++ vsearch-2.21.1/debian/patches/compilerflags.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Description: do not override system compiler flags -Author: Sascha Steinbiss ---- a/configure.ac -+++ b/configure.ac -@@ -12,11 +12,6 @@ - AC_SUBST(MACOSX_DEPLOYMENT_TARGET) - MACOSX_DEPLOYMENT_TARGET="10.9" - --# Set default gcc and g++ options -- --CFLAGS='-g' --CXXFLAGS='-g -std=c++11' -- - # Checks for programs. - AC_PROG_CXX - AC_PROG_RANLIB diff -Nru vsearch-2.18.0/debian/patches/series vsearch-2.21.1/debian/patches/series --- vsearch-2.18.0/debian/patches/series 2021-01-30 10:52:51.000000000 +0000 +++ vsearch-2.21.1/debian/patches/series 2022-01-22 11:10:04.000000000 +0000 @@ -1,2 +1 @@ sysconf_memory_sizing.patch -compilerflags.patch diff -Nru vsearch-2.18.0/debian/watch vsearch-2.21.1/debian/watch --- vsearch-2.18.0/debian/watch 2021-04-23 18:34:13.000000000 +0000 +++ vsearch-2.21.1/debian/watch 2022-01-22 11:10:04.000000000 +0000 @@ -1,4 +1,4 @@ version=4 opts="downloadurlmangle=s/\/tree\/(.*)/\/archive\/$1.tar.gz/" \ - https://github.com/torognes/vsearch/releases .*/archive/.*/v([0-9.rc-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz) + https://github.com/torognes/vsearch/tags .*/v?([0-9.rc-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz) diff -Nru vsearch-2.18.0/Dockerfile vsearch-2.21.1/Dockerfile --- vsearch-2.18.0/Dockerfile 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/Dockerfile 2022-01-18 17:26:39.000000000 +0000 @@ -5,7 +5,7 @@ libstdc++ zlib-dev bzip2-dev \ autoconf automake make g++ && \ ./autogen.sh && \ - ./configure && \ + ./configure CFLAGS="-O3" CXXFLAGS="-O3" && \ make clean && \ make && \ make install && \ diff -Nru vsearch-2.18.0/man/vsearch.1 vsearch-2.21.1/man/vsearch.1 --- vsearch-2.18.0/man/vsearch.1 2021-08-27 10:35:25.000000000 +0000 +++ 
vsearch-2.21.1/man/vsearch.1 2022-01-18 17:26:39.000000000 +0000 @@ -1,5 +1,5 @@ .\" ============================================================================ -.TH vsearch 1 "August 27, 2021" "version 2.18.0" "USER COMMANDS" +.TH vsearch 1 "January 18, 2022" "version 2.21.1" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em a versatile open-source tool for microbiome analysis, @@ -36,6 +36,10 @@ .RE Dereplication and rereplication: .RS +\fBvsearch\fR \-\-fastx_uniques (\fIfastafile\fR | \fIfastqfile\fR) +(\-\-fastaout | \-\-fastqout | \-\-tabbedout | \-\-uc) \fIoutputfile\fR +[\fIoptions\fR] +.PP \fBvsearch\fR (\-\-derep_fulllength | \-\-derep_id | \-\-derep_prefix) \fIfastafile\fR (\-\-output | \-\-uc) \fIoutputfile\fR [\fIoptions\fR] .PP @@ -140,13 +144,14 @@ .RS \fBvsearch\fR \-\-search_exact \fIfastafile\fR \-\-db \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | -\-\-otutabout | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR -[\fIoptions\fR] +\-\-otutabout | \-\-samout | \-\-uc | \-\-userout | \-\-lcaout) +\fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-usearch_global \fIfastafile\fR \-\-db \fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out | \-\-otutabout | \-\-samout | \-\-uc | -\-\-userout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR] +\-\-userout | \-\-lcaout) \fIoutputfile\fR \-\-id \fIreal\fR +[\fIoptions\fR] .PP .RE Shuffling and sorting: @@ -346,6 +351,11 @@ When reading from a pipe streaming gzip-compressed data, decompress the data. This option is not needed when reading from a standard gzip-compressed file. +.TAG label_suffix +.TP +.BI \-\-label_suffix\~ string +When writing FASTA or FASTQ files, add the suffix \fIstring\fR to +sequence headers. 
.TAG log .TP .BI \-\-log \0filename @@ -383,6 +393,12 @@ .B \-\-quiet Suppress all messages to stdout and stderr except for warnings and fatal error messages. +.TAG sample +.TP +.BI \-\-sample\~ string +When writing FASTA or FASTQ files, add the given sample identifier +\fIstring\fR to sequence headers. For instance, if the given string is +ABC, the text ";sample=ABC" will be added to the header. .TAG threads .TP .BI \-\-threads\~ "positive integer" @@ -449,11 +465,13 @@ .TAG db .TP .BI \-\-db \0filename -When using \-\-uchime_ref, detect chimeras using the fasta-formatted -reference sequences contained in \fIfilename\fR. Reference sequences -are assumed to be chimera-free. Chimeras cannot be detected if their -parents, or sufficiently close relatives, are not present in the -database. +When using \-\-uchime_ref, detect chimeras using the reference +sequences contained in \fIfilename\fR. Reference sequences are assumed +to be chimera-free. Chimeras cannot be detected if their parents, or +sufficiently close relatives, are not present in the database. The +file name must refer to a FASTA file or to a UDB file. If a UDB file +is used, it should be created using the \-\-makeudb_usearch command +with the \-\-dbmask dust option. .TAG dn .TP .BI \-\-dn\~ "strictly positive real number" @@ -884,6 +902,11 @@ \fIdust\fR or the \fIsoft\fR methods, or do not mask (\fInone\fR). Warning, when using \fIsoft\fR masking, clustering becomes case sensitive. The default is to mask using \fIdust\fR. +.TAG qsegout +.TP +.BI \-\-qsegout \0filename +Write the aligned part of each query sequence to \fIfilename\fR in +FASTA format. .TAG relabel .TP .BI \-\-relabel \0string @@ -945,6 +968,11 @@ .BI \-\-strand\~ "plus|both" When comparing sequences with the cluster seed, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. +.TAG tsegout +.TP +.BI \-\-tsegout \0filename +Write the aligned part of each target sequence to \fIfilename\fR in +FASTA format.
.TAG uc .TP .BI \-\-uc \0filename @@ -1013,7 +1041,32 @@ .\" ---------------------------------------------------------------------------- .TAG dereplication-and-rereplication-options Dereplication and rereplication options: +.PP .RS +VSEARCH can dereplicate sequences with the commands +\-\-derep_fulllength, \-\-derep_id, \-\-derep_prefix and +\-\-fastx_uniques. The \-\-derep_fulllength command is deprecated and +is replaced by the new \-\-fastx_uniques command that can also handle +FASTQ files in addition to FASTA files. The \-\-derep_fulllength and +\-\-fastx_uniques commands require strictly identical sequences of +the same length, but ignore upper/lower case and treat T and U as +identical symbols. The \-\-derep_id command requires both identical +sequences and identical headers/labels. The \-\-derep_prefix command +will group sequences with a common prefix and does not require them to +be equally long. The \-\-fastx_uniques command can write FASTQ output +(specified with \-\-fastqout) or FASTA output (specified with +\-\-fastaout) as well as a special tab-separated column text format +(with \-\-tabbedout). The other commands can write FASTA output to the +file specified with the \-\-output option. All dereplication commands +can write output to a special UCLUST-like file specified with the +\-\-uc option. The \-\-rereplicate command can duplicate sequences in +the input file according to the abundance of each input +sequence. Other valid options are \-\-fastq_ascii, \-\-fastq_asciiout, +\-\-fastq_qmax, \-\-fastq_qmaxout, \-\-fastq_qmin, \-\-fastq_qminout, +\-\-fastq_qout_max, \-\-maxuniquesize, \-\-minuniquesize, \-\-relabel, +\-\-relabel_keep, \-\-relabel_md5, \-\-relabel_self, \-\-relabel_sha1, +\-\-sizein, \-\-sizeout, \-\-strand, \-\-topn, and \-\-xsize. +.PP .TAG derep_fulllength .TP 9 .BI \-\-derep_fulllength \0filename @@ -1024,7 +1077,7 @@ to take into account and compute abundance values. This command does not support multithreading.
.TAG derep_id -.TP 9 +.TP .BI \-\-derep_id \0filename Merge strictly identical sequences contained in \fIfilename\fR, as with the \-\-derep_fulllength command, but the sequence labels @@ -1041,6 +1094,88 @@ using sequence headers and sequence input order. Sequence comparisons are case insensitive, and T and U are considered identical. This command does not support multithreading. +.TAG fastaout +.TP +.BI \-\-fastaout \0filename +Write the dereplicated sequences to \fIfilename\fR, in fasta format +and sorted by decreasing abundance. Identical sequences receive the +header of the first sequence of their group. If \-\-sizeout is used, +the number of occurrences (i.e. abundance) of each sequence is +indicated at the end of their fasta header using the +pattern ';size=\fIinteger\fR;'. This option is only valid for +\-\-fastx_uniques. +.TAG fastqout +.TP +.BI \-\-fastqout \0filename +Write the dereplicated sequences to \fIfilename\fR, in fastq format +and sorted by decreasing abundance. Identical sequences receive the +header of the first sequence of their group. If \-\-sizeout is used, +the number of occurrences (i.e. abundance) of each sequence is +indicated at the end of their fastq header using the +pattern ';size=\fIinteger\fR;'. This option is only valid for +\-\-fastx_uniques. +.TAG fastq_ascii +.TP +.BI \-\-fastq_ascii\~ "positive integer" +Define the ASCII character number used as the basis for the FASTQ +quality score. The default is 33, which is used by the Sanger / +Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the +Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64). Only 33 +and 64 are valid arguments. +.TAG fastq_asciiout +.TP +.BI \-\-fastq_asciiout\~ "positive integer" +When using \-\-fastq_convert, \-\-sff_convert or \-\-fasta2fastq, +define the ASCII character number used as the basis for the FASTQ +quality score when writing FASTQ output files. The default is 33. Only +33 and 64 are valid arguments. 
+.TAG fastq_qmax +.TP +.BI \-\-fastq_qmax\~ "positive integer" +Specify the maximum quality score accepted when reading FASTQ +files. The default is 41, which is usual for recent Sanger/Illumina +1.8+ files. +.TAG fastq_qmaxout +.TP +.BI \-\-fastq_qmaxout\~ "positive integer" +Specify the maximum quality score used when writing +FASTQ files. The default +is 41, which is usual for recent Sanger/Illumina 1.8+ files. Older +formats may use a maximum quality score of 40. +.TAG fastq_qmin +.TP +.BI \-\-fastq_qmin\~ "positive integer" +Specify the minimum quality score accepted for FASTQ files. The +default is 0, which is usual for recent Sanger/Illumina 1.8+ +files. Older formats may use scores between -5 and 2. +.TAG fastq_qminout +.TP +.BI \-\-fastq_qminout\~ "positive integer" +Specify the minimum quality score used when writing FASTQ files. The +default is 0, which is usual for Sanger/Illumina 1.8+ files. Older +versions of the format may use scores between -5 and 2. +.TAG fastq_qout_max +.TP +.BI \-\-fastq_qout_max +For \-\-fastx_uniques, indicate that the new quality scores computed +when dereplicating FASTQ files should be equal to the maximum (best) +of the input quality scores for each position (corresponding to the +lowest error probability). The default is to output a quality score +corresponding to the average of the error probabilities for each +position. +.TAG fastx_uniques +.TP +.BI \-\-fastx_uniques \0filename +Merge strictly identical sequences contained in FASTA or FASTQ file +\fIfilename\fR. Identical sequences are defined as having the same +length and the same string of nucleotides (case insensitive, T and U +are considered the same). See the options \-\-sizein and \-\-sizeout +to take into account and compute abundance values. This command does +not support multithreading. By default, the quality scores in FASTQ +output files will correspond to the average error probability of the +nucleotides in each position.
If the \-\-fastq_qout_max option is +given, the quality score will be the highest (best) quality score +observed in each position. .TAG maxuniquesize .TP .BI \-\-maxuniquesize\~ "positive integer" @@ -1059,7 +1194,8 @@ header of the first sequence of their group. If \-\-sizeout is used, the number of occurrences (i.e. abundance) of each sequence is indicated at the end of their fasta header using the -pattern ';size=\fIinteger\fR;'. +pattern ';size=\fIinteger\fR;'. This option is not allowed for +fastx_uniques. .TP .TAG relabel .BI \-\-relabel \0string @@ -1116,6 +1252,20 @@ .BI \-\-strand\~ "plus|both" When searching for strictly identical sequences, check the \fIplus\fR strand only (default) or check \fIboth\fR strands. +.TAG tabbedout +.TP +.BI \-\-tabbedout \0filename +Output clustering info to the specified tab-separated text file with 6 +columns and a row for each input sequence. Column 1 contains the +original label/header of the sequence. Column 2 contains the label of +the output sequence which is equal to the label/header of the first +sequence in each cluster, but potentially relabelled. Column 3 +contains the cluster number, starting from 0. Column 4 contains the +sequence number within each cluster, starting at 0. Column 5 contains +the number of sequences in the cluster. Column 6 contains the original +label/header of the first sequence in the cluster before any potential +relabelling. This option is only valid for the \-\-fastx_uniques +command. .TAG topn .TP .BI \-\-topn\~ "positive integer" @@ -1861,11 +2011,6 @@ string. The default is a string of I's equal in length to the sequence padding string. The letter I corresponds to a base quality score of 40 indicating a very high quality base with error probability of 0.0001. -.TAG label_suffix -.TP -.BI \-\-label_suffix\~ string -When using \-\-fastx_revcomp or \-\-fastq_mergepairs, add the suffix -\fIstring\fR to sequence headers. 
.TAG maxsize .TP .BI \-\-maxsize\~ "positive integer" @@ -2143,74 +2288,20 @@ .RE .PP .\" ---------------------------------------------------------------------------- -.TAG restriction-site-cutting-options -Restriction site cutting options: -.RS -.PP -The input sequences in the file specified with the \-\-cut command are -cut into fragments at all restriction sites matching the pattern given -with the \-\-cut_pattern option. The fragments on the forward strand -are written to the file specified with the \-\-fastaout file and the -fragments on the reverse strand are written to the file specified with -the \-\-fastaout_rev option. Input sequences that do not match are -written to the file specified with the option \-\-fastaout_discarded, -and their reverse complement are also written to the file specfied -with the \-\-fastaout_discarded_rev option. The relabel options -(\-\-relabel, \-\-relabel_self, \-\-relabel_keep, \-\-relabel_md5, and -\-\-relabel_sha1) may be used to relabel the output sequences). -.TAG cut -.TP 9 -.BI \-\-cut \0filename -Specify the input file with sequences in FASTA format. -.TAG cut_pattern -.TP -.BI \-\-cut_pattern \0string -Specify the restriction site cutting pattern and positions. The -pattern is a string of lower- or uppercase letters specifying the -nucleotides that must match, and may include ambiguous nucleotide -symbols. The special characters "^" (circumflex) and "_" (underscore) -are used to indicate the cutting position on the forward and reverse -strand, respectively. For example, the pattern "G^AATT_C" is the -pattern for the EcoRI restriction site. For such palindromic patterns -(identical to its reverse complement) the command will output all -possible fragments on both strands. For non-palindromic sites, it may -be necessary to run the command also on the reverse complemented input -sequences. Exactly one cutting site on each strand must be indicated. 
-.TAG fastaout -.TP -.BI \-\-fastaout \0filename -Specify the output file for the resulting fragments on the forward -strand. -.TAG fastaout_rev -.TP -.BI \-\-fastaout_rev \0filename -Specify the output file for the resulting fragments on the reverse -strand. -.TAG fastaout_discarded -.TP -.BI \-\-fastaout_discarded \0filename -Specify the output file for the non-matching sequences. -.TAG fastaout_discarded_rev -.TP -.BI \-\-fastaout_discarded_rev \0filename -Specify the output file for the non-matching seqeunces, reverse -complemented. -.RE -.PP -.\" ---------------------------------------------------------------------------- .TAG pairwise-alignment-options Pairwise alignment options: .RS .PP The results of the n * (n-1) / 2 pairwise alignments are written to the result files specified with \-\-alnout, \-\-blast6out, -\-\-fastapairs \-\-matched, \-\-notmatched, \-\-samout, \-\-uc or -\-\-userout (see Searching section below). Specify either the -\-\-acceptall option to output all pairwise alignments, or specify an -identity level with \-\-id to discard weak alignments. Most other -accept/reject options (see Searching options below) may also be -used. Sequences are aligned on their \fIplus\fR strand only. Masking -is performed as usual and specified with \-\-qmask and \-\-hardmask. +\-\-fastapairs \-\-matched, \-\-notmatched, \-\-qsegout, \-\-samout, +\-\-tsegout, \-\-uc or \-\-userout (see Searching section +below). Specify either the \-\-acceptall option to output all pairwise +alignments, or specify an identity level with \-\-id to discard weak +alignments. Most other accept/reject options (see Searching options +below) may also be used. Sequences are aligned on their \fIplus\fR +strand only. Masking is performed as usual and specified with +\-\-qmask and \-\-hardmask. 
.TAG acceptall .TP 9 .B \-\-acceptall @@ -2276,6 +2367,61 @@ .RE .PP .\" ---------------------------------------------------------------------------- +.TAG restriction-site-cutting-options +Restriction site cutting options: +.RS +.PP +The input sequences in the file specified with the \-\-cut command are +cut into fragments at all restriction sites matching the pattern given +with the \-\-cut_pattern option. The fragments on the forward strand +are written to the file specified with the \-\-fastaout option and the +fragments on the reverse strand are written to the file specified with +the \-\-fastaout_rev option. Input sequences that do not match are +written to the file specified with the option \-\-fastaout_discarded, +and their reverse complement are also written to the file specified +with the \-\-fastaout_discarded_rev option. The relabel options +(\-\-relabel, \-\-relabel_self, \-\-relabel_keep, \-\-relabel_md5, and +\-\-relabel_sha1) may be used to relabel the output sequences. +.TAG cut +.TP 9 +.BI \-\-cut \0filename +Specify the input file with sequences in FASTA format. +.TAG cut_pattern +.TP +.BI \-\-cut_pattern \0string +Specify the restriction site cutting pattern and positions. The +pattern is a string of lower- or uppercase letters specifying the +nucleotides that must match, and may include ambiguous nucleotide +symbols. The special characters "^" (circumflex) and "_" (underscore) +are used to indicate the cutting position on the forward and reverse +strand, respectively. For example, the pattern "G^AATT_C" is the +pattern for the EcoRI restriction site. For such palindromic patterns +(identical to its reverse complement) the command will output all +possible fragments on both strands. For non-palindromic sites, it may +be necessary to run the command also on the reverse complemented input +sequences. Exactly one cutting site on each strand must be indicated.
+.TAG fastaout +.TP +.BI \-\-fastaout \0filename +Specify the output file for the resulting fragments on the forward +strand. +.TAG fastaout_rev +.TP +.BI \-\-fastaout_rev \0filename +Specify the output file for the resulting fragments on the reverse +strand. +.TAG fastaout_discarded +.TP +.BI \-\-fastaout_discarded \0filename +Specify the output file for the non-matching sequences. +.TAG fastaout_discarded_rev +.TP +.BI \-\-fastaout_discarded_rev \0filename +Specify the output file for the non-matching sequences, reverse +complemented. +.RE +.PP +.\" ---------------------------------------------------------------------------- .TAG searching-options Searching options: .RS @@ -2496,6 +2642,31 @@ .BI \-\-idsuffix\~ "positive integer" Reject the sequence match if the last \fIinteger\fR nucleotides of the target do not match the query. +.TAG lca_cutoff +.TP +.BI \-\-lca_cutoff \0real +Adjust the fraction of matching hits required for the last common +ancestor (LCA) output with the \-\-lcaout option during searches. The +default value is 1.0 which requires all hits to match at each +taxonomic rank for that rank to be included. If a lower cutoff value +is used, e.g. 0.95, a small fraction of non-matching hits are allowed +while that rank will still be reported. The argument to this option +must be larger than 0.5, but not larger than 1.0. +.TAG lcaout +.TP +.BI \-\-lcaout \0filename +Output last common ancestor (LCA) information about the hits of each +query to a text file in a tab-separated format. The first column +contains the query id, while the second column contains the taxonomic +information. The headers of the sequences in the database must contain +taxonomic information in the same format as used with the \-\-sintax +command, e.g. "tax=k:Archaea,p:Euryarchaeota,c:Halobacteria". Only the +initial parts of the taxonomy that are common to a large fraction of +the hits of each query will be output.
It is necessary to set the +\-\-maxaccepts option to a value different from 1 for this +information to be useful. The \-\-top_hits_only option may also be +useful. The fraction of matching hits required may be adjusted by the +\-\-lca_cutoff option (default 1.0). .TAG leftjust .TP .B \-\-leftjust @@ -2665,6 +2836,11 @@ using the dust or the soft algorithms, or do not mask (none). Warning, when using soft masking search commands become case sensitive. The default is to mask using \fIdust\fR. +.TAG qsegout +.TP +.BI \-\-qsegout \0filename +Write the aligned part of each query sequence to \fIfilename\fR in +FASTA format. .TAG query_cov .TP .BI \-\-query_cov \0real @@ -2797,15 +2973,21 @@ .TAG top_hits_only .TP .B \-\-top_hits_only -Only the top hits between the query and database sequence sets are -written to the output specified with the options \-\-alnout, -\-\-samout, \-\-userout, \-\-blast6out, \-\-uc, \-\-fastapairs, -\-\-matched or \-\-notmatched (but not \-\-dbmatched and -\-\-dbnotmatched). For each query, the top hit is the one presenting -the highest percentage of identity (see the \-\-iddef option to change -the way identity is measured). For a given query, if several top hits -present exactly the same percentage of identity, the number of hits -reported is controlled by the \-\-maxaccepts value (1 by default). +Only the top hits with an equally high percentage of identity between +the query and database sequence sets are written to the output +specified with the options \-\-lcaout, \-\-alnout, \-\-samout, +\-\-userout, \-\-blast6out, \-\-uc, \-\-fastapairs, \-\-matched or +\-\-notmatched (but not \-\-dbmatched and \-\-dbnotmatched). For each +query, the top hit is the one presenting the highest percentage of +identity (see the \-\-iddef option to change the way identity is +measured).
For a given query, if several top hits present exactly the +same percentage of identity, the number of hits reported is controlled +by the \-\-maxaccepts value (1 by default). +.TAG tsegout +.TP +.BI \-\-tsegout \0filename +Write the aligned part of each target sequence to \fIfilename\fR in +FASTA format. .TAG uc .TP .BI \-\-uc \0filename @@ -3203,11 +3385,14 @@ .URL https://doi.org/10.1101/074161 (link) .PP The name of the fasta file containing the input sequences to be -classified is given as an argument to the \-\-sintax command. The reference -sequence database is specified with the \-\-db option. The results are -written in a tab delimited text file whose name is specified with the -\-\-tabbedout option. The \-\-sintax_cutoff option may be used to set a -minimum level of bootstrap support for the taxonomic ranks to be reported. +classified is given as an argument to the \-\-sintax command. The +reference sequence database is specified with the \-\-db option. The +results are written in a tab delimited text file whose name is +specified with the \-\-tabbedout option. The \-\-sintax_cutoff option +may be used to set a minimum level of bootstrap support for the +taxonomic ranks to be reported. The \-\-randseed option may be +included to specify a seed for initialisation of the random number +generator used by the algorithm. .PP Multithreading is supported. Databases in UDB files are supported. The strand option may be specified. @@ -3231,6 +3416,13 @@ .BI \-\-db \0filename Read the reference sequences from \fIfilename\fR, in FASTA, FASTQ or UDB format. These sequences needs to be annotated with taxonomy. +.TAG randseed +.TP +.BI \-\-randseed\~ "positive integer" +Use \fIinteger\fR as seed for the random number generator used in the +Sintax algorithm. A given seed always produces the same output order +(useful for replicability). Set to 0 to use a pseudo-random seed +(default behavior).
.TAG sintax_cutoff .TP .BI \-\-sintax_cutoff\~ "real" @@ -4417,6 +4609,28 @@ Added the fasta2fastq command. Fixed search bug on ppc64le. Fixed bug with removal of size and ee info in uc files. Fixed compilation errors in some cases. Made some general code improvements. Updated manual. +.TP +.BR v2.19.0\~ "released December 21st, 2021" +Added the lcaout and lca_cutoff options to enable the output of last +common ancestor (LCA) information about hits when searching. The +randseed option was added as a valid option to the sintax +command. Code improvements. +.TP +.BR v2.20.0\~ "released January 10th, 2022" +Added the fastx_uniques command and the fastq_qout_max option for +dereplication of FASTQ files. Some code cleaning. +.TP +.BR v2.20.1\~ "released January 11th, 2022" +Fixes a bug in fastq_mergepairs that caused an occasional hang at the +end when using multiple threads. +.TP +.BR v2.21.0\~ "released January 12th, 2022" +This version adds the sample, qsegout and tsegout options. It enables +the use of UDB databases with uchime_ref. +.TP +.BR v2.21.1\~ "released January 18th, 2022" +Fix a problem with dereplication of empty input files. Update Altivec +code on ppc64le for improved compiler compatibility (vector->__vector). .LP .\" ============================================================================ .\" TODO: diff -Nru vsearch-2.18.0/README.md vsearch-2.21.1/README.md --- vsearch-2.18.0/README.md 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/README.md 2022-01-18 17:26:39.000000000 +0000 @@ -37,7 +37,7 @@ ## Getting Help -If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion.
+If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example @@ -50,16 +50,16 @@ **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` -wget https://github.com/torognes/vsearch/archive/v2.18.0.tar.gz -tar xzf v2.18.0.tar.gz -cd vsearch-2.18.0 +wget https://github.com/torognes/vsearch/archive/v2.21.1.tar.gz +tar xzf v2.21.1.tar.gz +cd vsearch-2.21.1 ./autogen.sh -./configure +./configure CFLAGS="-O3" CXXFLAGS="-O3" make make install # as root or sudo make install ``` -You may customize the installation directory using the `--prefix=DIR` option to `configure`. If the compression libraries [zlib](https://www.zlib.net) and/or [bzip2](https://www.sourceware.org/bzip2/) are installed on the system, they will be detected automatically and support for compressed files will be included in vsearch. Support for compressed files may be disabled using the `--disable-zlib` and `--disable-bzip2` options to `configure`. A PDF version of the manual will be created from the `vsearch.1` manual file if `ps2pdf` is available, unless disabled using the `--disable-pdfman` option to `configure`. Other options may also be applied to `configure`, please run `configure -h` to see them all. GNU autoconf (version 2.63 or later), automake and the GCC C++ compiler is required to build vsearch. +You may customize the installation directory using the `--prefix=DIR` option to `configure`. If the compression libraries [zlib](https://www.zlib.net) and/or [bzip2](https://www.sourceware.org/bzip2/) are installed on the system, they will be detected automatically and support for compressed files will be included in vsearch. 
Support for compressed files may be disabled using the `--disable-zlib` and `--disable-bzip2` options to `configure`. A PDF version of the manual will be created from the `vsearch.1` manual file if `ps2pdf` is available, unless disabled using the `--disable-pdfman` option to `configure`. It is recommended to run configure with the options `CFLAGS="-O3"` and `CXXFLAGS="-O3"`. Other options may also be applied to `configure`, please run `configure -h` to see them all. GNU autoconf (version 2.63 or later), automake and the GCC C++ compiler is required to build vsearch. Version 3.82 or later of Make may be required on Linux, while version 3.81 is sufficient on macOS. The distributed Linux ppc64le and aarch64 binaries and the Windows binary were compiled using the [Mingw-w64](http://mingw-w64.org/) C++ cross-compiler. @@ -81,43 +81,43 @@ Download the appropriate executable for your system using the following commands if you are using a Linux x86_64 system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch-2.18.0-linux-x86_64.tar.gz -tar xzf vsearch-2.18.0-linux-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-x86_64.tar.gz +tar xzf vsearch-2.21.1-linux-x86_64.tar.gz ``` Or these commands if you are using a Linux ppc64le system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch-2.18.0-linux-ppc64le.tar.gz -tar xzf vsearch-2.18.0-linux-ppc64le.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-ppc64le.tar.gz +tar xzf vsearch-2.21.1-linux-ppc64le.tar.gz ``` Or these commands if you are using a Linux aarch64 (arm64) system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch-2.18.0-linux-aarch64.tar.gz -tar xzf vsearch-2.18.0-linux-aarch64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-aarch64.tar.gz +tar xzf 
vsearch-2.21.1-linux-aarch64.tar.gz ``` Or these commands if you are using a Mac: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch-2.18.0-macos-x86_64.tar.gz -tar xzf vsearch-2.18.0-macos-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-macos-x86_64.tar.gz +tar xzf vsearch-2.21.1-macos-x86_64.tar.gz ``` Or if you are using Windows, download and extract (unzip) the contents of this file: ``` -https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch-2.18.0-win-x86_64.zip +https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-win-x86_64.zip ``` -Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.18.0-linux-x86_64` or `vsearch-2.18.0-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. Versions with statically compiled libraries are available for Linux systems. These have "-static" in their name, and could be used on systems that do not have all the necessary libraries installed. +Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.21.1-linux-x86_64` or `vsearch-2.21.1-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. Versions with statically compiled libraries are available for Linux systems. 
These have "-static" in their name, and could be used on systems that do not have all the necessary libraries installed. -Windows: You will now have the binary distribution in a folder called `vsearch-2.18.0-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. +Windows: You will now have the binary distribution in a folder called `vsearch-2.21.1-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. -**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.18.0/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). +**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. 
The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Packages, plugins, and wrappers @@ -191,7 +191,7 @@ ## Code -The code is written in C++ but most of it is actually mostly C with some C++ syntax conventions. +The code is written mostly in C++. File | Description ---|--- @@ -205,6 +205,7 @@ **city.cc** | CityHash code **cluster.cc** | Clustering (cluster\_fast and cluster\_smallmem) **cpu.cc** | Code dependent on specific cpu features (e.g. ssse3) +**cut.cc** | Restriction site cutting **db.cc** | Handles the database file read, access etc **dbhash.cc** | Database hashing for exact searches **dbindex.cc** | Indexes the database by identifying unique kmers in the sequences @@ -238,9 +239,11 @@ **sha1.c** | SHA1 message digest **showalign.cc** | Output an alignment in a human-readable way given a CIGAR-string and the sequences **shuffle.cc** | Shuffle sequences +**sintax.cc** | Taxonomic classification using Sintax method **sortbylength.cc** | Code for sorting by length **sortbysize.cc** | Code for sorting by size (abundance) **subsample.cc** | Subsampling reads from a FASTA file +**tax.cc** | Taxonomy information parsing **udb.cc** | UDB database file handling **unique.cc** | Find unique kmers in a sequence **userfields.cc** | Code for parsing the userfields option argument @@ -316,6 +319,11 @@ *Bioinformatics*, 27 (16): 2194-2200. doi:[10.1093/bioinformatics/btr381](https://doi.org/10.1093/bioinformatics/btr381) +* Edgar RC, Flyvbjerg H (2015) +**Error filtering, pair assembly and error correction for next-generation sequencing reads.** +*Bioinformatics*, 31 (21): 3476-3482. 
+doi:[10.1093/bioinformatics/btv401](https://doi.org/10.1093/bioinformatics/btv401) + * Guillou L, Bachar D, Audic S, Bass D, Berney C, Bittner L, Boutte C, Burgaud G, de Vargas C, Decelle J, del Campo J, Dolan J, Dunthorn M, Edvardsen B, Holzmann M, Kooistra W, Lara E, Lebescot N, Logares R, Mahé F, Massana R, Montresor M, Morard R, Not F, Pawlowski J, Probert I, Sauvadet A-L, Siano R, Stoeck T, Vaulot D, Zimmermann P & Christen R (2013) **The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy.** *Nucleic Acids Research*, 41 (D1), D597-D604. diff -Nru vsearch-2.18.0/src/align_simd.cc vsearch-2.21.1/src/align_simd.cc --- vsearch-2.18.0/src/align_simd.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/align_simd.cc 2022-01-18 17:26:39.000000000 +0000 @@ -102,25 +102,25 @@ #ifdef __PPC__ -typedef vector signed short VECTOR_SHORT; +typedef __vector signed short VECTOR_SHORT; -const vector unsigned char perm_merge_long_low = +const __vector unsigned char perm_merge_long_low = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17}; -const vector unsigned char perm_merge_long_high = +const __vector unsigned char perm_merge_long_high = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; #define v_init(a,b,c,d,e,f,g,h) (const VECTOR_SHORT){a,b,c,d,e,f,g,h} #define v_load(a) vec_ld(0, (VECTOR_SHORT *)(a)) -#define v_store(a, b) vec_st((vector unsigned char)(b), 0, \ - (vector unsigned char *)(a)) +#define v_store(a, b) vec_st((__vector unsigned char)(b), 0, \ + (__vector unsigned char *)(a)) #define v_add(a, b) vec_adds((a), (b)) #define v_sub(a, b) vec_subs((a), (b)) #define v_sub_unsigned(a, b) ((VECTOR_SHORT) \ - vec_subs((vector unsigned short) (a), \ - (vector unsigned short) (b))) + vec_subs((__vector unsigned short) (a), \ + (__vector unsigned short) (b))) #define v_max(a, b) vec_max((a), 
(b)) #define v_min(a, b) vec_min((a), (b)) #define v_dup(a) vec_splat((VECTOR_SHORT){(short)(a), 0, 0, 0, 0, 0, 0, 0}, 0); @@ -298,13 +298,13 @@ { #ifdef __PPC__ - vector signed short reg0, reg1, reg2, reg3, + __vector signed short reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - vector signed int reg8, reg9, reg10,reg11, + __vector signed int reg8, reg9, reg10,reg11, reg12,reg13,reg14,reg15; - vector signed long long reg16,reg17,reg18,reg19, + __vector signed long long reg16,reg17,reg18,reg19, reg20,reg21,reg22,reg23; - vector signed long long reg24,reg25,reg26,reg27, + __vector signed long long reg24,reg25,reg26,reg27, reg28,reg29,reg30,reg31; #else VECTOR_SHORT reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; @@ -323,39 +323,39 @@ reg7 = v_load(score_matrix_word + d[7] + i); #ifdef __PPC__ - reg8 = (vector signed int) vec_mergeh(reg0, reg1); - reg9 = (vector signed int) vec_mergel(reg0, reg1); - reg10 = (vector signed int) vec_mergeh(reg2, reg3); - reg11 = (vector signed int) vec_mergel(reg2, reg3); - reg12 = (vector signed int) vec_mergeh(reg4, reg5); - reg13 = (vector signed int) vec_mergel(reg4, reg5); - reg14 = (vector signed int) vec_mergeh(reg6, reg7); - reg15 = (vector signed int) vec_mergel(reg6, reg7); - - reg16 = (vector signed long long) vec_mergeh(reg8, reg10); - reg17 = (vector signed long long) vec_mergel(reg8, reg10); - reg18 = (vector signed long long) vec_mergeh(reg12, reg14); - reg19 = (vector signed long long) vec_mergel(reg12, reg14); - reg20 = (vector signed long long) vec_mergeh(reg9, reg11); - reg21 = (vector signed long long) vec_mergel(reg9, reg11); - reg22 = (vector signed long long) vec_mergeh(reg13, reg15); - reg23 = (vector signed long long) vec_mergel(reg13, reg15); + reg8 = (__vector signed int) vec_mergeh(reg0, reg1); + reg9 = (__vector signed int) vec_mergel(reg0, reg1); + reg10 = (__vector signed int) vec_mergeh(reg2, reg3); + reg11 = (__vector signed int) vec_mergel(reg2, reg3); + reg12 = (__vector signed int) vec_mergeh(reg4, 
reg5); + reg13 = (__vector signed int) vec_mergel(reg4, reg5); + reg14 = (__vector signed int) vec_mergeh(reg6, reg7); + reg15 = (__vector signed int) vec_mergel(reg6, reg7); + + reg16 = (__vector signed long long) vec_mergeh(reg8, reg10); + reg17 = (__vector signed long long) vec_mergel(reg8, reg10); + reg18 = (__vector signed long long) vec_mergeh(reg12, reg14); + reg19 = (__vector signed long long) vec_mergel(reg12, reg14); + reg20 = (__vector signed long long) vec_mergeh(reg9, reg11); + reg21 = (__vector signed long long) vec_mergel(reg9, reg11); + reg22 = (__vector signed long long) vec_mergeh(reg13, reg15); + reg23 = (__vector signed long long) vec_mergel(reg13, reg15); - reg24 = (vector signed long long) vec_perm + reg24 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_low); - reg25 = (vector signed long long) vec_perm + reg25 = (__vector signed long long) vec_perm (reg16, reg18, perm_merge_long_high); - reg26 = (vector signed long long) vec_perm + reg26 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_low); - reg27 = (vector signed long long) vec_perm + reg27 = (__vector signed long long) vec_perm (reg17, reg19, perm_merge_long_high); - reg28 = (vector signed long long) vec_perm + reg28 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_low); - reg29 = (vector signed long long) vec_perm + reg29 = (__vector signed long long) vec_perm (reg20, reg22, perm_merge_long_high); - reg30 = (vector signed long long) vec_perm + reg30 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_low); - reg31 = (vector signed long long) vec_perm + reg31 = (__vector signed long long) vec_perm (reg21, reg23, perm_merge_long_high); #else reg8 = v_merge_lo_16(reg0, reg1); @@ -428,38 +428,38 @@ /* The VSX vec_bperm instruction puts the 16 selected bits of the first source into bits 48-63 of the destination. 
*/ -const vector unsigned char perm = { 120, 112, 104, 96, 88, 80, 72, 64, +const __vector unsigned char perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; #define ALIGNCORE(H, N, F, V, RES, QR_q, R_q, QR_t, R_t, H_MIN, H_MAX) \ { \ - vector unsigned short W, X, Y, Z; \ - vector unsigned int WX, YZ; \ - vector short VV; \ + __vector unsigned short W, X, Y, Z; \ + __vector unsigned int WX, YZ; \ + __vector short VV; \ VV = v_load(&V); \ H = v_add(H, VV); \ - W = (vector unsigned short) VECTORBYTEPERMUTE \ - ((vector unsigned char) vec_cmpgt(F, H), perm); \ + W = (__vector unsigned short) VECTORBYTEPERMUTE \ + ((__vector unsigned char) vec_cmpgt(F, H), perm); \ H = v_max(H, F); \ - X = (vector unsigned short) VECTORBYTEPERMUTE \ - ((vector unsigned char) vec_cmpgt(E, H), perm); \ + X = (__vector unsigned short) VECTORBYTEPERMUTE \ + ((__vector unsigned char) vec_cmpgt(E, H), perm); \ H = v_max(H, E); \ H_MIN = v_min(H_MIN, H); \ H_MAX = v_max(H_MAX, H); \ N = H; \ HF = v_sub(H, QR_t); \ F = v_sub(F, R_t); \ - Y = (vector unsigned short) VECTORBYTEPERMUTE \ - ((vector unsigned char) vec_cmpgt(F, HF), perm); \ + Y = (__vector unsigned short) VECTORBYTEPERMUTE \ + ((__vector unsigned char) vec_cmpgt(F, HF), perm); \ F = v_max(F, HF); \ HE = v_sub(H, QR_q); \ E = v_sub(E, R_q); \ - Z = (vector unsigned short) VECTORBYTEPERMUTE \ - ((vector unsigned char) vec_cmpgt(E, HE), perm); \ + Z = (__vector unsigned short) VECTORBYTEPERMUTE \ + ((__vector unsigned char) vec_cmpgt(E, HE), perm); \ E = v_max(E, HE); \ - WX = (vector unsigned int) vec_mergel(W, X); \ - YZ = (vector unsigned int) vec_mergel(Y, Z); \ - RES = (vector unsigned long long) vec_mergeh(WX, YZ); \ + WX = (__vector unsigned int) vec_mergel(W, X); \ + YZ = (__vector unsigned int) vec_mergel(Y, Z); \ + RES = (__vector unsigned long long) vec_mergeh(WX, YZ); \ } #else @@ -527,7 +527,7 @@ VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ - vector unsigned long long RES1, RES2, RES; + __vector 
unsigned long long RES1, RES2, RES; #endif int64_t i; @@ -679,7 +679,7 @@ VECTOR_SHORT h_max = v_zero; #ifdef __PPC__ - vector unsigned long long RES1, RES2, RES; + __vector unsigned long long RES1, RES2, RES; #endif int64_t i; diff -Nru vsearch-2.18.0/src/allpairs.cc vsearch-2.21.1/src/allpairs.cc --- vsearch-2.18.0/src/allpairs.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/allpairs.cc 2022-01-18 17:26:39.000000000 +0000 @@ -80,6 +80,8 @@ static FILE * fp_fastapairs = nullptr; static FILE * fp_matched = nullptr; static FILE * fp_notmatched = nullptr; +static FILE * fp_qsegout = nullptr; +static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; @@ -171,6 +173,26 @@ qsequence_rc); } + if (fp_qsegout) + { + results_show_qsegout_one(fp_qsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + + if (fp_tsegout) + { + results_show_tsegout_one(fp_tsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + if (fp_uc) { if ((t==0) || opt_uc_allhits) @@ -647,6 +669,24 @@ } } + if (opt_qsegout) + { + fp_qsegout = fopen_output(opt_qsegout); + if (! fp_qsegout) + { + fatal("Unable to open qsegout output file for writing"); + } + } + + if (opt_tsegout) + { + fp_tsegout = fopen_output(opt_tsegout); + if (! 
fp_tsegout) + { + fatal("Unable to open tsegout output file for writing"); + } + } + if (opt_matched) { fp_matched = fopen_output(opt_matched); @@ -738,6 +778,14 @@ { fclose(fp_fastapairs); } + if (opt_qsegout) + { + fclose(fp_qsegout); + } + if (opt_tsegout) + { + fclose(fp_tsegout); + } if (fp_uc) { fclose(fp_uc); diff -Nru vsearch-2.18.0/src/chimera.cc vsearch-2.21.1/src/chimera.cc --- vsearch-2.18.0/src/chimera.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/chimera.cc 2022-01-18 17:26:39.000000000 +0000 @@ -1771,19 +1771,29 @@ /* prepare queries / database */ if (opt_uchime_ref) { - db_read(opt_db, 0); + /* check if the reference database may be an UDB file */ - if (opt_dbmask == MASK_DUST) + bool is_udb = udb_detect_isudb(opt_db); + + if (is_udb) { - dust_all(); + udb_read(opt_db, true, true); } - else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) + else { - hardmask_all(); + db_read(opt_db, 0); + if (opt_dbmask == MASK_DUST) + { + dust_all(); + } + else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) + { + hardmask_all(); + } + dbindex_prepare(1, opt_dbmask); + dbindex_addallsequences(opt_dbmask); } - dbindex_prepare(1, opt_dbmask); - dbindex_addallsequences(opt_dbmask); query_fasta_h = fasta_open(opt_uchime_ref); progress_total = fasta_get_size(query_fasta_h); } diff -Nru vsearch-2.18.0/src/cluster.cc vsearch-2.21.1/src/cluster.cc --- vsearch-2.18.0/src/cluster.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/cluster.cc 2022-01-18 17:26:39.000000000 +0000 @@ -91,6 +91,8 @@ static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; +static FILE * fp_qsegout = nullptr; +static FILE * fp_tsegout = nullptr; static pthread_attr_t attr; @@ -454,6 +456,26 @@ qsequence_rc); } + if (fp_qsegout) + { + results_show_qsegout_one(fp_qsegout, + best, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + + if (fp_tsegout) + { + results_show_tsegout_one(fp_tsegout, + best, + 
query_head, + qsequence, + qseqlen, + qsequence_rc); + } + if (fp_userout) { results_show_userout_one(fp_userout, best, query_head, @@ -1202,6 +1224,24 @@ } } + if (opt_qsegout) + { + fp_qsegout = fopen_output(opt_qsegout); + if (! fp_qsegout) + { + fatal("Unable to open qsegout output file for writing"); + } + } + + if (opt_tsegout) + { + fp_tsegout = fopen_output(opt_tsegout); + if (! fp_tsegout) + { + fatal("Unable to open tsegout output file for writing"); + } + } + if (opt_matched) { fp_matched = fopen_output(opt_matched); @@ -1678,6 +1718,14 @@ { fclose(fp_fastapairs); } + if (opt_qsegout) + { + fclose(fp_qsegout); + } + if (opt_tsegout) + { + fclose(fp_tsegout); + } if (fp_blast6out) { fclose(fp_blast6out); diff -Nru vsearch-2.18.0/src/cpu.cc vsearch-2.21.1/src/cpu.cc --- vsearch-2.18.0/src/cpu.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/cpu.cc 2022-01-18 17:26:39.000000000 +0000 @@ -121,32 +121,32 @@ unsigned char * bitmap, unsigned int totalbits) { - const vector unsigned char c1 = + const __vector unsigned char c1 = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; - const vector unsigned char c2 = + const __vector unsigned char c2 = { 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f, 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f }; - const vector unsigned char c3 = + const __vector unsigned char c3 = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; unsigned short * p = (unsigned short *)(bitmap); - vector signed short * q = (vector signed short *) (counters); + __vector signed short * q = (__vector signed short *) (counters); int r = (totalbits + 15) / 16; for(int j=0; j alloc_seqs)) + if (extra_info && (sequencecount + 1 > alloc_seqs)) { uint64_t new_alloc_seqs = 2 * alloc_seqs; @@ -423,6 +514,7 @@ char * seq = fastx_get_sequence(h); char * header = fastx_get_header(h); int64_t headerlen = fastx_get_header_length(h); + char * qual = fastx_get_quality(h); // nullptr if FASTA /* 
normalize sequence: uppercase and replace U by T */ string_normalize(seq_up, seq, seqlen); @@ -488,10 +580,9 @@ { bp = rc_bp; j = k; - if (opt_uc) + if (extra_info) { match_strand[sequencecount] = 1; - } } } @@ -503,15 +594,75 @@ if (bp->size) { /* at least one identical sequence already */ - bp->size += ab; - - if (opt_uc) + if (extra_info) { unsigned int last = bp->seqno_last; nextseqtab[last] = sequencecount; bp->seqno_last = sequencecount; headertab[sequencecount] = xstrdup(header); } + + int64_t s1 = bp->size; + int64_t s2 = ab; + int64_t s3 = s1 + s2; + + if (opt_fastqout) + { + /* update quality scores */ + for (int i = 0; i < seqlen; i++) + { + int q1 = bp->qual[i]; + int q2 = qual[i]; + double p1 = convert_q_to_p(q1); + double p2 = convert_q_to_p(q2); + double p3; + + /* how to compute the new quality score? */ + + if (opt_fastq_qout_max) + { + // fastq_qout_max + /* min error prob, highest quality */ + p3 = MIN(p1, p2); + } + else + { + // fastq_qout_avg + /* average, as in USEARCH */ + p3 = (p1 * s1 + p2 * s2) / s3; + } + + // fastq_qout_min + /* max error prob, lowest quality */ + // p3 = MAX(p1, p2); + + // fastq_qout_first + /* keep first */ + // p3 = p1; + + // fastq_qout_last + /* keep last */ + // p3 = p2; + + // fastq_qout_ef + /* Compute as multiple independent observations + Edgar & Flyvbjerg (2015) + But what about s1 and s2? 
*/ + // p3 = p1 * p2 / 3.0 / (1.0 - p1 - p2 + (4.0 * p1 * p2 / 3.0)); + + /* always worst quality possible, certain error */ + // p3 = 1.0; + + // always best quality possible, perfect, no errors */ + // p3 = 0.0; + + int q3 = convert_p_to_q(p3); + bp->qual[i] = q3; + } + } + + bp->size = s3; + bp->count++; } else { @@ -522,6 +673,11 @@ bp->seqno_last = sequencecount; bp->seq = xstrdup(seq); bp->header = xstrdup(header); + bp->count = 1; + if (qual) + bp->qual = xstrdup(qual); + else + bp->qual = nullptr; clusters++; } @@ -700,9 +856,9 @@ /* write output */ - if (opt_output) + if (opt_output || opt_fastaout) { - progress_init("Writing output file", clusters); + progress_init("Writing FASTA output file", clusters); int64_t relabel_count = 0; for (uint64_t i=0; i= opt_minuniquesize) && (size <= opt_maxuniquesize)) { relabel_count++; - fasta_print_general(fp_output, + fasta_print_general(fp_fastaout, nullptr, bp->seq, strlen(bp->seq), @@ -731,7 +887,40 @@ } progress_done(); - fclose(fp_output); + fclose(fp_fastaout); + } + + if (opt_fastqout) + { + progress_init("Writing FASTQ output file", clusters); + + int64_t relabel_count = 0; + for (uint64_t i=0; isize; + if ((size >= opt_minuniquesize) && (size <= opt_maxuniquesize)) + { + relabel_count++; + fastq_print_general(fp_fastqout, + bp->seq, + strlen(bp->seq), + bp->header, + strlen(bp->header), + bp->qual, + size, + relabel_count, + -1.0); + if (relabel_count == opt_topn) + { + break; + } + } + progress_update(i); + } + + progress_done(); + fclose(fp_fastqout); } show_rusage(); @@ -775,6 +964,46 @@ progress_done(); } + if (opt_tabbedout) + { + progress_init("Writing tab separated file", clusters); + for (uint64_t i=0; iheader; + + if (opt_relabel) + fprintf(fp_tabbedout, + "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", + hh, opt_relabel, i + 1, i, (uint64_t) 0, bp->count, hh); + else + fprintf(fp_tabbedout, + "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", + hh, hh, i, (uint64_t) 0, bp->count, hh); + + 
uint64_t j = 1; + for (unsigned int next = nextseqtab[bp->seqno_first]; + next != terminal; + next = nextseqtab[next]) + { + if (opt_relabel) + fprintf(fp_tabbedout, + "%s\t%s%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", + headertab[next], opt_relabel, i + 1, i, j, bp->count, hh); + else + fprintf(fp_tabbedout, + "%s\t%s\t%" PRIu64 "\t%" PRIu64 "\t%u\t%s\n", + headertab[next], hh, i, j, bp->count, hh); + j++; + } + + progress_update(i); + } + fclose(fp_tabbedout); + progress_done(); + } + + show_rusage(); if (selected < clusters) @@ -809,6 +1038,8 @@ { xfree(bp->seq); xfree(bp->header); + if (bp->qual) + xfree(bp->qual); } } @@ -838,6 +1069,11 @@ FILE * fp_output = nullptr; FILE * fp_uc = nullptr; + if (opt_strand > 1) + { + fatal("Option '--strand both' not supported with --derep_prefix"); + } + if (opt_output) { fp_output = fopen_output(opt_output); @@ -1233,13 +1469,3 @@ xfree(hashtable); db_free(); } - -void derep_fulllength() -{ - derep(opt_derep_fulllength, false); -} - -void derep_id() -{ - derep(opt_derep_id, true); -} diff -Nru vsearch-2.18.0/src/derep.h vsearch-2.21.1/src/derep.h --- vsearch-2.18.0/src/derep.h 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/derep.h 2022-01-18 17:26:39.000000000 +0000 @@ -58,6 +58,5 @@ */ -void derep_fulllength(); -void derep_id(); +void derep(char * input_filename, bool use_header); void derep_prefix(); diff -Nru vsearch-2.18.0/src/eestats.cc vsearch-2.21.1/src/eestats.cc --- vsearch-2.18.0/src/eestats.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/eestats.cc 2022-01-18 17:26:39.000000000 +0000 @@ -117,6 +117,9 @@ void fastq_eestats() { + if (!opt_output) + fatal("Output file for fastq_eestats must be specified with --output"); + fastx_handle h = fastq_open(opt_fastq_eestats); uint64_t filesize = fastq_get_size(h); @@ -436,6 +439,9 @@ void fastq_eestats2() { + if (!opt_output) + fatal("Output file for fastq_eestats2 must be specified with --output"); + fastx_handle h = 
fastq_open(opt_fastq_eestats2); uint64_t filesize = fastq_get_size(h); diff -Nru vsearch-2.18.0/src/fasta.cc vsearch-2.21.1/src/fasta.cc --- vsearch-2.18.0/src/fasta.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/fasta.cc 2022-01-18 17:26:39.000000000 +0000 @@ -401,6 +401,11 @@ fprintf(fp, "%s", opt_label_suffix); } + if (opt_sample) + { + fprintf(fp, ";sample=%s", opt_sample); + } + if (clustersize > 0) { fprintf(fp, ";seqs=%d", clustersize); diff -Nru vsearch-2.18.0/src/fastq.cc vsearch-2.21.1/src/fastq.cc --- vsearch-2.18.0/src/fastq.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/fastq.cc 2022-01-18 17:26:39.000000000 +0000 @@ -557,6 +557,11 @@ fprintf(fp, "%s", opt_label_suffix); } + if (opt_sample) + { + fprintf(fp, ";sample=%s", opt_sample); + } + if (opt_sizeout && (abundance > 0)) { fprintf(fp, ";size=%u", abundance); diff -Nru vsearch-2.18.0/src/fastqops.cc vsearch-2.21.1/src/fastqops.cc --- vsearch-2.18.0/src/fastqops.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/fastqops.cc 2022-01-18 17:26:39.000000000 +0000 @@ -732,6 +732,9 @@ char * seq_buffer = (char*) xmalloc(buffer_alloc); char * qual_buffer = (char*) xmalloc(buffer_alloc); + if ((!opt_fastaout) && (!opt_fastqout)) + fatal("No output files specified"); + fastx_handle h = fastx_open(opt_fastx_revcomp); if (!h) @@ -867,6 +870,9 @@ void fastq_convert() { + if (! 
opt_fastqout) + fatal("No output file specified with --fastqout"); + fastx_handle h = fastq_open(opt_fastq_convert); if (!h) diff -Nru vsearch-2.18.0/src/fastx.cc vsearch-2.21.1/src/fastx.cc --- vsearch-2.18.0/src/fastx.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/fastx.cc 2022-01-18 17:26:39.000000000 +0000 @@ -464,6 +464,11 @@ return h->is_fastq || h->is_empty; } +bool fastx_is_empty(fastx_handle h) +{ + return h->is_empty; +} + void fastx_close(fastx_handle h) { /* Warn about stripped chars */ diff -Nru vsearch-2.18.0/src/fastx.h vsearch-2.21.1/src/fastx.h --- vsearch-2.18.0/src/fastx.h 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/fastx.h 2022-01-18 17:26:39.000000000 +0000 @@ -115,6 +115,7 @@ /* fastx input */ bool fastx_is_fastq(fastx_handle h); +bool fastx_is_empty(fastx_handle h); void fastx_filter_header(fastx_handle h, bool truncateatspace); fastx_handle fastx_open(const char * filename); void fastx_close(fastx_handle h); diff -Nru vsearch-2.18.0/src/Makefile.am vsearch-2.21.1/src/Makefile.am --- vsearch-2.18.0/src/Makefile.am 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/Makefile.am 2022-01-18 17:26:39.000000000 +0000 @@ -1,16 +1,16 @@ bin_PROGRAMS = $(top_builddir)/bin/vsearch if TARGET_PPC -AM_CXXFLAGS=-Wall -Wsign-compare -O3 -g -mcpu=powerpc64le -maltivec +AM_CFLAGS=-Wall -Wsign-compare -mcpu=powerpc64le -maltivec else if TARGET_AARCH64 -AM_CXXFLAGS=-Wall -Wsign-compare -O3 -g -march=armv8-a+simd -mtune=generic +AM_CFLAGS=-Wall -Wsign-compare -march=armv8-a+simd -mtune=generic else -AM_CXXFLAGS=-Wall -Wsign-compare -O3 -g -march=x86-64 -mtune=generic +AM_CFLAGS=-Wall -Wsign-compare -march=x86-64 -mtune=generic endif endif -AM_CFLAGS=$(AM_CXXFLAGS) +AM_CXXFLAGS=$(AM_CFLAGS) -std=c++11 export MACOSX_DEPLOYMENT_TARGET=10.9 @@ -64,6 +64,7 @@ sortbylength.h \ sortbysize.h \ subsample.h \ +tax.h \ udb.h \ unique.h \ userfields.h \ @@ -158,6 +159,7 @@ sortbylength.cc \ sortbysize.cc \ subsample.cc \ +tax.cc \ 
udb.cc \ unique.cc \ userfields.cc \ diff -Nru vsearch-2.18.0/src/mask.cc vsearch-2.21.1/src/mask.cc --- vsearch-2.18.0/src/mask.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/mask.cc 2022-01-18 17:26:39.000000000 +0000 @@ -254,6 +254,9 @@ void maskfasta() { + if (!opt_output) + fatal("Output file for masking must be specified with --output"); + FILE * fp_output = fopen_output(opt_output); if (!fp_output) { @@ -293,6 +296,9 @@ FILE * fp_fastaout = nullptr; FILE * fp_fastqout = nullptr; + if ((!opt_fastaout) && (!opt_fastqout)) + fatal("Specify output files for masking with --fastaout and/or --fastqout"); + if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); diff -Nru vsearch-2.18.0/src/mergepairs.cc vsearch-2.21.1/src/mergepairs.cc --- vsearch-2.18.0/src/mergepairs.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/mergepairs.cc 2022-01-18 17:26:39.000000000 +0000 @@ -1125,6 +1125,10 @@ if (r < chunk_size) { finished_reading = true; + if (pairs_written >= pairs_read) + { + finished_all = true; + } } xpthread_cond_broadcast(&cond_chunks); } diff -Nru vsearch-2.18.0/src/rerep.cc vsearch-2.21.1/src/rerep.cc --- vsearch-2.18.0/src/rerep.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/rerep.cc 2022-01-18 17:26:39.000000000 +0000 @@ -63,6 +63,9 @@ void rereplicate() { + if (!opt_output) + fatal("FASTA output file for rereplicate must be specified with --output"); + opt_xsize = true; FILE * fp_output = nullptr; diff -Nru vsearch-2.18.0/src/results.cc vsearch-2.21.1/src/results.cc --- vsearch-2.18.0/src/results.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/results.cc 2022-01-18 17:26:39.000000000 +0000 @@ -114,6 +114,65 @@ } +void results_show_qsegout_one(FILE * fp, + struct hit * hp, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc) +{ + if (hp) + { + char * qseg = (hp->strand ? 
rc : qsequence) + hp->trim_q_left; + int qseglen = qseqlen + - hp->trim_q_left - hp->trim_q_right; + + fasta_print_general(fp, + nullptr, + qseg, + qseglen, + query_head, + strlen(query_head), + 0, + 0, + -1.0, + -1, + -1, + nullptr, + 0.0); + } +} + +void results_show_tsegout_one(FILE * fp, + struct hit * hp, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc) +{ + if (hp) + { + char * tseg = db_getsequence(hp->target) + hp->trim_t_left; + int tseglen = db_getsequencelen(hp->target) + - hp->trim_t_left - hp->trim_t_right; + + fasta_print_general(fp, + nullptr, + tseg, + tseglen, + db_getheader(hp->target), + db_getheaderlen(hp->target), + 0, + 0, + -1.0, + -1, + -1, + nullptr, + 0.0); + } +} + + void results_show_blast6out_one(FILE * fp, struct hit * hp, char * query_head, @@ -455,6 +514,82 @@ fprintf(fp, "\n"); } +void results_show_lcaout(FILE * fp, + struct hit * hits, + int hitcount, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc) +{ + /* Output last common ancestor (LCA) of the hits, + in a similar way to the Sintax command */ + + int first_level_start[tax_levels]; + int first_level_len[tax_levels]; + int level_match[tax_levels]; + char * first_h = nullptr; + + fprintf(fp, "%s\t", query_head); + + if (hitcount > 0) + { + for (int t = 0; t < hitcount; t++) + { + int seqno = hits[t].target; + if (t == 0) + { + tax_split(seqno, first_level_start, first_level_len); + first_h = db_getheader(seqno); + for (int j = 0; j < tax_levels; j++) + { + level_match[j] = 1; + } + } + else + { + int level_start[tax_levels]; + int level_len[tax_levels]; + tax_split(seqno, level_start, level_len); + char * h = db_getheader(seqno); + for (int j = 0; j < tax_levels; j++) + { + /* For each taxonomic level */ + if ((level_len[j] == first_level_len[j]) && + (strncmp(first_h + first_level_start[j], + h + level_start[j], + level_len[j]) == 0)) + { + level_match[j]++; + } + } + } + } + + bool comma = false; + for (int j = 0; j < tax_levels; 
j++) + { + if (1.0 * level_match[j] / hitcount < opt_lca_cutoff) + { + break; + } + + if (first_level_len[j] > 0) + { + fprintf(fp, + "%s%c:%.*s", + (comma ? "," : ""), + tax_letters[j], + first_level_len[j], + first_h + first_level_start[j]); + comma = true; + } + } + } + + fprintf(fp, "\n"); +} + void results_show_alnout(FILE * fp, struct hit * hits, int hitcount, diff -Nru vsearch-2.18.0/src/results.h vsearch-2.21.1/src/results.h --- vsearch-2.18.0/src/results.h 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/results.h 2022-01-18 17:26:39.000000000 +0000 @@ -66,6 +66,14 @@ int64_t qseqlen, char * rc); +void results_show_lcaout(FILE * fp, + struct hit * hits, + int hitcount, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc); + void results_show_blast6out_one(FILE * fp, struct hit * hp, char * query_head, @@ -95,6 +103,20 @@ int64_t qseqlen, char * rc); +void results_show_qsegout_one(FILE * fp, + struct hit * hp, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc); + +void results_show_tsegout_one(FILE * fp, + struct hit * hp, + char * query_head, + char * qsequence, + int64_t qseqlen, + char * rc); + void results_show_samheader(FILE * fp, char * cmdline, char * dbname); diff -Nru vsearch-2.18.0/src/search.cc vsearch-2.21.1/src/search.cc --- vsearch-2.18.0/src/search.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/search.cc 2022-01-18 17:26:39.000000000 +0000 @@ -91,6 +91,9 @@ static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; +static FILE * fp_lcaout = nullptr; +static FILE * fp_qsegout = nullptr; +static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; @@ -119,6 +122,17 @@ qsequence_rc); } + if (fp_lcaout) + { + results_show_lcaout(fp_lcaout, + hits, + toreport, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + if (fp_samout) { results_show_samout(fp_samout, @@ -160,6 
+174,26 @@ qsequence_rc); } + if (fp_qsegout) + { + results_show_qsegout_one(fp_qsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + + if (fp_tsegout) + { + results_show_tsegout_one(fp_tsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + if (fp_uc) { if ((t==0) || opt_uc_allhits) @@ -535,6 +569,15 @@ fprintf(fp_alnout, "%s\n", progheader); } + if (opt_lcaout) + { + fp_lcaout = fopen_output(opt_lcaout); + if (! fp_lcaout) + { + fatal("Unable to open lca output file for writing"); + } + } + if (opt_samout) { fp_samout = fopen_output(opt_samout); @@ -580,6 +623,24 @@ } } + if (opt_qsegout) + { + fp_qsegout = fopen_output(opt_qsegout); + if (! fp_qsegout) + { + fatal("Unable to open qsegout output file for writing"); + } + } + + if (opt_tsegout) + { + fp_tsegout = fopen_output(opt_tsegout); + if (! fp_tsegout) + { + fatal("Unable to open tsegout output file for writing"); + } + } + if (opt_matched) { fp_matched = fopen_output(opt_matched); @@ -689,6 +750,10 @@ dbindex_free(); db_free(); + if (opt_lcaout) + { + fclose(fp_lcaout); + } if (opt_matched) { fclose(fp_matched); @@ -701,6 +766,14 @@ { fclose(fp_fastapairs); } + if (opt_qsegout) + { + fclose(fp_qsegout); + } + if (opt_tsegout) + { + fclose(fp_tsegout); + } if (fp_uc) { fclose(fp_uc); diff -Nru vsearch-2.18.0/src/searchexact.cc vsearch-2.21.1/src/searchexact.cc --- vsearch-2.18.0/src/searchexact.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/searchexact.cc 2022-01-18 17:26:39.000000000 +0000 @@ -89,6 +89,8 @@ static FILE * fp_otutabout = nullptr; static FILE * fp_mothur_shared_out = nullptr; static FILE * fp_biomout = nullptr; +static FILE * fp_qsegout = nullptr; +static FILE * fp_tsegout = nullptr; static int count_matched = 0; static int count_notmatched = 0; @@ -234,6 +236,26 @@ qsequence_rc); } + if (fp_qsegout) + { + results_show_qsegout_one(fp_qsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + + if (fp_tsegout) + { + 
results_show_tsegout_one(fp_tsegout, + hp, + query_head, + qsequence, + qseqlen, + qsequence_rc); + } + if (fp_uc) { if ((t==0) || opt_uc_allhits) @@ -622,6 +644,24 @@ } } + if (opt_qsegout) + { + fp_qsegout = fopen_output(opt_qsegout); + if (! fp_qsegout) + { + fatal("Unable to open qsegout output file for writing"); + } + } + + if (opt_tsegout) + { + fp_tsegout = fopen_output(opt_tsegout); + if (! fp_tsegout) + { + fatal("Unable to open tsegout output file for writing"); + } + } + if (opt_matched) { fp_matched = fopen_output(opt_matched); @@ -740,6 +780,14 @@ { fclose(fp_fastapairs); } + if (opt_qsegout) + { + fclose(fp_qsegout); + } + if (opt_tsegout) + { + fclose(fp_tsegout); + } if (fp_uc) { fclose(fp_uc); diff -Nru vsearch-2.18.0/src/shuffle.cc vsearch-2.21.1/src/shuffle.cc --- vsearch-2.18.0/src/shuffle.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/shuffle.cc 2022-01-18 17:26:39.000000000 +0000 @@ -62,6 +62,9 @@ void shuffle() { + if (!opt_output) + fatal("Output file for shuffling must be specified with --output"); + FILE * fp_output = fopen_output(opt_output); if (!fp_output) { diff -Nru vsearch-2.18.0/src/sintax.cc vsearch-2.21.1/src/sintax.cc --- vsearch-2.18.0/src/sintax.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/sintax.cc 2022-01-18 17:26:39.000000000 +0000 @@ -85,8 +85,6 @@ static pthread_attr_t attr; static fastx_handle query_fastx_h; -const int tax_levels = 8; -const char * tax_letters = "dkpcofgs"; const int subset_size = 32; const int bootstrap_count = 100; @@ -97,128 +95,6 @@ static int queries = 0; static int classified = 0; -bool sintax_parse_tax(const char * header, - int header_length, - int * tax_start, - int * tax_end) -{ - /* - Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$) - */ - - if (! 
header) - { - return false; - } - - const char * attribute = "tax="; - - int hlen = header_length; - int alen = strlen(attribute); - - int i = 0; - - while (i < hlen - alen) - { - char * r = (char *) strstr(header + i, attribute); - - /* no match */ - if (r == nullptr) - { - break; - } - - i = r - header; - - /* check for ';' in front */ - if ((i > 0) && (header[i-1] != ';')) - { - i += alen + 1; - continue; - } - - * tax_start = i; - - /* find end (semicolon or end of header) */ - const char * s = strchr(header+i+alen, ';'); - if (s == nullptr) - { - * tax_end = hlen; - } - else - { - * tax_end = s - header; - } - - return true; - } - return false; -} - -void sintax_split(int seqno, int * level_start, int * level_len) -{ - /* Parse taxonomy string into the following parts - d domain - k kingdom - p phylum - c class - o order - f family - g genus - s species - */ - - for (int i = 0; i < tax_levels; i++) - { - level_start[i] = 0; - level_len[i] = 0; - } - - int tax_start, tax_end; - char * h = db_getheader(seqno); - int hlen = db_getheaderlen(seqno); - if (sintax_parse_tax(h, hlen, & tax_start, & tax_end)) - { - int t = tax_start + 4; - - while (t < tax_end) - { - /* Is the next char a recogized tax level letter? */ - const char * r = strchr(tax_letters, tolower(h[t])); - if (r) - { - int level = r - tax_letters; - - /* Is there a colon after it? 
*/ - if (h[t + 1] == ':') - { - level_start[level] = t + 2; - - char * z = strchr(h + t + 2, ','); - if (z) - { - level_len[level] = z - h - t - 2; - } - else - { - level_len[level] = tax_end - t - 2; - } - } - } - - /* skip past next comma */ - char * x = strchr(h + t, ','); - if (x) - { - t = x - h + 1; - } - else - { - t = tax_end; - } - } - } -} void sintax_analyse(char * query_head, int strand, @@ -236,7 +112,7 @@ { char * best_h = db_getheader(best_seqno); - sintax_split(best_seqno, best_level_start, best_level_len); + tax_split(best_seqno, best_level_start, best_level_len); for (int & j : level_match) @@ -250,7 +126,7 @@ int level_start[tax_levels]; int level_len[tax_levels]; - sintax_split(all_seqno[i], level_start, level_len); + tax_split(all_seqno[i], level_start, level_len); char * h = db_getheader(all_seqno[i]); diff -Nru vsearch-2.18.0/src/sortbylength.cc vsearch-2.21.1/src/sortbylength.cc --- vsearch-2.18.0/src/sortbylength.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/sortbylength.cc 2022-01-18 17:26:39.000000000 +0000 @@ -118,6 +118,9 @@ void sortbylength() { + if (!opt_output) + fatal("FASTA output file for sortbylength must be specified with --output"); + FILE * fp_output = fopen_output(opt_output); if (!fp_output) { diff -Nru vsearch-2.18.0/src/sortbysize.cc vsearch-2.21.1/src/sortbysize.cc --- vsearch-2.18.0/src/sortbysize.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/sortbysize.cc 2022-01-18 17:26:39.000000000 +0000 @@ -108,6 +108,9 @@ void sortbysize() { + if (!opt_output) + fatal("FASTA output file for sortbysize must be specified with --output"); + FILE * fp_output = fopen_output(opt_output); if (!fp_output) { diff -Nru vsearch-2.18.0/src/tax.cc vsearch-2.21.1/src/tax.cc --- vsearch-2.18.0/src/tax.cc 1970-01-01 00:00:00.000000000 +0000 +++ vsearch-2.21.1/src/tax.cc 2022-01-18 17:26:39.000000000 +0000 @@ -0,0 +1,186 @@ +/* + + VSEARCH: a versatile open source tool for metagenomics + + Copyright (C) 2014-2021, 
Torbjorn Rognes, Frederic Mahe and Tomas Flouri + All rights reserved. + + Contact: Torbjorn Rognes , + Department of Informatics, University of Oslo, + PO Box 1080 Blindern, NO-0316 Oslo, Norway + + This software is dual-licensed and available under a choice + of one of two licenses, either under the terms of the GNU + General Public License version 3 or the BSD 2-Clause License. + + + GNU General Public License version 3 + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + + The BSD 2-Clause License + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "vsearch.h" + +const char * tax_letters = "dkpcofgs"; + +bool tax_parse(const char * header, + int header_length, + int * tax_start, + int * tax_end) +{ + /* + Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$) + */ + + if (! header) + { + return false; + } + + const char * attribute = "tax="; + + int hlen = header_length; + int alen = strlen(attribute); + + int i = 0; + + while (i < hlen - alen) + { + char * r = (char *) strstr(header + i, attribute); + + /* no match */ + if (r == nullptr) + { + break; + } + + i = r - header; + + /* check for ';' in front */ + if ((i > 0) && (header[i-1] != ';')) + { + i += alen + 1; + continue; + } + + * tax_start = i; + + /* find end (semicolon or end of header) */ + const char * s = strchr(header+i+alen, ';'); + if (s == nullptr) + { + * tax_end = hlen; + } + else + { + * tax_end = s - header; + } + + return true; + } + return false; +} + +void tax_split(int seqno, int * level_start, int * level_len) +{ + /* Parse taxonomy string into the following parts + d domain + k kingdom + p phylum + c class + o order + f family + g genus + s species + */ + + for (int i = 0; i < tax_levels; i++) + { + level_start[i] = 0; + level_len[i] = 0; + } + + int tax_start, tax_end; + char * h = db_getheader(seqno); + int hlen = db_getheaderlen(seqno); + if (tax_parse(h, hlen, & tax_start, & tax_end)) + { + int t = tax_start + 4; + + while (t < tax_end) + { + /* Is the next char a recogized tax 
level letter? */ + const char * r = strchr(tax_letters, tolower(h[t])); + if (r) + { + int level = r - tax_letters; + + /* Is there a colon after it? */ + if (h[t + 1] == ':') + { + level_start[level] = t + 2; + + char * z = strchr(h + t + 2, ','); + if (z) + { + level_len[level] = z - h - t - 2; + } + else + { + level_len[level] = tax_end - t - 2; + } + } + } + + /* skip past next comma */ + char * x = strchr(h + t, ','); + if (x) + { + t = x - h + 1; + } + else + { + t = tax_end; + } + } + } +} diff -Nru vsearch-2.18.0/src/tax.h vsearch-2.21.1/src/tax.h --- vsearch-2.18.0/src/tax.h 1970-01-01 00:00:00.000000000 +0000 +++ vsearch-2.21.1/src/tax.h 2022-01-18 17:26:39.000000000 +0000 @@ -0,0 +1,69 @@ +/* + + VSEARCH: a versatile open source tool for metagenomics + + Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + All rights reserved. + + Contact: Torbjorn Rognes , + Department of Informatics, University of Oslo, + PO Box 1080 Blindern, NO-0316 Oslo, Norway + + This software is dual-licensed and available under a choice + of one of two licenses, either under the terms of the GNU + General Public License version 3 or the BSD 2-Clause License. + + + GNU General Public License version 3 + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ + + The BSD 2-Clause License + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +*/ + +const int tax_levels = 8; +extern const char * tax_letters; + +bool tax_parse(const char * header, + int header_length, + int * tax_start, + int * tax_end); + +void tax_split(int seqno, int * level_start, int * level_len); diff -Nru vsearch-2.18.0/src/udb.cc vsearch-2.21.1/src/udb.cc --- vsearch-2.18.0/src/udb.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/udb.cc 2022-01-18 17:26:39.000000000 +0000 @@ -151,13 +151,16 @@ return nbyte; } -bool udb_detect_isudb(const char * filename) +auto udb_detect_isudb(const char * filename) -> bool { /* Detect whether the given filename seems to refer to an UDB file. It must be an uncompressed regular file, not a pipe. 
*/ + constexpr uint32_t udb_file_signature {0x55444246}; + constexpr uint64_t expected_n_bytes {sizeof(uint32_t)}; + xstat_t fs; if (xstat(filename, & fs)) @@ -179,10 +182,10 @@ } unsigned int magic = 0; - uint64_t bytesread = read(fd, & magic, 4); + uint64_t bytesread = read(fd, & magic, expected_n_bytes); close(fd); - if ((bytesread == 4) && (magic == 0x55444246)) + if ((bytesread == expected_n_bytes) && (magic == udb_file_signature)) { return true; } @@ -602,6 +605,9 @@ void udb_fasta() { + if (!opt_output) + fatal("FASTA output file must be specified with --output"); + /* open FASTA file for writing */ FILE * fp_output = fopen_output(opt_output); @@ -864,6 +870,9 @@ void udb_make() { + if (!opt_output) + fatal("UDB output file must be specified with --output"); + int fd_output = 0; fd_output = xopen_write(opt_output); diff -Nru vsearch-2.18.0/src/vsearch.cc vsearch-2.21.1/src/vsearch.cc --- vsearch-2.18.0/src/vsearch.cc 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/vsearch.cc 2022-01-18 17:26:39.000000000 +0000 @@ -73,6 +73,7 @@ bool opt_gzip_decompress; bool opt_label_substr_match; bool opt_no_progress; +bool opt_fastq_qout_max; bool opt_quiet; bool opt_relabel_keep; bool opt_relabel_md5; @@ -134,6 +135,7 @@ char * opt_fastx_mask; char * opt_fastx_revcomp; char * opt_fastx_subsample; +char * opt_fastx_uniques; char * opt_join_padgap; char * opt_join_padgapq; char * opt_label; @@ -142,6 +144,7 @@ char * opt_label_word; char * opt_label_words; char * opt_label_field; +char * opt_lcaout; char * opt_log; char * opt_makeudb_usearch; char * opt_maskfasta; @@ -156,10 +159,12 @@ char * opt_output; char * opt_pattern; char * opt_profile; +char * opt_qsegout; char * opt_relabel; char * opt_rereplicate; char * opt_reverse; char * opt_samout; +char * opt_sample; char * opt_search_exact; char * opt_sff_convert; char * opt_shuffle; @@ -167,6 +172,7 @@ char * opt_sortbylength; char * opt_sortbysize; char * opt_tabbedout; +char * opt_tsegout; char * 
opt_udb2fasta; char * opt_udbinfo; char * opt_udbstats; @@ -187,6 +193,7 @@ double opt_fastq_maxee_rate; double opt_fastq_truncee; double opt_id; +double opt_lca_cutoff; double opt_max_unmasked_pct; double opt_maxid; double opt_maxqt; @@ -803,6 +810,7 @@ opt_fastq_qmaxout = 41; opt_fastq_qmin = 0; opt_fastq_qminout = 0; + opt_fastq_qout_max = false; opt_fastq_stats = nullptr; opt_fastq_stripleft = 0; opt_fastq_stripright = 0; @@ -855,6 +863,8 @@ opt_length_cutoffs_increment = 50; opt_length_cutoffs_longest = INT_MAX; opt_length_cutoffs_shortest = 50; + opt_lca_cutoff = 1.0; + opt_lcaout = nullptr; opt_log = nullptr; opt_makeudb_usearch = nullptr; opt_maskfasta = nullptr; @@ -904,6 +914,7 @@ opt_pattern = nullptr; opt_profile = nullptr; opt_qmask = MASK_DUST; + opt_qsegout = nullptr; opt_query_cov = 0.0; opt_quiet = false; opt_randseed = 0; @@ -918,6 +929,7 @@ opt_rowlen = 64; opt_samheader = false; opt_samout = nullptr; + opt_sample = nullptr; opt_sample_pct = 0; opt_sample_size = 0; opt_search_exact = nullptr; @@ -942,6 +954,7 @@ opt_threads = 0; opt_top_hits_only = 0; opt_topn = LONG_MAX; + opt_tsegout = nullptr; opt_udb2fasta = nullptr; opt_udbinfo = nullptr; opt_udbstats = nullptr; @@ -1039,6 +1052,7 @@ option_fastq_qmaxout, option_fastq_qmin, option_fastq_qminout, + option_fastq_qout_max, option_fastq_stats, option_fastq_stripleft, option_fastq_stripright, @@ -1060,6 +1074,7 @@ option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, + option_fastx_uniques, option_fulldp, option_gapext, option_gapopen, @@ -1081,6 +1096,8 @@ option_label_word, option_label_words, option_labels, + option_lca_cutoff, + option_lcaout, option_leftjust, option_length_cutoffs, option_log, @@ -1133,6 +1150,7 @@ option_pattern, option_profile, option_qmask, + option_qsegout, option_query_cov, option_quiet, option_randseed, @@ -1147,6 +1165,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_sample_pct, option_sample_size, option_search_exact, @@ -1171,6 
+1190,7 @@ option_threads, option_top_hits_only, option_topn, + option_tsegout, option_uc, option_uc_allhits, option_uchime2_denovo, @@ -1270,6 +1290,7 @@ {"fastq_qmaxout", required_argument, nullptr, 0 }, {"fastq_qmin", required_argument, nullptr, 0 }, {"fastq_qminout", required_argument, nullptr, 0 }, + {"fastq_qout_max", no_argument, nullptr, 0 }, {"fastq_stats", required_argument, nullptr, 0 }, {"fastq_stripleft", required_argument, nullptr, 0 }, {"fastq_stripright", required_argument, nullptr, 0 }, @@ -1291,6 +1312,7 @@ {"fastx_mask", required_argument, nullptr, 0 }, {"fastx_revcomp", required_argument, nullptr, 0 }, {"fastx_subsample", required_argument, nullptr, 0 }, + {"fastx_uniques", required_argument, nullptr, 0 }, {"fulldp", no_argument, nullptr, 0 }, {"gapext", required_argument, nullptr, 0 }, {"gapopen", required_argument, nullptr, 0 }, @@ -1312,6 +1334,8 @@ {"label_word", required_argument, nullptr, 0 }, {"label_words", required_argument, nullptr, 0 }, {"labels", required_argument, nullptr, 0 }, + {"lca_cutoff", required_argument, nullptr, 0 }, + {"lcaout", required_argument, nullptr, 0 }, {"leftjust", no_argument, nullptr, 0 }, {"length_cutoffs", required_argument, nullptr, 0 }, {"log", required_argument, nullptr, 0 }, @@ -1364,6 +1388,7 @@ {"pattern", required_argument, nullptr, 0 }, {"profile", required_argument, nullptr, 0 }, {"qmask", required_argument, nullptr, 0 }, + {"qsegout", required_argument, nullptr, 0 }, {"query_cov", required_argument, nullptr, 0 }, {"quiet", no_argument, nullptr, 0 }, {"randseed", required_argument, nullptr, 0 }, @@ -1378,6 +1403,7 @@ {"rowlen", required_argument, nullptr, 0 }, {"samheader", no_argument, nullptr, 0 }, {"samout", required_argument, nullptr, 0 }, + {"sample", required_argument, nullptr, 0 }, {"sample_pct", required_argument, nullptr, 0 }, {"sample_size", required_argument, nullptr, 0 }, {"search_exact", required_argument, nullptr, 0 }, @@ -1402,6 +1428,7 @@ {"threads", required_argument, nullptr, 0 }, 
{"top_hits_only", no_argument, nullptr, 0 }, {"topn", required_argument, nullptr, 0 }, + {"tsegout", required_argument, nullptr, 0 }, {"uc", required_argument, nullptr, 0 }, {"uc_allhits", no_argument, nullptr, 0 }, {"uchime2_denovo", required_argument, nullptr, 0 }, @@ -2420,6 +2447,34 @@ opt_fasta2fastq = optarg; break; + case option_lcaout: + opt_lcaout = optarg; + break; + + case option_lca_cutoff: + opt_lca_cutoff = args_getdouble(optarg); + break; + + case option_fastx_uniques: + opt_fastx_uniques = optarg; + break; + + case option_fastq_qout_max: + opt_fastq_qout_max = true; + break; + + case option_sample: + opt_sample = optarg; + break; + + case option_qsegout: + opt_qsegout = optarg; + break; + + case option_tsegout: + opt_tsegout = optarg; + break; + default: fatal("Internal error in option parsing"); } @@ -2466,6 +2521,7 @@ option_fastx_mask, option_fastx_revcomp, option_fastx_subsample, + option_fastx_uniques, option_h, option_help, option_makeudb_usearch, @@ -2497,7 +2553,7 @@ The first line is the command and the lines below are the valid options. 
*/ - const int valid_options[][92] = + const int valid_options[][96] = { { option_allpairs_global, @@ -2518,6 +2574,7 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, option_leftjust, option_log, option_match, @@ -2550,6 +2607,7 @@ option_output_no_hits, option_pattern, option_qmask, + option_qsegout, option_query_cov, option_quiet, option_relabel, @@ -2561,6 +2619,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -2569,6 +2628,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_userfields, option_userout, @@ -2603,6 +2663,7 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, option_leftjust, option_log, option_match, @@ -2639,6 +2700,7 @@ option_pattern, option_profile, option_qmask, + option_qsegout, option_query_cov, option_quiet, option_relabel, @@ -2650,6 +2712,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -2660,6 +2723,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_userfields, option_userout, @@ -2694,6 +2758,7 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, option_leftjust, option_log, option_match, @@ -2730,6 +2795,7 @@ option_pattern, option_profile, option_qmask, + option_qsegout, option_query_cov, option_quiet, option_relabel, @@ -2741,6 +2807,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -2751,6 +2818,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_userfields, option_userout, @@ -2785,6 +2853,7 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, option_leftjust, option_log, option_match, @@ -2821,6 +2890,7 @@ option_pattern, option_profile, option_qmask, + option_qsegout, option_query_cov, option_quiet, 
option_relabel, @@ -2832,6 +2902,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -2842,6 +2913,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_userfields, option_userout, @@ -2877,6 +2949,7 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, option_leftjust, option_log, option_match, @@ -2911,6 +2984,7 @@ option_notrunclabels, option_otutabout, option_output_no_hits, + option_qsegout, option_pattern, option_profile, option_qmask, @@ -2925,6 +2999,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -2935,6 +3010,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_unoise_alpha, option_userfields, @@ -2955,6 +3031,7 @@ option_fastaout_discarded_rev, option_fastaout_rev, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_notrunclabels, @@ -2964,6 +3041,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_xee, @@ -2988,6 +3066,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_strand, @@ -3002,6 +3081,7 @@ option_bzip2_decompress, option_fasta_width, option_gzip_decompress, + option_label_suffix, option_log, option_maxseqlength, option_maxuniquesize, @@ -3016,6 +3096,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_strand, @@ -3030,6 +3111,7 @@ option_bzip2_decompress, option_fasta_width, option_gzip_decompress, + option_label_suffix, option_log, option_maxseqlength, option_maxuniquesize, @@ -3044,6 +3126,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_strand, @@ -3060,6 +3143,7 @@ option_fastq_qmaxout, 
option_fastqout, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_quiet, @@ -3068,6 +3152,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3095,6 +3180,7 @@ option_fastq_qminout, option_fastqout, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_quiet, @@ -3103,6 +3189,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3166,6 +3253,7 @@ option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, + option_label_suffix, option_log, option_maxsize, option_minsize, @@ -3177,6 +3265,7 @@ option_relabel_self, option_relabel_sha1, option_reverse, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3195,6 +3284,7 @@ option_gzip_decompress, option_join_padgap, option_join_padgapq, + option_label_suffix, option_log, option_no_progress, option_quiet, @@ -3251,6 +3341,7 @@ option_relabel_self, option_relabel_sha1, option_reverse, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3299,6 +3390,7 @@ option_fastqout_discarded_rev, option_fastqout_rev, option_gzip_decompress, + option_label_suffix, option_log, option_maxsize, option_minsize, @@ -3311,6 +3403,7 @@ option_relabel_self, option_relabel_sha1, option_reverse, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3329,6 +3422,7 @@ option_gzip_decompress, option_label, option_label_substr_match, + option_label_suffix, option_log, option_no_progress, option_notmatched, @@ -3340,6 +3434,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3359,6 +3454,7 @@ option_label, option_label_field, option_label_substr_match, + option_label_suffix, option_label_word, option_label_words, option_labels, @@ -3373,6 +3469,7 @@ option_relabel_md5, 
option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3391,6 +3488,7 @@ option_gzip_decompress, option_label, option_label_substr_match, + option_label_suffix, option_log, option_no_progress, option_notmatched, @@ -3402,6 +3500,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_subseq_end, @@ -3421,6 +3520,7 @@ option_fastqout, option_gzip_decompress, option_hardmask, + option_label_suffix, option_log, option_max_unmasked_pct, option_min_unmasked_pct, @@ -3433,6 +3533,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3459,6 +3560,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3477,6 +3579,7 @@ option_fastqout, option_fastqout_discarded, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_notrunclabels, @@ -3487,6 +3590,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sample_pct, option_sample_size, option_sizein, @@ -3496,6 +3600,45 @@ option_xsize, -1 }, + { option_fastx_uniques, + option_bzip2_decompress, + option_fasta_width, + option_fastaout, + option_fastq_ascii, + option_fastq_asciiout, + option_fastq_qmax, + option_fastq_qmaxout, + option_fastq_qmin, + option_fastq_qminout, + option_fastq_qout_max, + option_fastqout, + option_gzip_decompress, + option_label_suffix, + option_log, + option_maxseqlength, + option_maxuniquesize, + option_minseqlength, + option_minuniquesize, + option_no_progress, + option_notrunclabels, + option_quiet, + option_relabel, + option_relabel_keep, + option_relabel_md5, + option_relabel_self, + option_relabel_sha1, + option_sample, + option_sizein, + option_sizeout, + option_strand, + option_tabbedout, + option_threads, + option_topn, + option_uc, + option_xee, + 
option_xsize, + -1 }, + { option_h, option_log, option_quiet, @@ -3528,6 +3671,7 @@ option_fasta_width, option_gzip_decompress, option_hardmask, + option_label_suffix, option_log, option_max_unmasked_pct, option_maxseqlength, @@ -3543,6 +3687,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3558,6 +3703,7 @@ option_fastaout, option_fastqout, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_notmatched, @@ -3569,6 +3715,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_tabbedout, @@ -3582,6 +3729,7 @@ option_bzip2_decompress, option_fasta_width, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_notrunclabels, @@ -3592,6 +3740,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3612,6 +3761,9 @@ option_fastapairs, option_gzip_decompress, option_hardmask, + option_label_suffix, + option_lca_cutoff, + option_lcaout, option_log, option_match, option_matched, @@ -3635,6 +3787,7 @@ option_otutabout, option_output_no_hits, option_qmask, + option_qsegout, option_quiet, option_relabel, option_relabel_keep, @@ -3644,12 +3797,14 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_sizein, option_sizeout, option_strand, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_uc_allhits, option_userfields, @@ -3663,6 +3818,7 @@ option_fastq_qmaxout, option_fastq_qminout, option_fastqout, + option_label_suffix, option_log, option_no_progress, option_quiet, @@ -3671,6 +3827,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sff_clip, option_sizeout, option_threads, @@ -3683,6 +3840,7 @@ option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, + option_label_suffix, option_log, 
option_maxseqlength, option_minseqlength, @@ -3696,6 +3854,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3712,10 +3871,12 @@ option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, + option_label_suffix, option_log, option_no_progress, option_notrunclabels, option_quiet, + option_randseed, option_sintax_cutoff, option_strand, option_tabbedout, @@ -3730,6 +3891,7 @@ option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, + option_label_suffix, option_log, option_maxseqlength, option_minseqlength, @@ -3742,6 +3904,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3757,6 +3920,7 @@ option_fastq_qmax, option_fastq_qmin, option_gzip_decompress, + option_label_suffix, option_log, option_maxseqlength, option_maxsize, @@ -3771,6 +3935,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3790,6 +3955,7 @@ option_gapext, option_gapopen, option_hardmask, + option_label_suffix, option_log, option_match, option_mindiffs, @@ -3806,6 +3972,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3828,6 +3995,7 @@ option_gapext, option_gapopen, option_hardmask, + option_label_suffix, option_log, option_match, option_mindiffs, @@ -3844,6 +4012,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3866,6 +4035,7 @@ option_gapext, option_gapopen, option_hardmask, + option_label_suffix, option_log, option_match, option_mindiffs, @@ -3882,6 +4052,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3906,6 +4077,7 @@ option_gapext, option_gapopen, option_hardmask, + option_label_suffix, 
option_log, option_match, option_mindiffs, @@ -3922,6 +4094,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_self, option_selfid, option_sizein, @@ -3938,6 +4111,7 @@ { option_udb2fasta, option_fasta_width, + option_label_suffix, option_log, option_no_progress, option_output, @@ -3947,6 +4121,7 @@ option_relabel_md5, option_relabel_self, option_relabel_sha1, + option_sample, option_sizein, option_sizeout, option_threads, @@ -3989,6 +4164,9 @@ option_iddef, option_idprefix, option_idsuffix, + option_label_suffix, + option_lca_cutoff, + option_lcaout, option_leftjust, option_log, option_match, @@ -4023,6 +4201,7 @@ option_output_no_hits, option_pattern, option_qmask, + option_qsegout, option_query_cov, option_quiet, option_relabel, @@ -4034,6 +4213,7 @@ option_rowlen, option_samheader, option_samout, + option_sample, option_self, option_selfid, option_sizein, @@ -4043,6 +4223,7 @@ option_target_cov, option_threads, option_top_hits_only, + option_tsegout, option_uc, option_uc_allhits, option_userfields, @@ -4355,6 +4536,11 @@ fatal("The argument to sintax_cutoff must be in the range 0.0 to 1.0"); } + if ((opt_lca_cutoff <= 0.5) || (opt_lca_cutoff > 1.0)) + { + fatal("The argument to lca_cutoff must be larger than 0.5, but not larger than 1.0"); + } + if (opt_minuniquesize < 1) { fatal("The argument to minuniquesize must be at least 1"); @@ -4643,6 +4829,7 @@ " --derep_fulllength FILENAME dereplicate sequences in the given FASTA file\n" " --derep_id FILENAME dereplicate using both identifiers and sequences\n" " --derep_prefix FILENAME dereplicate sequences in file based on prefixes\n" + " --fastx_uniques FILENAME dereplicate sequences in the FASTA/FASTQ file\n" " --rereplicate FILENAME rereplicate sequences in the given FASTA file\n" " Parameters\n" " --maxuniquesize INT maximum abundance for output from dereplication\n" @@ -4650,13 +4837,21 @@ " --sizein propagate abundance annotation from input\n" " --strand plus|both 
dereplicate plus or both strands (plus)\n" " Output\n" - " --output FILENAME output FASTA file\n" + " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" + " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" + " --fastq_qmaxout INT maximum base quality value for FASTQ output (41)\n" + " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" + " --fastq_qminout INT minimum base quality value for FASTQ output (0)\n" + " --fastaout FILENAME output FASTA file (for fastx_uniques)\n" + " --fastqout FILENAME output FASTQ file (for fastx_uniques)\n" + " --output FILENAME output FASTA file (not for fastx_uniques)\n" " --relabel STRING relabel with this prefix string\n" " --relabel_keep keep the old label after the new when relabelling\n" " --relabel_md5 relabel with md5 digest of normalized sequence\n" " --relabel_self relabel with the sequence itself as label\n" " --relabel_sha1 relabel with sha1 digest of normalized sequence\n" " --sizeout write abundance annotation to output\n" + " --tabbedout FILENAME write cluster info to tsv file for fastx_uniques\n" " --topn INT output only n most abundant sequences after derep\n" " --uc FILENAME filename for UCLUST-like dereplication output\n" " --xsize strip abundance information in derep output\n" @@ -4820,6 +5015,7 @@ " --iddef INT id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n" " --idprefix INT reject if first n nucleotides do not match\n" " --idsuffix INT reject if last n nucleotides do not match\n" + " --lca_cutoff REAL fraction of matching hits required for LCA (1.0)\n" " --leftjust reject if terminal gaps at alignment left end\n" " --match INT score for match (2)\n" " --maxaccepts INT number of hits to accept and show per strand (1)\n" @@ -4860,6 +5056,7 @@ " --dbmatched FILENAME FASTA file for matching database sequences\n" " --dbnotmatched FILENAME FASTA file for non-matching database sequences\n" " --fastapairs FILENAME FASTA file with pairs of query and target\n" + " 
--lcaout FILENAME output LCA of matching sequences to file\n" " --matched FILENAME FASTA file for matching query sequences\n" " --mothur_shared_out FN filename for OTU table output in mothur format\n" " --notmatched FILENAME FASTA file for non-matching query sequences\n" @@ -5011,7 +5208,7 @@ (!opt_dbmatched) && (!opt_dbnotmatched) && (!opt_samout) && (!opt_otutabout) && (!opt_biomout) && (!opt_mothur_shared_out) && - (!opt_fastapairs)) + (!opt_fastapairs) && (!opt_lcaout)) { fatal("No output files specified"); } @@ -5039,7 +5236,7 @@ (!opt_dbmatched) && (!opt_dbnotmatched) && (!opt_samout) && (!opt_otutabout) && (!opt_biomout) && (!opt_mothur_shared_out) && - (!opt_fastapairs)) + (!opt_fastapairs) && (!opt_lcaout)) { fatal("No output files specified"); } @@ -5052,94 +5249,6 @@ search_exact(cmdline, progheader); } -void cmd_sortbysize() -{ - if (!opt_output) - { - fatal("FASTA output file for sortbysize must be specified with --output"); - } - - sortbysize(); -} - -void cmd_sortbylength() -{ - if (!opt_output) - { - fatal("FASTA output file for sortbylength must be specified with --output"); - } - - sortbylength(); -} - -void cmd_rereplicate() -{ - if (!opt_output) - { - fatal("FASTA output file for rereplicate must be specified with --output"); - } - - rereplicate(); -} - -void cmd_derep() -{ - if ((!opt_output) && (!opt_uc)) - { - fatal("Output file for dereplication must be specified with --output or --uc"); - } - - if (opt_derep_fulllength) - { - derep_fulllength(); - } - else if (opt_derep_id) - { - derep_id(); - } - else - { - if (opt_strand > 1) - { - fatal("Option '--strand both' not supported with --derep_prefix"); - } - else - { - derep_prefix(); - } - } -} - -void cmd_shuffle() -{ - if (!opt_output) - { - fatal("Output file for shuffling must be specified with --output"); - } - - shuffle(); -} - -void cmd_fastq_eestats() -{ - if (!opt_output) - { - fatal("Output file for fastq_eestats must be specified with --output"); - } - - fastq_eestats(); -} - -void 
cmd_fastq_eestats2() -{ - if (!opt_output) - { - fatal("Output file for fastq_eestats2 must be specified with --output"); - } - - fastq_eestats2(); -} - void cmd_subsample() { if ((!opt_fastaout) && (!opt_fastqout)) @@ -5155,44 +5264,6 @@ subsample(); } -void cmd_maskfasta() -{ - if (!opt_output) - { - fatal("Output file for masking must be specified with --output"); - } - - maskfasta(); -} - -void cmd_makeudb_usearch() -{ - if (!opt_output) - { - fatal("UDB output file must be specified with --output"); - } - udb_make(); -} - -void cmd_udb2fasta() -{ - if (!opt_output) - { - fatal("FASTA output file must be specified with --output"); - } - udb_fasta(); -} - -void cmd_fastx_mask() -{ - if ((!opt_fastaout) && (!opt_fastqout)) - { - fatal("Specify output files for masking with --fastaout and/or --fastqout"); - } - - fastx_mask(); -} - void cmd_none() { if (! opt_quiet) @@ -5206,7 +5277,6 @@ "vsearch --allpairs_global FILENAME --id 0.5 --alnout FILENAME\n" "vsearch --cluster_size FILENAME --id 0.97 --centroids FILENAME\n" "vsearch --cut FILENAME --cut_pattern G^AATT_C --fastaout FILENAME\n" - "vsearch --derep_fulllength FILENAME --output FILENAME\n" "vsearch --fastq_chars FILENAME\n" "vsearch --fastq_convert FILENAME --fastqout FILENAME --fastq_ascii 64\n" "vsearch --fastq_eestats FILENAME --output FILENAME\n" @@ -5218,6 +5288,7 @@ "vsearch --fastx_mask FILENAME --fastaout FILENAME\n" "vsearch --fastx_revcomp FILENAME --fastqout FILENAME\n" "vsearch --fastx_subsample FILENAME --fastaout FILENAME --sample_pct 1\n" + "vsearch --fastx_uniques FILENAME --output FILENAME\n" "vsearch --makeudb_usearch FILENAME --output FILENAME\n" "vsearch --search_exact FILENAME --db FILENAME --alnout FILENAME\n" "vsearch --sff_convert FILENAME --output FILENAME --sff_clip\n" @@ -5230,35 +5301,15 @@ "vsearch --usearch_global FILENAME --db FILENAME --id 0.97 --alnout FILENAME\n" "\n" "Other commands: cluster_fast, cluster_smallmem, cluster_unoise, cut,\n" - " derep_id, derep_prefix, 
fasta2fastq, fastq_filter,\n" - " fastq_join, fastx_getseqs, fastx_getsubseqs, maskfasta,\n" - " orient, rereplicate, uchime2_denovo, uchime3_denovo,\n" - " udb2fasta, udbinfo, udbstats, version\n" + " derep_id, derep_fulllength, derep_prefix, fasta2fastq,\n" + " fastq_filter, fastq_join, fastx_getseqs, fastx_getsubseqs,\n" + " maskfasta, orient, rereplicate, uchime2_denovo,\n" + " uchime3_denovo, udb2fasta, udbinfo, udbstats, version\n" "\n", progname); } } -void cmd_fastx_revcomp() -{ - if ((!opt_fastaout) && (!opt_fastqout)) - { - fatal("No output files specified"); - } - - fastx_revcomp(); -} - -void cmd_fastq_convert() -{ - if (! opt_fastqout) - { - fatal("No output file specified with --fastqout"); - } - - fastq_convert(); -} - void cmd_cluster() { if ((!opt_alnout) && (!opt_userout) && @@ -5464,19 +5515,27 @@ } else if (opt_sortbysize) { - cmd_sortbysize(); + sortbysize(); } else if (opt_sortbylength) { - cmd_sortbylength(); + sortbylength(); + } + else if (opt_derep_fulllength) + { + derep(opt_derep_fulllength, false); } - else if (opt_derep_fulllength || opt_derep_id || opt_derep_prefix) + else if (opt_derep_prefix) { - cmd_derep(); + derep_prefix(); + } + else if (opt_derep_id) + { + derep(opt_derep_id, true); } else if (opt_shuffle) { - cmd_shuffle(); + shuffle(); } else if (opt_fastx_subsample) { @@ -5484,7 +5543,7 @@ } else if (opt_maskfasta) { - cmd_maskfasta(); + maskfasta(); } else if (opt_cluster_smallmem || opt_cluster_fast || opt_cluster_size || opt_cluster_unoise) { @@ -5512,7 +5571,7 @@ } else if (opt_fastx_revcomp) { - cmd_fastx_revcomp(); + fastx_revcomp(); } else if (opt_search_exact) { @@ -5520,11 +5579,11 @@ } else if (opt_fastx_mask) { - cmd_fastx_mask(); + fastx_mask(); } else if (opt_fastq_convert) { - cmd_fastq_convert(); + fastq_convert(); } else if (opt_fastq_mergepairs) { @@ -5532,11 +5591,11 @@ } else if (opt_fastq_eestats) { - cmd_fastq_eestats(); + fastq_eestats(); } else if (opt_fastq_eestats2) { - cmd_fastq_eestats2(); + 
fastq_eestats2(); } else if (opt_fastq_join) { @@ -5544,7 +5603,7 @@ } else if (opt_rereplicate) { - cmd_rereplicate(); + rereplicate(); } else if (opt_version) { @@ -5552,11 +5611,11 @@ } else if (opt_makeudb_usearch) { - cmd_makeudb_usearch(); + udb_make(); } else if (opt_udb2fasta) { - cmd_udb2fasta(); + udb_fasta(); } else if (opt_udbinfo) { @@ -5598,6 +5657,10 @@ { fasta2fastq(); } + else if (opt_fastx_uniques) + { + derep(opt_fastx_uniques, false); + } else { cmd_none(); diff -Nru vsearch-2.18.0/src/vsearch.h vsearch-2.21.1/src/vsearch.h --- vsearch-2.18.0/src/vsearch.h 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/src/vsearch.h 2022-01-18 17:26:39.000000000 +0000 @@ -250,6 +250,7 @@ #include "otutable.h" #include "udb.h" #include "kmerhash.h" +#include "tax.h" #include "sintax.h" #include "fastqjoin.h" #include "sffconvert.h" @@ -271,6 +272,7 @@ extern bool opt_gzip_decompress; extern bool opt_label_substr_match; extern bool opt_no_progress; +extern bool opt_fastq_qout_max; extern bool opt_quiet; extern bool opt_relabel_keep; extern bool opt_relabel_md5; @@ -332,6 +334,7 @@ extern char * opt_fastx_mask; extern char * opt_fastx_revcomp; extern char * opt_fastx_subsample; +extern char * opt_fastx_uniques; extern char * opt_join_padgap; extern char * opt_join_padgapq; extern char * opt_label; @@ -340,6 +343,7 @@ extern char * opt_label_word; extern char * opt_label_words; extern char * opt_label_field; +extern char * opt_lcaout; extern char * opt_log; extern char * opt_makeudb_usearch; extern char * opt_maskfasta; @@ -354,10 +358,12 @@ extern char * opt_output; extern char * opt_pattern; extern char * opt_profile; +extern char * opt_qsegout; extern char * opt_relabel; extern char * opt_rereplicate; extern char * opt_reverse; extern char * opt_samout; +extern char * opt_sample; extern char * opt_search_exact; extern char * opt_sff_convert; extern char * opt_shuffle; @@ -365,6 +371,7 @@ extern char * opt_sortbylength; extern char * opt_sortbysize; extern 
char * opt_tabbedout; +extern char * opt_tsegout; extern char * opt_uc; extern char * opt_uchime2_denovo; extern char * opt_uchime3_denovo; @@ -385,6 +392,7 @@ extern double opt_fastq_maxee_rate; extern double opt_fastq_truncee; extern double opt_id; +extern double opt_lca_cutoff; extern double opt_max_unmasked_pct; extern double opt_maxid; extern double opt_maxqt; diff -Nru vsearch-2.18.0/.travis.yml vsearch-2.21.1/.travis.yml --- vsearch-2.18.0/.travis.yml 2021-08-27 10:35:25.000000000 +0000 +++ vsearch-2.21.1/.travis.yml 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ -language: -- cpp - -arch: -#- amd64 -- arm64 -- ppc64le - -os: -- linux -#- osx - -dist: -- xenial - -osx_image: -- xcode12.5 - -addons: - apt: - packages: - - ghostscript - - valgrind - - groff - homebrew: - packages: - - ghostscript - -compiler: -- g++ -- clang - -script: -- ./autogen.sh -- ./configure -- make -- export PATH=$PWD/bin:$PATH -- git clone https://github.com/frederic-mahe/vsearch-tests.git -- cd vsearch-tests -- bash ./run_all_tests.sh