diff -Nru fastp-0.19.6+dfsg/debian/changelog fastp-0.20.0+dfsg/debian/changelog --- fastp-0.19.6+dfsg/debian/changelog 2019-01-18 21:24:34.000000000 +0000 +++ fastp-0.20.0+dfsg/debian/changelog 2019-07-22 13:23:32.000000000 +0000 @@ -1,3 +1,23 @@ +fastp (0.20.0+dfsg-1) unstable; urgency=medium + + * New upstream release. + * Standards-Version: 4.4.0. + * Buster is released, so upload to unstable. + + -- Dylan Aïssi Mon, 22 Jul 2019 15:23:32 +0200 + +fastp (0.19.8+dfsg-1) experimental; urgency=medium + + * New upstream release. + + -- Dylan Aïssi Sun, 31 Mar 2019 21:00:50 +0200 + +fastp (0.19.7+dfsg-1) experimental; urgency=medium + + * New upstream release. + + -- Dylan Aïssi Sat, 30 Mar 2019 10:55:06 +0100 + fastp (0.19.6+dfsg-1) unstable; urgency=medium * New upstream release. diff -Nru fastp-0.19.6+dfsg/debian/control fastp-0.20.0+dfsg/debian/control --- fastp-0.19.6+dfsg/debian/control 2019-01-18 21:24:34.000000000 +0000 +++ fastp-0.20.0+dfsg/debian/control 2019-07-22 13:23:32.000000000 +0000 @@ -6,7 +6,7 @@ Build-Depends: debhelper (>= 12), help2man, zlib1g-dev -Standards-Version: 4.3.0 +Standards-Version: 4.4.0 Vcs-Browser: https://salsa.debian.org/med-team/fastp Vcs-Git: https://salsa.debian.org/med-team/fastp.git Homepage: https://github.com/OpenGene/fastp diff -Nru fastp-0.19.6+dfsg/debian/patches/makefile.patch fastp-0.20.0+dfsg/debian/patches/makefile.patch --- fastp-0.19.6+dfsg/debian/patches/makefile.patch 2019-01-18 21:24:34.000000000 +0000 +++ fastp-0.20.0+dfsg/debian/patches/makefile.patch 2019-07-22 13:23:32.000000000 +0000 @@ -8,11 +8,11 @@ @@ -15,16 +15,16 @@ BIN_TARGET := ${TARGET} - CXX := g++ --CXXFLAGS := -std=c++11 -g -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) -+CXXFLAGS += -std=c++11 -g -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) + CXX ?= g++ +-CXXFLAGS := -std=c++11 -g -O3 -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} ++CXXFLAGS += -std=c++11 -g -O3 
-I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) LIBS := -lz -lpthread --LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) +-LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) +LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) @@ -21,8 +21,8 @@ + $(CXX) $(OBJ) -o $@ $(LDFLAGS) ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp make_obj_dir -- $(CXX) $(CXXFLAGS) -O3 -c $< -o $@ -+ $(CXX) $(CPPFLAGS) $(CXXFLAGS) -O3 -c $< -o $@ +- $(CXX) -c $< -o $@ $(CXXFLAGS) ++ $(CXX) -c $< -o $@ $(CPPFLAGS) $(CXXFLAGS) .PHONY:clean clean: diff -Nru fastp-0.19.6+dfsg/debian/tests/run-sample-analysis fastp-0.20.0+dfsg/debian/tests/run-sample-analysis --- fastp-0.19.6+dfsg/debian/tests/run-sample-analysis 2019-01-18 21:24:34.000000000 +0000 +++ fastp-0.20.0+dfsg/debian/tests/run-sample-analysis 2019-07-22 13:23:32.000000000 +0000 @@ -12,9 +12,10 @@ fastp -i R1.fq -o out.R1.fq # Test for paired end data (gzip compressed) +gzip --keep R*.fq fastp -i R1.fq.gz -I R2.fq.gz -o out.R1.fq.gz -O out.R2.fq.gz # Test for Unique Molecular Identifier (UMI) processing -fastp -i R1.fq -o out.R1.fq -U --umi_loc=read1 --umi_len=8 +fastp -i R2.fq -o out.R2.fq -U --umi_loc=read1 --umi_len=8 rm -f $AUTOPKGTEST_TMP/* diff -Nru fastp-0.19.6+dfsg/Makefile fastp-0.20.0+dfsg/Makefile --- fastp-0.19.6+dfsg/Makefile 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/Makefile 2019-04-17 03:23:22.000000000 +0000 @@ -14,17 +14,17 @@ BIN_TARGET := ${TARGET} -CXX := g++ -CXXFLAGS := -std=c++11 -g -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) +CXX ?= g++ +CXXFLAGS := -std=c++11 -g -O3 -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} LIBS := -lz -lpthread -LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) +LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) ${BIN_TARGET}:${OBJ} $(CXX) $(OBJ) -o $@ $(LD_FLAGS) 
${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp make_obj_dir - $(CXX) $(CXXFLAGS) -O3 -c $< -o $@ + $(CXX) -c $< -o $@ $(CXXFLAGS) .PHONY:clean clean: diff -Nru fastp-0.19.6+dfsg/README.md fastp-0.20.0+dfsg/README.md --- fastp-0.19.6+dfsg/README.md 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/README.md 2019-04-17 03:23:22.000000000 +0000 @@ -21,6 +21,7 @@ * [unique molecular identifier (UMI) processing](#unique-molecular-identifier-umi-processing) * [output splitting](#output-splitting) * [overrepresented sequence analysis](#overrepresented-sequence-analysis) +* [merge paired-end reads](#merge-paired-end-reads) * [all options](#all-options) * [citation](#citation) @@ -29,7 +30,7 @@ 1. filter out bad reads (too low quality, too short, or too many N...) 2. cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). 3. trim all reads in front and tail -4. cut adapters. Adapter sequences can be automatically detected,which means you don't have to input the adapter sequences to trim them. +4. cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. 5. correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality 6. trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. polyA tailing for mRNA-Seq data) 7. preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. @@ -100,12 +101,21 @@ ## input from STDIN * specify `--stdin` if you want to read the STDIN for processing. * if the STDIN is an interleaved paired-end stream, specify `--interleaved_in` to indicate that. 
+## store the unpaired reads for PE data +* you can specify `--unpaired1` to store the reads that read1 passes filters but its paired read2 doesn't, as well as `--unpaired2` for unpaired read2. +* `--unpaired1` and `--unpaired2` can be the same, so the unpaired read1/read2 will be written to the same single file. +## store the reads that fail the filters +* give `--failed_out` to specify the file name to store the failed reads. +* if one read failed and is written to `--failed_out`, its `failure reason` will be appended to its read name. For example, `failed_quality_filter`, `failed_too_short` etc. +* for PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the `failure reason` will be `paired_read_is_failing`. ## process only part of the data If you don't want to process all the data, you can specify `--reads_to_process` to limit the reads to be processed. This is useful if you want to have a fast preview of the data quality, or you want to create a subset of the filtered data. ## do not overwrite exiting files You can enable the option `--dont_overwrite` to protect the existing files not to be overwritten by `fastp`. In this case, `fastp` will report an error and quit if it finds any of the output files (read1, read2, json report, html report) already exists before. ## split the output to multiple files for parallel processing See [output splitting](#output-splitting) +## merge PE reads +See [merge paired-end reads](#merge-paired-end-reads) # filtering Multiple filters have been implemented. @@ -116,6 +126,9 @@ * `-q, --qualified_quality_phred`       the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. * `-u, --unqualified_percent_limit`   how many percents of bases are allowed to be unqualified (0~100). 
Default 40 means 40% +You can also filter reads by its average quality score +* `-e, --average_qual` if one read's average quality score Illumina TruSeq Adapter Read 1 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>Illumina TruSeq Adapter Read 2 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>polyA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +``` + +The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). + +`fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. + The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. # per read cutting by quality score @@ -162,7 +189,7 @@ # base correction for PE data `fastp` perform `overlap analysis` for PE data, which try to find an overlap of each pair of reads. If an proper overlap is found, it can correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality. If a base is corrected, the quality of its paired base will be assigned to it so that they will share the same quality.   -This function is not enabled by default, specify `-c` or `--correction` to enable it. +This function is not enabled by default, specify `-c` or `--correction` to enable it. This function is based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)`, `overlap_diff_limit (default 5)` and `overlap_diff_limit_percent (default 20%)`. Please note that the reads should meet these three conditions simultaneously. # global trimming `fastp` supports global trimming, which means trim all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. 
@@ -182,9 +209,9 @@ 5, quality pruning by sliding window (--cut_right) 6, quality pruning at 3' (--cut_tail) 7, trim polyG (--trim_poly_g, enabled by default for NovaSeq/NextSeq data) -8, trim polyX (--trim_poly_x) -9, trim adapter by overlap analysis (enabled by default for PE data) -10, trim adapter by adapter sequence (--adapter_sequence, --adapter_sequence_r2. For PE data, this step is skipped if last step succeeded) +8, trim adapter by overlap analysis (enabled by default for PE data) +9, trim adapter by adapter sequence (--adapter_sequence, --adapter_sequence_r2. For PE data, this step is skipped if last step succeeded) +10, trim polyX (--trim_poly_x) 11, trim to max length (---max_len) ``` @@ -250,6 +277,23 @@ `fastp` not only gives the counts of overrepresented sequence, but also gives the information that how they distribute over cycles. A figure is provided for each detected overrepresented sequence, from which you can know where this sequence is mostly found. +# merge paired-end reads +For paired-end (PE) input, fastp supports stitching them by specifying the `-m/--merge` option. In this `merging` mode: + +* `--merged_out` should be given to specify the file to store merged reads, otherwise you should enable `--stdout` to stream the merged reads to STDOUT. The merged reads are also filtered. +* `--out1` and `--out2` will be the reads that cannot be merged successfully, but both pass all the filters. +* `--unpaired1` will be the reads that cannot be merged, `read1` passes filters but `read2` doesn't. +* `--unpaired2` will be the reads that cannot be merged, `read2` passes filters but `read1` doesn't. +* `--include_unmerged` can be enabled to make reads of `--out1`, `--out2`, `--unpaired1` and `--unpaired2` redirected to `--merged_out`. So you will get a single output file. This option is disabled by default. + +`--failed_out` can still be given to store the reads (either merged or unmerged) that failed to pass the filters. 
+ +In the output file, a tag like `merged_xxx_yyy` will be added to each read name to indicate how many base pairs are from read1 and from read2, respectively. For example, ` +@NB551106:9:H5Y5GBGX2:1:22306:18653:13119 1:N:0:GATCAG merged_150_15` +means that 150bp are from read1, and 15bp are from read2. `fastp` prefers the bases in read1 since they usually have higher quality than read2. + +Same as the [base correction feature](#base-correction-for-pe-data), this function is also based on overlapping detection, which has adjustable parameters `overlap_len_require (default 30)`, `overlap_diff_limit (default 5)` and `overlap_diff_limit_percent (default 20%)`. Please note that the reads should meet these three conditions simultaneously. + # all options ```shell usage: fastp -i -o [-I -O ] [options...] @@ -258,7 +302,13 @@ -i, --in1 read1 input file name (string) -o, --out1 read1 output file name (string [=]) -I, --in2 read2 input file name (string [=]) - -O, --out2 read2 output file name (string [=]) + -O, --out2 read2 output file name (string [=]) + --unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=]) + --unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --umpaired1 (default mode), both unpaired reads will be written to this same file. (string [=]) + --failed_out specify the file to store reads that cannot pass the filters. (string [=]) + -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. 
+ --merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=]) + --include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. -6, --phred64 indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) -z, --compression compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. (int [=4]) --stdin input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in. @@ -271,6 +321,7 @@ -A, --disable_adapter_trimming adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled -a, --adapter_sequence the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto]) --adapter_sequence_r2 the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as (string [=]) + --adapter_fasta specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=]) --detect_adapter_for_pe by default, the adapter sequence auto-detection is enabled for SE data only, turn on this option to enable it for PE data. # global trimming options @@ -308,6 +359,8 @@ -q, --qualified_quality_phred the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15]) -u, --unqualified_percent_limit how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40]) -n, --n_base_limit if one read's number of N base is >n_base_limit, then this read/pair is discarded. 
Default is 5 (int [=5]) + -e, --average_qual if one read's average quality score r1->length()/3) { - string adapter1 = r1->mSeq.mStr.substr(ol, r1->length() - ol); - string adapter2 = r2->mSeq.mStr.substr(ol, r2->length() - ol); + if(ov.overlapped && ov.offset < 0) { + + //5' ......frontTrimmed1......|------------------------------------------|----- 3' + //3' -----|-------------------------------------------|......frontTrimmed2..... 5' + + int len1 = min(r1->length(), ol + frontTrimmed2); + int len2 = min(r2->length(), ol + frontTrimmed1); + string adapter1 = r1->mSeq.mStr.substr(len1, r1->length() - len1); + string adapter2 = r2->mSeq.mStr.substr(len2, r2->length() - len2); if(_DEBUG) { cerr << adapter1 << endl; cerr << adapter2 << endl; + cerr << "frontTrimmed2: " << frontTrimmed1 << endl; + cerr << "frontTrimmed2: " << frontTrimmed2 << endl; cerr << "overlap:" << ov.offset << "," << ov.overlap_len << ", " << ov.diff << endl; r1->print(); r2->reverseComplement()->print(); cerr <mSeq.mStr = r1->mSeq.mStr.substr(0, ol); - r1->mQuality = r1->mQuality.substr(0, ol); - r2->mSeq.mStr = r2->mSeq.mStr.substr(0, ol); - r2->mQuality = r2->mQuality.substr(0, ol); + r1->mSeq.mStr = r1->mSeq.mStr.substr(0, len1); + r1->mQuality = r1->mQuality.substr(0, len1); + r2->mSeq.mStr = r2->mSeq.mStr.substr(0, len2); + r2->mQuality = r2->mQuality.substr(0, len2); fr->addAdapterTrimmed(adapter1, adapter2); return true; @@ -38,8 +45,31 @@ return false; } -bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2) { - const int matchReq = 4; +bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector& adapterList, bool isR2, bool incTrimmedCounter) { + int matchReq = 4; + if(adapterList.size() > 16) + matchReq = 5; + if(adapterList.size() > 256) + matchReq = 6; + bool trimmed = false; + + string originalSeq = r->mSeq.mStr; + for(int i=0; ilength(), originalSeq.length() - r->length()); + if(fr) + fr->addAdapterTrimmed(adapter, isR2, 
incTrimmedCounter); + else + cerr << adapter << endl; + } + + return trimmed; +} + +bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) { const int allowOneMismatchForEach = 8; int rlen = r->length(); @@ -112,5 +142,23 @@ "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); string adapter = "TTTTCCACGGGGATACTACTG"; bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter); - return r.mSeq.mStr == "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA"; + if (r.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA") + return false; + + Read read("@name", + "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGGAAATTTCCCGGGAAATTTCCCGGGATCGATCGATCGATCGAATTCC", + "+", + "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"); + vector adapterList; + adapterList.push_back("GCTAGCTAGCTAGCTA"); + adapterList.push_back("AAATTTCCCGGGAAATTTCCCGGG"); + adapterList.push_back("ATCGATCGATCGATCG"); + adapterList.push_back("AATTCCGGAATTCCGG"); + trimmed = AdapterTrimmer::trimByMultiSequences(&read, NULL, adapterList); + if (read.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") { + cerr << read.mSeq.mStr << endl; + return false; + } + + return true; } \ No newline at end of file diff -Nru fastp-0.19.6+dfsg/src/adaptertrimmer.h fastp-0.20.0+dfsg/src/adaptertrimmer.h --- fastp-0.19.6+dfsg/src/adaptertrimmer.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/adaptertrimmer.h 2019-04-17 03:23:22.000000000 +0000 @@ -15,9 +15,10 @@ AdapterTrimmer(); ~AdapterTrimmer(); - static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); - static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); - static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false); + static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, 
double diffPercentLimit); + static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); + static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4); + static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector& adapterList, bool isR2 = false, bool incTrimmedCounter = true); static bool test(); diff -Nru fastp-0.19.6+dfsg/src/basecorrector.cpp fastp-0.20.0+dfsg/src/basecorrector.cpp --- fastp-0.19.6+dfsg/src/basecorrector.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/basecorrector.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -8,14 +8,14 @@ BaseCorrector::~BaseCorrector(){ } -int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); +int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); return correctByOverlapAnalysis(r1, r2, fr, ov); } int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) { - // we only correct overlap with up to 5 mismatches - if(ov.diff == 0 || ov.diff > 5) + // we only correct overlap + if(ov.diff == 0 || !ov.overlapped) return 0; int ol = ov.overlap_len; @@ -92,7 +92,7 @@ "+", "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE"); - correctByOverlapAnalysis(&r1, &r2, NULL); + correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2); if(r1.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") return false; diff -Nru fastp-0.19.6+dfsg/src/basecorrector.h fastp-0.20.0+dfsg/src/basecorrector.h --- fastp-0.19.6+dfsg/src/basecorrector.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/basecorrector.h 2019-04-17 03:23:22.000000000 +0000 @@ -15,7 +15,7 @@ 
BaseCorrector(); ~BaseCorrector(); - static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr); + static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); static bool test(); }; diff -Nru fastp-0.19.6+dfsg/src/common.h fastp-0.20.0+dfsg/src/common.h --- fastp-0.19.6+dfsg/src/common.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/common.h 2019-04-17 03:23:22.000000000 +0000 @@ -1,7 +1,7 @@ #ifndef COMMON_H #define COMMON_H -#define FASTP_VER "0.19.6" +#define FASTP_VER "0.20.0" #define _DEBUG false @@ -17,6 +17,8 @@ typedef char int8; typedef unsigned char uint8; +const char ATCG_BASES[] = {'A', 'T', 'C', 'G'}; + #pragma pack(2) @@ -52,5 +54,16 @@ // how many types in total we support static const int FILTER_RESULT_TYPES = 32; +const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = { + "passed", "", "", "", + "failed_polyx_filter", "", "", "", + "failed_bad_overlap", "", "", "", + "failed_too_many_n_bases", "", "", "", + "failed_too_short", "failed_too_long", "", "", + "failed_quality_filter", "", "", "", + "failed_low_complexity", "", "", "", + "", "", "", "" +}; + #endif /* COMMON_H */ diff -Nru fastp-0.19.6+dfsg/src/evaluator.cpp fastp-0.20.0+dfsg/src/evaluator.cpp --- fastp-0.19.6+dfsg/src/evaluator.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/evaluator.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -617,7 +617,7 @@ string matchedAdapter = matchKnownAdapter(adapter); if(!matchedAdapter.empty()) { map knownAdapters = getKnownAdapter(); - cerr << knownAdapters[matchedAdapter] << ": " << matchedAdapter << endl; + cerr << knownAdapters[matchedAdapter] << endl << matchedAdapter << endl; return matchedAdapter; } else { if(reachedLeaf) { diff -Nru fastp-0.19.6+dfsg/src/fastareader.cpp fastp-0.20.0+dfsg/src/fastareader.cpp --- 
fastp-0.19.6+dfsg/src/fastareader.cpp 1970-01-01 00:00:00.000000000 +0000 +++ fastp-0.20.0+dfsg/src/fastareader.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -0,0 +1,120 @@ + +#include "fastareader.h" +#include "util.h" +#include + +FastaReader::FastaReader(string faFile, bool forceUpperCase) +{ + // Set locale and disable stdio synchronization to improve iostream performance + // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305 + // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better + setlocale(LC_ALL,"C"); + ios_base::sync_with_stdio(false); + + mFastaFile = faFile; + mForceUpperCase = forceUpperCase; + if (is_directory(mFastaFile)) { + string error_msg = "There is a problem with the provided fasta file: \'"; + error_msg.append(mFastaFile); + error_msg.append("\' is a directory NOT a file...\n"); + throw invalid_argument(error_msg); + } + mFastaFileStream.open( mFastaFile.c_str(),ios::in); + // verify that the file can be read + if (!mFastaFileStream.is_open()) { + string msg = "There is a problem with the provided fasta file: could NOT read "; + msg.append(mFastaFile.c_str()); + msg.append("...\n"); + throw invalid_argument(msg); + } + + char c; + // seek to first contig + while (mFastaFileStream.get(c) && c != '>') { + if (mFastaFileStream.eof()) { + break; + } + } +} + +FastaReader::~FastaReader() +{ + if (mFastaFileStream.is_open()) { + mFastaFileStream.close(); + } +} + +void FastaReader::readNext() +{ + mCurrentID = ""; + mCurrentDescription = ""; + mCurrentSequence = ""; + bool foundHeader = false; + + char c; + stringstream ssSeq; + stringstream ssHeader; + while(true){ + mFastaFileStream.get(c); + if(c == '>' || mFastaFileStream.eof()) + break; + else { + if (foundHeader){ + if(mForceUpperCase && c>='a' && c<='z') { + c -= ('a' - 'A'); + } + ssSeq << c; + } + else + ssHeader << c; + } + + string line = ""; + getline(mFastaFileStream,line,'\n'); + + + if(foundHeader == false) { + ssHeader << line; + 
foundHeader = true; + } + else { + str_keep_valid_sequence(line, mForceUpperCase); + ssSeq << line; + } + } + mCurrentSequence = ssSeq.str(); + string header = ssHeader.str(); + + mCurrentID = header; +} + +bool FastaReader::hasNext() { + return !mFastaFileStream.eof(); +} + +void FastaReader::readAll() { + while(!mFastaFileStream.eof()){ + readNext(); + mAllContigs[mCurrentID] = mCurrentSequence; + } +} + +bool FastaReader::test(){ + FastaReader reader("testdata/tinyref.fa"); + reader.readAll(); + + string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT"; + string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA"; + + if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 ) + return false; + + if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 ) + return false; + + return true; + +} + + + diff -Nru fastp-0.19.6+dfsg/src/fastareader.h fastp-0.20.0+dfsg/src/fastareader.h --- fastp-0.19.6+dfsg/src/fastareader.h 1970-01-01 00:00:00.000000000 +0000 +++ fastp-0.20.0+dfsg/src/fastareader.h 2019-04-17 03:23:22.000000000 +0000 @@ -0,0 +1,66 @@ +#ifndef FASTA_READER_H +#define FASTA_READER_H + +// includes +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class FastaReader +{ +public: + FastaReader(string fastaFile, bool forceUpperCase = true); + ~FastaReader(); + bool hasNext(); + void readNext(); + void readAll(); + + inline string currentID() + { + return mCurrentID; + } + + inline string currentDescription() + { + return mCurrentDescription; + } + + inline string currentSequence() + { + return mCurrentSequence; + } + + inline map& contigs() { + return mAllContigs; + } + + static bool test(); + + +public: + string mCurrentSequence; + string mCurrentID ; + string mCurrentDescription; + map mAllContigs; + +private: + bool readLine(); + bool endOfLine(char c); + void 
setFastaSequenceIdDescription(); + +private: + string mFastaFile; + ifstream mFastaFileStream; + bool mForceUpperCase; +}; + + +#endif + diff -Nru fastp-0.19.6+dfsg/src/fastqreader.cpp fastp-0.20.0+dfsg/src/fastqreader.cpp --- fastp-0.19.6+dfsg/src/fastqreader.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/fastqreader.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -92,6 +92,8 @@ } string FastqReader::getLine(){ + static int c=0; + c++; int copied = 0; int start = mBufUsedLen; @@ -112,7 +114,7 @@ // skip \n or \r end++; // handle \r\n - if(end < mBufDataLen-1 && mBuf[end] == '\n') + if(end < mBufDataLen-1 && mBuf[end-1]=='\r' && mBuf[end] == '\n') end++; mBufUsedLen = end; @@ -145,7 +147,6 @@ end++; mBufUsedLen = end; - return str; } // even this new buf is not enough, although impossible @@ -174,12 +175,17 @@ } string name = getLine(); - string sequence = getLine(); - string strand = getLine(); + // name should start with @ + while((name.empty() && !(mBufUsedLen >= mBufDataLen && eof())) || (!name.empty() && name[0]!='@')){ + name = getLine(); + } - if(name.empty() || sequence.empty() || strand.empty()) + if(name.empty()) return NULL; + string sequence = getLine(); + string strand = getLine(); + // WAR for FQ with no quality if (!mHasQuality){ string quality = string(sequence.length(), 'K'); @@ -187,8 +193,14 @@ } else { string quality = getLine(); - if(quality.empty()) + if(quality.length() != sequence.length()) { + cerr << "ERROR: sequence and quality have different length:" << endl; + cerr << name << endl; + cerr << sequence << endl; + cerr << strand << endl; + cerr << quality << endl; return NULL; + } return new Read(name, sequence, strand, quality, mPhred64); } diff -Nru fastp-0.19.6+dfsg/src/filter.cpp fastp-0.20.0+dfsg/src/filter.cpp --- fastp-0.19.6+dfsg/src/filter.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/filter.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -19,6 +19,7 @@ int rlen = r->length(); int lowQualNum = 0; int 
nBaseNum = 0; + int totalQual = 0; // need to recalculate lowQualNum and nBaseNum if the corresponding filters are enabled if(mOptions->qualfilter.enabled || mOptions->lengthFilter.enabled) { @@ -29,6 +30,8 @@ char base = seqstr[i]; char qual = qualstr[i]; + totalQual += qual - 33; + if(qual < mOptions->qualfilter.qualifiedQual) lowQualNum ++; @@ -40,6 +43,8 @@ if(mOptions->qualfilter.enabled) { if(lowQualNum > (mOptions->qualfilter.unqualifiedPercentLimit * rlen / 100.0) ) return FAIL_QUALITY; + else if(mOptions->qualfilter.avgQualReq > 0 && (totalQual / rlen)qualfilter.avgQualReq) + return FAIL_QUALITY; else if(nBaseNum > mOptions->qualfilter.nBaseLimit ) return FAIL_N_BASE; } @@ -75,9 +80,10 @@ return false; } -Read* Filter::trimAndCut(Read* r, int front, int tail) { +Read* Filter::trimAndCut(Read* r, int front, int tail, int& frontTrimmed) { + frontTrimmed = 0; // return the same read for speed if no change needed - if(front == 0 && tail == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail) + if(front == 0 && tail == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight) return r; @@ -91,6 +97,7 @@ } else if(!mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){ r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen); r->mQuality = r->mQuality.substr(front, rlen); + frontTrimmed = front; return r; } @@ -205,6 +212,8 @@ r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen); r->mQuality = r->mQuality.substr(front, rlen); + frontTrimmed = front; + return r; } @@ -257,7 +266,8 @@ opt.qualityCut.windowSizeTail = 4; opt.qualityCut.qualityTail = 20; Filter filter(&opt); - Read* ret = filter.trimAndCut(&r, 0, 1); + int frontTrimmed = 0; + Read* ret = filter.trimAndCut(&r, 0, 1, frontTrimmed); ret->print(); return ret->mSeq.mStr == "CCCCCCCCCCCCCCCCCCCCCCCCCCCC" diff -Nru fastp-0.19.6+dfsg/src/filter.h fastp-0.20.0+dfsg/src/filter.h --- 
fastp-0.19.6+dfsg/src/filter.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/filter.h 2019-04-17 03:23:22.000000000 +0000 @@ -16,7 +16,7 @@ ~Filter(); int passFilter(Read* r); bool passLowComplexityFilter(Read* r); - Read* trimAndCut(Read* r, int front, int tail); + Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed); bool filterByIndex(Read* r); bool filterByIndex(Read* r1, Read* r2); static bool test(); diff -Nru fastp-0.19.6+dfsg/src/filterresult.cpp fastp-0.20.0+dfsg/src/filterresult.cpp --- fastp-0.19.6+dfsg/src/filterresult.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/filterresult.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -9,6 +9,7 @@ mPaired = paired; mTrimmedAdapterRead = 0; mTrimmedAdapterBases = 0; + mMergedPairs = 0; for(int i=0; i= FILTER_RESULT_TYPES) return ; - // for paired end data, both reads are filtered together - if(mPaired) - mFilterReadStats[result] += 2; - else - mFilterReadStats[result]++; + mFilterReadStats[result] += readNum; +} + +void FilterResult::addMergedPairs(int pairs) { + mMergedPairs += pairs; } FilterResult* FilterResult::merge(vector& list) { @@ -45,6 +46,12 @@ } result->mTrimmedAdapterRead += list[i]->mTrimmedAdapterRead; result->mTrimmedAdapterBases += list[i]->mTrimmedAdapterBases; + result->mMergedPairs += list[i]->mMergedPairs; + + for(int b=0; b<4; b++) { + result->mTrimmedPolyXReads[b] += list[i]->mTrimmedPolyXReads[b]; + result->mTrimmedPolyXBases[b] += list[i]->mTrimmedPolyXBases[b]; + } // merge adapter stats map::iterator iter; @@ -102,10 +109,11 @@ return mCorrectionMatrix[f*8 + t]; } -void FilterResult::addAdapterTrimmed(string adapter, bool isR2) { +void FilterResult::addAdapterTrimmed(string adapter, bool isR2, bool incTrimmedCounter ) { if(adapter.empty()) return; - mTrimmedAdapterRead++; + if(incTrimmedCounter) + mTrimmedAdapterRead++; mTrimmedAdapterBases += adapter.length(); if(!isR2) { if(mAdapter1.count(adapter) >0 ) @@ -138,6 +146,26 @@ } } +void 
FilterResult::addPolyXTrimmed(int base, int length) { + mTrimmedPolyXReads[base] += 1; + mTrimmedPolyXBases[base] += length; +} + +long FilterResult::getTotalPolyXTrimmedReads() { + long sum_reads = 0; + for(int b = 0; b < 4; b++) + sum_reads += mTrimmedPolyXReads[b]; + return sum_reads; +} + +long FilterResult::getTotalPolyXTrimmedBases() { + long sum_bases = 0; + for(int b = 0; b < 4; b++) + sum_bases += mTrimmedPolyXBases[b]; + return sum_bases; +} + + void FilterResult::print() { cerr << "reads passed filter: " << mFilterReadStats[PASS_FILTER] << endl; cerr << "reads failed due to low quality: " << mFilterReadStats[FAIL_QUALITY] << endl; @@ -154,6 +182,10 @@ cerr << "reads with adapter trimmed: " << mTrimmedAdapterRead << endl; cerr << "bases trimmed due to adapters: " << mTrimmedAdapterBases << endl; } + if(mOptions->polyXTrim.enabled) { + cerr << "reads with polyX in 3' end: " << getTotalPolyXTrimmedReads() << endl; + cerr << "bases trimmed in polyX tail: " << getTotalPolyXTrimmedBases() << endl; + } if(mOptions->correction.enabled) { cerr << "reads corrected by overlap analysis: " << mCorrectedReads << endl; cerr << "bases corrected by overlap analysis: " << getTotalCorrectedBases() << endl; @@ -241,6 +273,25 @@ ofs << padding << "}," << endl; } +void writeBaseCountsJson(ofstream& ofs, string pad, string key, long total, long (&counts)[4]) { + ofs << pad << "\t\"total_" << key << "\": " << total << "," << endl; + ofs << pad << "\t\"" << key << "\":{"; + for (int b=0; b<4; b++) { + if(b > 0) + ofs << ", "; + ofs << "\"" << ATCG_BASES[b] << "\": " << counts[b]; + } + ofs << "}"; +} + +void FilterResult::reportPolyXTrimJson(ofstream& ofs, string padding) { + ofs << padding << "{" << endl; + writeBaseCountsJson(ofs, padding, "polyx_trimmed_reads", getTotalPolyXTrimmedReads(), mTrimmedPolyXReads); + ofs << "," << endl; + writeBaseCountsJson(ofs, padding, "polyx_trimmed_bases", getTotalPolyXTrimmedBases(), mTrimmedPolyXBases); + ofs << endl << padding << "}," << 
endl; +} + /*void FilterResult::reportHtml(ofstream& ofs, long totalReads) { const int types = 4; const string divName = "filtering_result"; diff -Nru fastp-0.19.6+dfsg/src/filterresult.h fastp-0.20.0+dfsg/src/filterresult.h --- fastp-0.19.6+dfsg/src/filterresult.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/filterresult.h 2019-04-17 03:23:22.000000000 +0000 @@ -29,17 +29,22 @@ FilterResult(Options* opt, bool paired = false); ~FilterResult(); inline long* getFilterReadStats() {return mFilterReadStats;} - void addFilterResult(int result); + void addFilterResult(int result, int readNum=1); static FilterResult* merge(vector& list); void print(); // for single end - void addAdapterTrimmed(string adapter, bool isR2 = false); + void addAdapterTrimmed(string adapter, bool isR2 = false, bool incTrimmedCounter = true); // for paired end void addAdapterTrimmed(string adapter1, string adapter2); + void addPolyXTrimmed(int base, int length); + long getTotalPolyXTrimmedReads(); + long getTotalPolyXTrimmedBases(); // a part of JSON report void reportJson(ofstream& ofs, string padding); // a part of JSON report for adapters void reportAdapterJson(ofstream& ofs, string padding); + // a part of JSON report for polyX trim + void reportPolyXTrimJson(ofstream& ofs, string padding); // a part of HTML report void reportHtml(ofstream& ofs, long totalReads, long totalBases); // a part of HTML report for adapters @@ -52,15 +57,20 @@ void addCorrection(char from, char to); long getCorrectionNum(char from, char to); void incCorrectedReads(int count); + void addMergedPairs(int pairs); + public: Options* mOptions; bool mPaired; long mCorrectedReads; + long mMergedPairs; private: long mFilterReadStats[FILTER_RESULT_TYPES]; long mTrimmedAdapterRead; long mTrimmedAdapterBases; + long mTrimmedPolyXReads[4] = {0}; + long mTrimmedPolyXBases[4] = {0}; map mAdapter1; map mAdapter2; long* mCorrectionMatrix; diff -Nru fastp-0.19.6+dfsg/src/htmlreporter.cpp 
fastp-0.20.0+dfsg/src/htmlreporter.cpp --- fastp-0.19.6+dfsg/src/htmlreporter.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/htmlreporter.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -117,7 +117,8 @@ // report read length change if(mOptions->isPaired()) { outputRow(ofs, "mean length before filtering:", to_string(preStats1->getMeanLength()) + "bp, " + to_string(preStats2->getMeanLength()) + "bp"); - outputRow(ofs, "mean length after filtering:", to_string(postStats1->getMeanLength()) + "bp, " + to_string(postStats2->getMeanLength()) + "bp"); + if(!mOptions->merge.enabled) + outputRow(ofs, "mean length after filtering:", to_string(postStats1->getMeanLength()) + "bp, " + to_string(postStats2->getMeanLength()) + "bp"); } else { outputRow(ofs, "mean length before filtering:", to_string(preStats1->getMeanLength()) + "bp"); outputRow(ofs, "mean length after filtering:", to_string(postStats1->getMeanLength()) + "bp"); @@ -208,7 +209,8 @@ } void HtmlReporter::reportInsertSize(ofstream& ofs, int isizeLimit) { - + if(isizeLimit<1) + isizeLimit = 1; int total = min(mOptions->insertSizeMax, isizeLimit); long *x = new long[total]; double allCount = 0; @@ -349,11 +351,14 @@ ofs << "\n"; ofs << "
\n"; - if(postStats1) { - postStats1 -> reportHtml(ofs, "After filtering", "read1"); + if(postStats1) { + string name = "read1"; + if(mOptions->merge.enabled) + name = "merged"; + postStats1 -> reportHtml(ofs, "After filtering", name); } - if(postStats2) { + if(postStats2 && !mOptions->merge.enabled) { postStats2 -> reportHtml(ofs, "After filtering", "read2"); } diff -Nru fastp-0.19.6+dfsg/src/jsonreporter.cpp fastp-0.20.0+dfsg/src/jsonreporter.cpp --- fastp-0.19.6+dfsg/src/jsonreporter.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/jsonreporter.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -90,7 +90,7 @@ ofs << "\t\t\t" << "\"q20_rate\":" << (post_total_bases == 0?0.0:(double)post_q20_bases / (double)post_total_bases) << "," << endl; ofs << "\t\t\t" << "\"q30_rate\":" << (post_total_bases == 0?0.0:(double)post_q30_bases / (double)post_total_bases) << "," << endl; ofs << "\t\t\t" << "\"read1_mean_length\":" << postStats1->getMeanLength() << "," << endl; - if(mOptions->isPaired()) + if(mOptions->isPaired() && !mOptions->merge.enabled) ofs << "\t\t\t" << "\"read2_mean_length\":" << postStats2->getMeanLength() << "," << endl; ofs << "\t\t\t" << "\"gc_content\":" << (post_total_bases == 0?0.0:(double)post_total_gc / (double)post_total_bases) << endl; ofs << "\t\t" << "}"; @@ -145,22 +145,30 @@ result -> reportAdapterJson(ofs, "\t"); } + if(result && mOptions->polyXTrimmingEnabled()) { + ofs << "\t" << "\"polyx_trimming\": " ; + result -> reportPolyXTrimJson(ofs, "\t"); + } + if(preStats1) { ofs << "\t" << "\"read1_before_filtering\": " ; preStats1 -> reportJson(ofs, "\t"); } - if(postStats1) { - ofs << "\t" << "\"read1_after_filtering\": " ; - postStats1 -> reportJson(ofs, "\t"); - } - if(preStats2) { ofs << "\t" << "\"read2_before_filtering\": " ; preStats2 -> reportJson(ofs, "\t"); } - if(postStats2) { + if(postStats1) { + string name = "read1_after_filtering"; + if(mOptions->merge.enabled) + name = "merged_and_filtered"; + ofs << "\t" << "\"" << name 
<< "\": " ; + postStats1 -> reportJson(ofs, "\t"); + } + + if(postStats2 && !mOptions->merge.enabled) { ofs << "\t" << "\"read2_after_filtering\": " ; postStats2 -> reportJson(ofs, "\t"); } diff -Nru fastp-0.19.6+dfsg/src/knownadapters.h fastp-0.20.0+dfsg/src/knownadapters.h --- fastp-0.19.6+dfsg/src/knownadapters.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/knownadapters.h 2019-04-17 03:23:22.000000000 +0000 @@ -1,6 +1,8 @@ #ifndef KNOWN_ADAPTERS_H #define KNOWN_ADAPTERS_H +// some adapter sequences are from https://github.com/stephenturner/adapters/blob/master/adapters_combined_256_unique.fasta + #include #include #include @@ -9,9 +11,237 @@ inline map getKnownAdapter() { map knownAdapters; - knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"] = "Illumina TruSeq Adapter Read 1"; - knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = "Illumina TruSeq Adapter Read 2"; - knownAdapters["GATCGTCGGACTGTAGAACTCTGAACGTGTAGA"] = "Illumina Small RNA Adapter Read 2"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"] = ">Illumina TruSeq Adapter Read 1"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = ">Illumina TruSeq Adapter Read 2"; + knownAdapters["GATCGTCGGACTGTAGAACTCTGAACGTGTAGA"] = ">Illumina Small RNA Adapter Read 2"; + knownAdapters["AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"] = ">Illumina DpnII expression PCR Primer 2 | >Illumina NlaIII expression PCR Primer 2 | >Illumina Small RNA PCR Primer 2 | >Illumina DpnII Gex PCR Primer 2 | >Illumina NlaIII Gex PCR Primer 2"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA"] = ">Illumina RNA PCR Primer"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">TruSeq_Universal_Adapter | >PrefixPE/1 | >PCR_Primer1 | >Illumina Single End PCR Primer 1 | >Illumina Paried End PCR Primer 1 | >Illumina Multiplexing PCR Primer 1.01 | >TruSeq Universal Adapter | >TruSeq_Universal_Adapter | >PrefixPE/1 | >PCR_Primer1"; + 
knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">pcr_dimer"; + knownAdapters["AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"] = ">PCR_Primers"; + knownAdapters["ACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">Illumina Single End Sequencing Primer | >Illumina Paired End Adapter 1 | >Illumina Paried End Sequencing Primer 1 | >Illumina Multiplexing Adapter 2 | >Illumina Multiplexing Read1 Sequencing Primer"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"] = ">PE2_rc | >TruSeq3_IndexedAdapter | >PE2_rc | >TruSeq3_IndexedAdapter"; + knownAdapters["AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">Reverse_adapter"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"] = ">TruSeq2_PE_r"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">PCR_Primer2_rc"; + knownAdapters["AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAA"] = ">PhiX_read1_adapter"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA"] = ">PE1_rc | >TruSeq3_UniversalAdapter | >PE1_rc | >TruSeq3_UniversalAdapter"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">PCR_Primer1_rc"; + knownAdapters["AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA"] = ">PhiX_read2_adapter"; + knownAdapters["AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq2_SE"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 35"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 10"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 10"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, 
Index 2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 38"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 27"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 25"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 31"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 44"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 37"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 6"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 6"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 5"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 5"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 23"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 30"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 21"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 42"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 33"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"] = ">PrefixPE/2 | >PCR_Primer2 | >Illumina Paired End PCR Primer 2 | >PrefixPE/2 | >PCR_Primer2"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 22"; + 
knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 1"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 1"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 17"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 9"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 9"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 47"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 28"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 45"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 7"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 7"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 34"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 3"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 3"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 18"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 24"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 26"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 43"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 14"; + 
knownAdapters["CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 16"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 20"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 11"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 11"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 39"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 41"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 12"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 12"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 29"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 8"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 8"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 46"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 40"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 15"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 32"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 48"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC"] = ">Illumina PCR Primer Index 4"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 4"; + 
knownAdapters["CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 36"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 13"; + knownAdapters["CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"] = ">RNA PCR Primer, Index 19"; + knownAdapters["CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"] = ">Illumina Single End Adapter 2 | >Illumina Single End PCR Primer 2"; + knownAdapters["CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT"] = ">ABI Solid3 Adapter B"; + knownAdapters["CCGACAGGTTCAGAGTTCTACAGTCCGACATG"] = ">Illumina NlaIII expression Sequencing Primer | >Illumina NlaIII Gex Sequencing Primer"; + knownAdapters["CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 | >I7_Primer_Nextera_XT_Index_Kit_v2_N711 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 | >I7_Primer_Nextera_XT_Index_Kit_v2_N711"; + knownAdapters["CCGAGCCCACGAGACACTCGCTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N716"; + knownAdapters["CCGAGCCCACGAGACACTGAGCGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N724"; + knownAdapters["CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 | >I7_Primer_Nextera_XT_Index_Kit_v2_N703 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 | >I7_Primer_Nextera_XT_Index_Kit_v2_N703"; + knownAdapters["CCGAGCCCACGAGACATCTCAGGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N715"; + knownAdapters["CCGAGCCCACGAGACATGCGCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N722"; + knownAdapters["CCGAGCCCACGAGACCAGAGAGGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N708"; + knownAdapters["CCGAGCCCACGAGACCCTAAGACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N726"; + knownAdapters["CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG"] = 
">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 | >I7_Primer_Nextera_XT_Index_Kit_v2_N710 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 | >I7_Primer_Nextera_XT_Index_Kit_v2_N710"; + knownAdapters["CCGAGCCCACGAGACCGATCAGTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N727"; + knownAdapters["CCGAGCCCACGAGACCGGAGCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N720"; + knownAdapters["CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 | >I7_Primer_Nextera_XT_Index_Kit_v2_N702 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 | >I7_Primer_Nextera_XT_Index_Kit_v2_N702"; + knownAdapters["CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 | >I7_Primer_Nextera_XT_Index_Kit_v2_N707 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 | >I7_Primer_Nextera_XT_Index_Kit_v2_N707"; + knownAdapters["CCGAGCCCACGAGACGCGTAGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N719"; + knownAdapters["CCGAGCCCACGAGACGCTACGCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N709"; + knownAdapters["CCGAGCCCACGAGACGCTCATGAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N714"; + knownAdapters["CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 | >I7_Primer_Nextera_XT_Index_Kit_v2_N705 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 | >I7_Primer_Nextera_XT_Index_Kit_v2_N705"; + knownAdapters["CCGAGCCCACGAGACGGAGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N718"; + knownAdapters["CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 | >I7_Primer_Nextera_XT_Index_Kit_v2_N712 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 | >I7_Primer_Nextera_XT_Index_Kit_v2_N712"; + knownAdapters["CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG"] = 
">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 | >I7_Primer_Nextera_XT_Index_Kit_v2_N701 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 | >I7_Primer_Nextera_XT_Index_Kit_v2_N701"; + knownAdapters["CCGAGCCCACGAGACTACGCTGCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N721"; + knownAdapters["CCGAGCCCACGAGACTAGCGCTCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N723"; + knownAdapters["CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 | >I7_Primer_Nextera_XT_Index_Kit_v2_N706 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 | >I7_Primer_Nextera_XT_Index_Kit_v2_N706"; + knownAdapters["CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 | >I7_Primer_Nextera_XT_Index_Kit_v2_N704 | >I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 | >I7_Primer_Nextera_XT_Index_Kit_v2_N704"; + knownAdapters["CCGAGCCCACGAGACTCGACGTCATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N729"; + knownAdapters["CCGAGCCCACGAGACTGCAGCTAATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Primer_Nextera_XT_Index_Kit_v2_N728"; + knownAdapters["CGACAGGTTCAGAGTTCTACAGTCCGACGATC"] = ">Illumina DpnII expression Sequencing Primer | >Illumina Small RNA Sequencing Primer | >Illumina DpnII Gex Sequencing Primer"; + knownAdapters["CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"] = ">Illumina Paired End Sequencing Primer 2"; + knownAdapters["CTAATACGACTCACTATAGGGCAAGCAGTGGTATCAACGCAGAGT"] = ">Clontech Universal Primer Mix Long"; + knownAdapters["CTGAGCGGGCTGGCAAGGCAGACCGATCTCGTATGCCGTCTTCTGCTTG"] = ">I7_Adapter_Nextera_No_Barcode"; + knownAdapters["CTGATGGCGCGAGGGAGGCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Adapter_Nextera"; + knownAdapters["CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG"] = ">ABI Solid3 Adapter A"; + knownAdapters["CTGTCTCTTATACACATCTCCGAGCCCACGAGAC"] = ">I7_Nextera_Transposase_1 | >Trans2_rc | >I7_Nextera_Transposase_1 | >Trans2_rc"; + 
knownAdapters["CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGC"] = ">I7_Nextera_Transposase_2"; + knownAdapters["CTGTCTCTTATACACATCTCTGATGGCGCGAGGGAGGC"] = ">I5_Nextera_Transposase_2"; + knownAdapters["CTGTCTCTTATACACATCTGACGCTGCCGACGA"] = ">I5_Nextera_Transposase_1 | >Trans1_rc | >I5_Nextera_Transposase_1 | >Trans1_rc"; + knownAdapters["GACGCTGCCGACGAACTCTAGGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S516"; + knownAdapters["GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 | >I5_Primer_Nextera_XT_Index_Kit_v2_S503 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 | >I5_Primer_Nextera_XT_Index_Kit_v2_S503"; + knownAdapters["GACGCTGCCGACGAAGCTAGAAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S515"; + knownAdapters["GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 | >I5_Primer_Nextera_XT_Index_Kit_v2_S508 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 | >I5_Primer_Nextera_XT_Index_Kit_v2_S508"; + knownAdapters["GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 | >I5_Primer_Nextera_XT_Index_Kit_v2_S502 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 | >I5_Primer_Nextera_XT_Index_Kit_v2_S502"; + knownAdapters["GACGCTGCCGACGAATAGCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S520"; + knownAdapters["GACGCTGCCGACGAATTAGACGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S510"; + knownAdapters["GACGCTGCCGACGACGGAGAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S511"; + knownAdapters["GACGCTGCCGACGACTAGTCGAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S513"; + knownAdapters["GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 | 
>I5_Primer_Nextera_XT_Index_Kit_v2_S505 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 | >I5_Primer_Nextera_XT_Index_Kit_v2_S505"; + knownAdapters["GACGCTGCCGACGACTTAATAGGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S518"; + knownAdapters["GACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]501"; + knownAdapters["GACGCTGCCGACGATAAGGCTCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S521"; + knownAdapters["GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 | >I5_Primer_Nextera_XT_Index_Kit_v2_S507 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 | >I5_Primer_Nextera_XT_Index_Kit_v2_S507"; + knownAdapters["GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 | >I5_Primer_Nextera_XT_Index_Kit_v2_S506 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 | >I5_Primer_Nextera_XT_Index_Kit_v2_S506"; + knownAdapters["GACGCTGCCGACGATCGCATAAGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_Index_Kit_v2_S522"; + knownAdapters["GACGCTGCCGACGATCTACTCTGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]504"; + knownAdapters["GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 | >I5_Primer_Nextera_XT_Index_Kit_v2_S517 | >I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 | >I5_Primer_Nextera_XT_Index_Kit_v2_S517"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"] = ">Nextera_LMP_Read1_External_Adapter | >Illumina Multiplexing Index Sequencing Primer"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_5 | >TruSeq Adapter, Index 5"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_25"; + 
knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 25"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_8 | >TruSeq Adapter, Index 8"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_13"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 13"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_14"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 14"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_1_6 | >TruSeq Adapter, Index 1"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_15"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 15"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_27"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 27"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_7 | >TruSeq Adapter, Index 7"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 23"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_16"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 16"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_2 | >TruSeq Adapter, Index 2"; + 
knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_22"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 22"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_12 | >TruSeq Adapter, Index 12"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_23"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_9 | >TruSeq Adapter, Index 9"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_6 | >TruSeq Adapter, Index 6"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_11 | >TruSeq Adapter, Index 11"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_18_7"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 18"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_19"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 19"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_20"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 20"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_21"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq Adapter, Index 21"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_10 | >TruSeq Adapter, Index 10"; + 
knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_4 | >TruSeq Adapter, Index 4"; + knownAdapters["GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">TruSeq_Adapter_Index_3 | >TruSeq Adapter, Index 3"; + knownAdapters["GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"] = ">Illumina Paired End Adapter 2"; + knownAdapters["GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"] = ">Nextera_LMP_Read2_External_Adapter"; + knownAdapters["GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"] = ">Illumina Single End Adapter 1"; + knownAdapters["GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"] = ">Trans2"; + knownAdapters["GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT"] = ">PrefixPE/2 | >PE2 | >Illumina Multiplexing PCR Primer 2.01 | >Illumina Multiplexing Read2 Sequencing Primer | >PrefixPE/2 | >PE2"; + knownAdapters["TACACTCTTTCCCTACACGACGCTCTTCCGATCT"] = ">PrefixPE/1 | >PE1 | >PrefixPE/1 | >PE1"; + knownAdapters["TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT"] = ">RNA_PCR_Primer_(RP1)_part_#_15013198"; + knownAdapters["TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"] = ">Trans1"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_5_(RPI5)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_25_(RPI25)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_8_(RPI8)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_13_(RPI13)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_14_(RPI14)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_1_(RPI1)_2,9"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_26_(RPI26)"; + 
knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_15_(RPI15)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_27_(RPI27)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_28_(RPI28)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_29_(RPI29)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_30_(RPI30)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_31_(RPI31)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_32_(RPI32)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_7_(RPI7)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_33_(RPI33)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_34_(RPI34)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_35_(RPI35)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_36_(RPI36)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_16_(RPI16)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_2_(RPI2)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_37_(RPI37)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_22_(RPI22)"; + 
knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_38_(RPI38)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_39_(RPI39)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_40_(RPI40)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_12_(RPI12)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_41_(RPI41)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_23_(RPI23)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_9_(RPI9)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_6_(RPI6)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_11_(RPI11)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_24_(RPI24)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_17_(RPI17)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_18_(RPI18)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_19_(RPI19)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_20_(RPI20)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_21_(RPI21)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_42_(RPI42)"; + 
knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_43_(RPI43)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_10_(RPI10)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_44_(RPI44)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_45_(RPI45)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_46_(RPI46)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_47_(RPI47)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_48_(RPI48)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_4_(RPI4)"; + knownAdapters["TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG"] = ">RNA_PCR_Primer_Index_3_(RPI3)"; + knownAdapters["TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC"] = ">FlowCell1"; + knownAdapters["TTTTTTTTTTCAAGCAGAAGACGGCATACGA"] = ">FlowCell2"; return knownAdapters; } diff -Nru fastp-0.19.6+dfsg/src/main.cpp fastp-0.20.0+dfsg/src/main.cpp --- fastp-0.19.6+dfsg/src/main.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/main.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -35,10 +35,16 @@ cmd.add("out1", 'o', "read1 output file name", false, ""); cmd.add("in2", 'I', "read2 input file name", false, ""); cmd.add("out2", 'O', "read2 output file name", false, ""); + cmd.add("unpaired1", 0, "for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it.", false, ""); + cmd.add("unpaired2", 0, "for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. 
If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file.", false, ""); + cmd.add("failed_out", 0, "specify the file to store reads that cannot pass the filters.", false, ""); + cmd.add("merge", 'm', "for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default."); + cmd.add("merged_out", 0, "in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output", false, ""); + cmd.add("include_unmerged", 0, "in the merging mode, write the unmerged or unpaired reads to the file specified by --merged_out. Disabled by default."); cmd.add("phred64", '6', "indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)"); cmd.add("compression", 'z', "compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4.", false, 4); cmd.add("stdin", 0, "input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in."); - cmd.add("stdout", 0, "stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end input. Disabled by default."); + cmd.add("stdout", 0, "stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end output. Disabled by default."); cmd.add("interleaved_in", 0, "indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default."); cmd.add("reads_to_process", 0, "specify how many reads/pairs to be processed. Default 0 means process all reads.", false, 0); cmd.add("dont_overwrite", 0, "don't overwrite existing files. 
Overwritting is allowed by default."); @@ -48,6 +54,7 @@ cmd.add("disable_adapter_trimming", 'A', "adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled"); cmd.add("adapter_sequence", 'a', "the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped.", false, "auto"); cmd.add("adapter_sequence_r2", 0, "the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as ", false, "auto"); + cmd.add("adapter_fasta", 0, "specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file", false, ""); cmd.add("detect_adapter_for_pe", 0, "by default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data."); // trimming @@ -86,6 +93,7 @@ cmd.add("qualified_quality_phred", 'q', "the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified.", false, 15); cmd.add("unqualified_percent_limit", 'u', "how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%", false, 40); cmd.add("n_base_limit", 'n', "if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5", false, 5); + cmd.add("average_qual", 'e', "if one read's average quality score ("overlap_len_require", 0, "the minimum length of the overlapped region for overlap analysis based adapter trimming and correction. 30 by default.", false, 30); - cmd.add("overlap_diff_limit", 0, "the maximum difference of the overlapped region for overlap analysis based adapter trimming and correction. 5 by default.", false, 5); + cmd.add("overlap_len_require", 0, "the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 
30 by default.", false, 30); + cmd.add("overlap_diff_limit", 0, "the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default.", false, 5); + cmd.add("overlap_diff_percent_limit", 0, "the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%.", false, 20); // umi cmd.add("umi", 'U', "enable unique molecular identifier (UMI) preprocessing"); @@ -134,6 +143,7 @@ cmd.add("cut_by_quality5", 0, "DEPRECATED, use --cut_front instead."); cmd.add("cut_by_quality3", 0, "DEPRECATED, use --cut_tail instead."); cmd.add("cut_by_quality_aggressive", 0, "DEPRECATED, use --cut_right instead."); + cmd.add("discard_unmerged", 0, "DEPRECATED, no effect now, see the introduction for merging."); cmd.parse_check(argc, argv); @@ -142,6 +152,10 @@ return 0; } + if(cmd.exist("discard_unmerged")) { + cerr << "DEPRECATED: --discard_unmerged has no effect now, see the introduction for merging." 
<< endl; + } + Options opt; // I/O @@ -149,6 +163,12 @@ opt.in2 = cmd.get("in2"); opt.out1 = cmd.get("out1"); opt.out2 = cmd.get("out2"); + opt.unpaired1 = cmd.get("unpaired1"); + opt.unpaired2 = cmd.get("unpaired2"); + opt.failedOut = cmd.get("failed_out"); + // write to the same file + if(opt.unpaired2.empty()) + opt.unpaired2 = opt.unpaired1; opt.compression = cmd.get("compression"); opt.readsToProcess = cmd.get("reads_to_process"); opt.phred64 = cmd.exist("phred64"); @@ -158,14 +178,23 @@ opt.interleavedInput = cmd.exist("interleaved_in"); opt.verbose = cmd.exist("verbose"); + // merge PE + opt.merge.enabled = cmd.exist("merge"); + opt.merge.out = cmd.get("merged_out"); + opt.merge.includeUnmerged = cmd.exist("include_unmerged"); + // adapter cutting opt.adapter.enabled = !cmd.exist("disable_adapter_trimming"); opt.adapter.detectAdapterForPE = cmd.exist("detect_adapter_for_pe"); opt.adapter.sequence = cmd.get("adapter_sequence"); opt.adapter.sequenceR2 = cmd.get("adapter_sequence_r2"); + opt.adapter.fastaFile = cmd.get("adapter_fasta"); if(opt.adapter.sequenceR2=="auto" && !opt.adapter.detectAdapterForPE && opt.adapter.sequence != "auto") { opt.adapter.sequenceR2 = opt.adapter.sequence; } + if(!opt.adapter.fastaFile.empty()) { + opt.loadFastaAdapters(); + } // trimming opt.trim.front1 = cmd.get("trim_front1"); @@ -268,6 +297,7 @@ opt.qualfilter.enabled = !cmd.exist("disable_quality_filtering"); opt.qualfilter.qualifiedQual = num2qual(cmd.get("qualified_quality_phred")); opt.qualfilter.unqualifiedPercentLimit = cmd.get("unqualified_percent_limit"); + opt.qualfilter.avgQualReq = cmd.get("average_qual"); opt.qualfilter.nBaseLimit = cmd.get("n_base_limit"); // length filtering @@ -283,6 +313,7 @@ opt.correction.enabled = cmd.exist("correction"); opt.overlapRequire = cmd.get("overlap_len_require"); opt.overlapDiffLimit = cmd.get("overlap_diff_limit"); + opt.overlapDiffPercentLimit = cmd.get("overlap_diff_percent_limit"); // threading opt.thread = cmd.get("thread"); 
diff -Nru fastp-0.19.6+dfsg/src/options.cpp fastp-0.20.0+dfsg/src/options.cpp --- fastp-0.19.6+dfsg/src/options.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/options.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -3,6 +3,7 @@ #include #include #include +#include "fastareader.h" Options::Options(){ in1 = ""; @@ -21,6 +22,7 @@ insertSizeMax = 512; overlapRequire = 30; overlapDiffLimit = 5; + overlapDiffPercentLimit = 20; verbose = false; seqLen1 = 151; seqLen2 = 151; @@ -41,6 +43,39 @@ return false; } +bool Options::polyXTrimmingEnabled() { + return polyXTrim.enabled; +} + +void Options::loadFastaAdapters() { + if(adapter.fastaFile.empty()) { + adapter.hasFasta = false; + return; + } + + check_file_valid(adapter.fastaFile); + + FastaReader reader(adapter.fastaFile); + reader.readAll(); + + map contigs = reader.contigs(); + map::iterator iter; + for(iter = contigs.begin(); iter != contigs.end(); iter++) { + if(iter->second.length()>=6) { + adapter.seqsInFasta.push_back(iter->second); + } + else { + cerr << "skip too short adapter sequence in " << adapter.fastaFile << " (6bp required): " << iter->second << endl; + } + } + + if(adapter.seqsInFasta.size() > 0) { + adapter.hasFasta = true; + } else { + adapter.hasFasta = false; + } +} + bool Options::validate() { if(in1.empty()) { if(!in2.empty()) @@ -57,23 +92,73 @@ check_file_valid(in2); } - // if output to STDOUT, then... - if(outputToSTDOUT) { - cerr << "Streaming uncompressed output to STDOUT..." << endl; - if(!in1.empty() && !in2.empty()) - cerr << "Enable interleaved output mode for paired-end input." 
<< endl; - if(!out1.empty()) { - cerr << "Ignore argument --out1 = " << out1 << endl; + if(merge.enabled) { + if(split.enabled) { + error_exit("splitting mode cannot work with merging mode"); + } + if(in2.empty() && !interleavedInput) { + error_exit("read2 input should be specified by --in2 for merging mode"); + } + // enable correction if it's not enabled + if(!correction.enabled) + correction.enabled = true; + if(merge.out.empty() && !outputToSTDOUT && !out1.empty() && out2.empty()) { + cerr << "You specified --out1, but haven't specified --merged_out in merging mode. Using --out1 to store the merged reads to be compatible with fastp 0.19.8" << endl << endl; + merge.out = out1; out1 = ""; } - if(!out2.empty()) { - cerr << "Ignore argument --out2 = " << out2 << endl; - out2 = ""; + if(merge.includeUnmerged) { + if(!out1.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --out1 = " << out1 << endl; + out1 = ""; + } + if(!out2.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --out2 = " << out2 << endl; + out2 = ""; + } + if(!unpaired1.empty()) { + cerr << "You specified --include_unmerged in merging mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "You specified --include_unmerged in merging mode. 
Ignoring argument --unpaired2 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(merge.out.empty() && !outputToSTDOUT) { + error_exit("In merging mode, you should either specify --merged_out or enable --stdout"); + } + if(!merge.out.empty()) { + if(merge.out == out1) + error_exit("--merged_out and --out1 shouldn't have same file name"); + if(merge.out == out2) + error_exit("--merged_out and --out2 shouldn't have same file name"); + if(merge.out == unpaired1) + error_exit("--merged_out and --unpaired1 shouldn't have same file name"); + if(merge.out == unpaired2) + error_exit("--merged_out and --unpaired2 shouldn't have same file name"); + } + } else { + // not in merging mode + if(!merge.out.empty()) { + cerr << "You haven't enabled merging mode (-m/--merge), ignoring argument --merged_out = " << merge.out << endl; + merge.out = ""; } + } + + // if output to STDOUT, then... + if(outputToSTDOUT) { if(split.enabled) { - cerr << "Ignore split mode" << endl; - split.enabled = false; + error_exit("splitting mode cannot work with stdout mode"); } + cerr << "Streaming uncompressed "; + if(merge.enabled) + cerr << "merged"; + else if(isPaired()) + cerr << "interleaved"; + cerr << " reads to STDOUT..." << endl; + if(isPaired() && !merge.enabled) + cerr << "Enable interleaved output mode for paired-end input." << endl; cerr << endl; } @@ -86,7 +171,8 @@ error_exit("paired-end input, read1 output should be specified together with read2 output (--out2 needed) "); } if(out1.empty() && !out2.empty()) { - error_exit("paired-end input, read1 output should be specified (--out1 needed) together with read2 output "); + if(!merge.enabled) + error_exit("paired-end input, read1 output should be specified (--out1 needed) together with read2 output "); } } @@ -109,6 +195,59 @@ error_exit(out2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); } } + if(!isPaired()) { + if(!unpaired1.empty()) { + cerr << "Not paired-end mode. 
Ignoring argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "Not paired-end mode. Ignoring argument --unpaired2 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(split.enabled) { + if(!unpaired1.empty()) { + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignoring argument --unpaired1 = " << unpaired1 << endl; + unpaired1 = ""; + } + if(!unpaired2.empty()) { + cerr << "Outputing unpaired reads is not supported in splitting mode. Ignoring argument --unpaired2 = " << unpaired2 << endl; + unpaired2 = ""; + } + } + if(!unpaired1.empty()) { + if(dontOverwrite && file_exists(unpaired1)) { + error_exit(unpaired1 + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + if(unpaired1 == out1) + error_exit("--unpaired1 and --out1 shouldn't have same file name"); + if(unpaired1 == out2) + error_exit("--unpaired1 and --out2 shouldn't have same file name"); + } + if(!unpaired2.empty()) { + if(dontOverwrite && file_exists(unpaired2)) { + error_exit(unpaired2 + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + if(unpaired2 == out1) + error_exit("--unpaired2 and --out1 shouldn't have same file name"); + if(unpaired2 == out2) + error_exit("--unpaired2 and --out2 shouldn't have same file name"); + } + if(!failedOut.empty()) { + if(dontOverwrite && file_exists(failedOut)) { + error_exit(failedOut + " already exists and you have set to not rewrite output files by --dont_overwrite"); + } + if(failedOut == out1) + error_exit("--failed_out and --out1 shouldn't have same file name"); + if(failedOut == out2) + error_exit("--failed_out and --out2 shouldn't have same file name"); + if(failedOut == unpaired1) + error_exit("--failed_out and --unpaired1 shouldn't have same file name"); + if(failedOut == unpaired2) + error_exit("--failed_out and --unpaired2 shouldn't have same file name"); + if(failedOut == merge.out) + 
error_exit("--failed_out and --merged_out shouldn't have same file name"); + } if(dontOverwrite) { if(file_exists(jsonFile)) { @@ -147,6 +286,9 @@ if(qualfilter.qualifiedQual - 33 < 0 || qualfilter.qualifiedQual - 33 > 93) error_exit("qualitified phred (--qualified_quality_phred) should be 0 ~ 93, suggest 10 ~ 20"); + if(qualfilter.avgQualReq < 0 || qualfilter.avgQualReq > 93) + error_exit("average quality score requirement (--average_qual) should be 0 ~ 93, suggest 20 ~ 30"); + if(qualfilter.unqualifiedPercentLimit < 0 || qualfilter.unqualifiedPercentLimit > 100) error_exit("unqualified percent limit (--unqualified_percent_limit) should be 0 ~ 100, suggest 20 ~ 60"); @@ -156,6 +298,9 @@ if(lengthFilter.requiredLength < 0 ) error_exit("length requirement (--length_required) should be >0, suggest 15 ~ 100"); + if(overlapDiffPercentLimit < 0 || overlapDiffPercentLimit > 100) + error_exit("the maximum percentage of mismatched bases to detect overlapped region (--overlap_diff_percent_limit) should be 0 ~ 100, suggest 20 ~ 60"); + if(split.enabled ) { if(split.digits < 0 || split.digits > 10) error_exit("you have enabled splitting output to multiple files, the digits number of file name prefix (--split_prefix_digits) should be 0 ~ 10."); @@ -226,7 +371,7 @@ } if(correction.enabled && !isPaired()) { - cerr << "WARNING: base correction is only appliable for paired end data, ignored -c/--correction" << endl; + cerr << "WARNING: base correction is only appliable for paired end data, ignoring -c/--correction" << endl; correction.enabled = false; } diff -Nru fastp-0.19.6+dfsg/src/options.h fastp-0.20.0+dfsg/src/options.h --- fastp-0.19.6+dfsg/src/options.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/options.h 2019-04-17 03:23:22.000000000 +0000 @@ -17,6 +17,18 @@ #define UMI_LOC_PER_INDEX 5 #define UMI_LOC_PER_READ 6 +class MergeOptions { +public: + MergeOptions() { + enabled = false; + includeUnmerged = false; + } +public: + bool enabled; + bool 
includeUnmerged; + string out; +}; + class DuplicationOptions { public: DuplicationOptions() { @@ -192,8 +204,11 @@ string sequenceR2; string detectedAdapter1; string detectedAdapter2; + vector seqsInFasta; + string fastaFile; bool hasSeqR1; bool hasSeqR2; + bool hasFasta; bool detectAdapterForPE; }; @@ -240,6 +255,8 @@ int unqualifiedPercentLimit; // if n_base_number > nBaseLimit, then discard this read int nBaseLimit; + // if average qual score < avgQualReq, then discard this read + int avgQualReq; }; class ReadLengthFilteringOptions { @@ -265,11 +282,13 @@ bool isPaired(); bool validate(); bool adapterCuttingEnabled(); + bool polyXTrimmingEnabled(); string getAdapter1(); string getAdapter2(); void initIndexFiltering(string blacklistFile1, string blacklistFile2, int threshold = 0); vector makeListFromFileByLine(string filename); bool shallDetectAdapter(bool isR2 = false); + void loadFastaAdapters(); public: // file name of read1 input @@ -278,8 +297,14 @@ string in2; // file name of read1 output string out1; - // file name of read1 output + // file name of read2 output string out2; + // file name of unpaired read1 output + string unpaired1; + // file name of unpaired read2 output + string unpaired2; + // file name of failed reads output + string failedOut; // json file string jsonFile; // html file @@ -339,8 +364,11 @@ // overlap analysis threshold int overlapRequire; int overlapDiffLimit; + int overlapDiffPercentLimit; // output debug information bool verbose; + // merge options + MergeOptions merge; }; diff -Nru fastp-0.19.6+dfsg/src/overlapanalysis.cpp fastp-0.20.0+dfsg/src/overlapanalysis.cpp --- fastp-0.19.6+dfsg/src/overlapanalysis.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/overlapanalysis.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -7,12 +7,12 @@ OverlapAnalysis::~OverlapAnalysis(){ } -OverlapResult OverlapAnalysis::analyze(Read* r1, Read* r2, int overlapDiffLimit, int overlapRequire) { - return analyze(r1->mSeq, r2->mSeq, 
overlapDiffLimit, overlapRequire); +OverlapResult OverlapAnalysis::analyze(Read* r1, Read* r2, int overlapDiffLimit, int overlapRequire, double diffPercentLimit) { + return analyze(r1->mSeq, r2->mSeq, overlapDiffLimit, overlapRequire, diffPercentLimit); } // ported from the python code of AfterQC -OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int overlapDiffLimit, int overlapRequire) { +OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit) { Sequence rcr2 = ~r2; int len1 = r1.length(); int len2 = rcr2.length(); @@ -31,18 +31,19 @@ while (offset < len1-overlapRequire) { // the overlap length of r1 & r2 when r2 is move right for offset overlap_len = min(len1 - offset, len2); + int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); diff = 0; int i = 0; for (i=0; i= overlapDiffLimit && i < complete_compare_require) + if (diff > overlapDiffLimit && i < complete_compare_require) break; } } - if (diff < overlapDiffLimit || (diff >= overlapDiffLimit && i>complete_compare_require)){ + if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ OverlapResult ov; ov.overlapped = true; ov.offset = offset; @@ -64,18 +65,19 @@ while (offset > -(len2-overlapRequire)){ // the overlap length of r1 & r2 when r2 is move right for offset overlap_len = min(len1, len2- abs(offset)); + int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); diff = 0; int i = 0; for (i=0; i= overlapDiffLimit && i < complete_compare_require) + if (diff > overlapDiffLimit && i < complete_compare_require) break; } } - if (diff < overlapDiffLimit || (diff >= overlapDiffLimit && i>complete_compare_require)){ + if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ OverlapResult ov; ov.overlapped = true; ov.offset = offset; @@ -93,14 +95,51 @@ return ov; } +Read* OverlapAnalysis::merge(Read* r1, Read* r2, 
OverlapResult ov) { + int ol = ov.overlap_len; + if(!ov.overlapped) + return NULL; + + int len1 = ol + max(0, ov.offset); + int len2 = 0; + if(ov.offset > 0) + len2 = r2->length() - ol; + + Read* rr2 = r2->reverseComplement(); + string mergedSeq = r1->mSeq.mStr.substr(0, len1); + if(ov.offset > 0) { + mergedSeq += rr2->mSeq.mStr.substr(ol, len2); + } + + string mergedQual = r1->mQuality.substr(0, len1); + if(ov.offset > 0) { + mergedQual += rr2->mQuality.substr(ol, len2); + } + + delete rr2; + + string name = r1->mName + " merged_" + to_string(len1) + "_" + to_string(len2); + Read* mergedRead = new Read(name, mergedSeq, r1->mStrand, mergedQual); + + return mergedRead; +} + bool OverlapAnalysis::test(){ //Sequence r1("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGCCGCTGGAGGTCTCCC"); //Sequence r2("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGCCCGTAGGCGCGGCTCCC"); Sequence r1("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGC"); Sequence r2("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGTCC"); + string qual1("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); + string qual2("#########################################################################################"); - OverlapResult ov = OverlapAnalysis::analyze(r1, r2); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 2, 30, 0.2); + + Read read1("name1", r1, "+", qual1); + Read read2("name2", r2, "+", qual2); + + Read* mergedRead = OverlapAnalysis::merge(&read1, &read2, ov); + mergedRead->print(); return ov.overlapped && ov.offset == 10 && ov.overlap_len == 79 && ov.diff == 1; } \ No newline at end of file diff -Nru fastp-0.19.6+dfsg/src/overlapanalysis.h fastp-0.20.0+dfsg/src/overlapanalysis.h --- fastp-0.19.6+dfsg/src/overlapanalysis.h 2019-01-02 02:56:02.000000000 +0000 +++ 
fastp-0.20.0+dfsg/src/overlapanalysis.h 2019-04-17 03:23:22.000000000 +0000 @@ -25,8 +25,9 @@ OverlapAnalysis(); ~OverlapAnalysis(); - static OverlapResult analyze(Sequence& r1, Sequence& r2, int overlapDiffLimit = 5, int overlapRequire=30); - static OverlapResult analyze(Read* r1, Read* r2, int overlapDiffLimit = 5, int overlapRequire=30); + static OverlapResult analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit); + static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit); + static Read* merge(Read* r1, Read* r2, OverlapResult ov); public: static bool test(); diff -Nru fastp-0.19.6+dfsg/src/peprocessor.cpp fastp-0.20.0+dfsg/src/peprocessor.cpp --- fastp-0.19.6+dfsg/src/peprocessor.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/peprocessor.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -28,6 +28,10 @@ memset(mInsertSizeHist, 0, sizeof(long)*isizeBufLen); mLeftWriter = NULL; mRightWriter = NULL; + mUnpairedLeftWriter = NULL; + mUnpairedRightWriter = NULL; + mMergedWriter = NULL; + mFailedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -44,11 +48,26 @@ } void PairEndProcessor::initOutput() { - if(mOptions->out1.empty() || mOptions->out2.empty()) + if(!mOptions->unpaired1.empty()) + mUnpairedLeftWriter = new WriterThread(mOptions, mOptions->unpaired1); + + if(!mOptions->unpaired2.empty() && mOptions->unpaired2 != mOptions->unpaired1) + mUnpairedRightWriter = new WriterThread(mOptions, mOptions->unpaired2); + + if(mOptions->merge.enabled) { + if(!mOptions->merge.out.empty()) + mMergedWriter = new WriterThread(mOptions, mOptions->merge.out); + } + + if(!mOptions->failedOut.empty()) + mFailedWriter = new WriterThread(mOptions, mOptions->failedOut); + + if(mOptions->out1.empty()) return; mLeftWriter = new WriterThread(mOptions, mOptions->out1); - mRightWriter = new WriterThread(mOptions, mOptions->out2); + if(!mOptions->out2.empty()) + 
mRightWriter = new WriterThread(mOptions, mOptions->out2); } void PairEndProcessor::closeOutput() { @@ -60,6 +79,22 @@ delete mRightWriter; mRightWriter = NULL; } + if(mMergedWriter) { + delete mMergedWriter; + mMergedWriter = NULL; + } + if(mFailedWriter) { + delete mFailedWriter; + mFailedWriter = NULL; + } + if(mUnpairedLeftWriter) { + delete mUnpairedLeftWriter; + mUnpairedLeftWriter = NULL; /* null the pointer that was just deleted so the guard above cannot pass again */ + } + if(mUnpairedRightWriter) { + delete mUnpairedRightWriter; + mUnpairedRightWriter = NULL; /* same: reset the deleted writer, not mRightWriter */ + } } void PairEndProcessor::initConfig(ThreadConfig* config) { @@ -93,10 +128,22 @@ std::thread* leftWriterThread = NULL; std::thread* rightWriterThread = NULL; + std::thread* unpairedLeftWriterThread = NULL; + std::thread* unpairedRightWriterThread = NULL; + std::thread* mergedWriterThread = NULL; + std::thread* failedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mLeftWriter)); if(mRightWriter) rightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mRightWriter)); + if(mUnpairedLeftWriter) + unpairedLeftWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedLeftWriter)); + if(mUnpairedRightWriter) + unpairedRightWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mUnpairedRightWriter)); + if(mMergedWriter) + mergedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mMergedWriter)); + if(mFailedWriter) + failedWriterThread = new std::thread(std::bind(&PairEndProcessor::writeTask, this, mFailedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -108,6 +155,14 @@ leftWriterThread->join(); if(rightWriterThread) rightWriterThread->join(); + if(unpairedLeftWriterThread) + unpairedLeftWriterThread->join(); + if(unpairedRightWriterThread) + unpairedRightWriterThread->join(); + if(mergedWriterThread) + mergedWriterThread->join(); + if(failedWriterThread) + failedWriterThread->join(); } if(mOptions->verbose) @@ -135,14 
+190,19 @@ cerr << "Read1 before filtering:"<print(); cerr << endl; - cerr << "Read1 after filtering:"<print(); - cerr << endl; cerr << "Read2 before filtering:"<print(); cerr << endl; - cerr << "Read2 aftering filtering:"<print(); + if(!mOptions->merge.enabled) { + cerr << "Read1 after filtering:"<print(); + cerr << endl; + cerr << "Read2 aftering filtering:"<print(); + } else { + cerr << "Merged and filtered:"<print(); + } cerr << endl; cerr << "Filtering result:"<merge.enabled) { + cerr << endl; + cerr << "Read pairs merged: " << finalFilterResult->mMergedPairs << endl; + if(finalPostStats1->getReads() > 0) { + double postMergedPercent = 100.0 * finalFilterResult->mMergedPairs / finalPostStats1->getReads(); + double preMergedPercent = 100.0 * finalFilterResult->mMergedPairs / finalPreStats1->getReads(); + cerr << "% of original read pairs: " << preMergedPercent << "%" << endl; + cerr << "% in reads after filtering: " << postMergedPercent << "%" << endl; + } + cerr << endl; + } + // make JSON report JsonReporter jr(mOptions); jr.setDupHist(dupHist, dupMeanGC, dupRate); @@ -205,6 +277,14 @@ delete leftWriterThread; if(rightWriterThread) delete rightWriterThread; + if(unpairedLeftWriterThread) + delete unpairedLeftWriterThread; + if(unpairedRightWriterThread) + delete unpairedRightWriterThread; + if(mergedWriterThread) + delete mergedWriterThread; + if(failedWriterThread) + delete failedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -227,8 +307,13 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){ string outstr1; string outstr2; - string interleaved; + string unpairedOut1; + string unpairedOut2; + string singleOutput; + string mergedOutput; + string failedOut; int readPassed = 0; + int mergedCount = 0; for(int p=0;pcount;p++){ ReadPair* pair = pack->data[p]; Read* or1 = pair->mLeft; @@ -258,69 +343,144 @@ mUmiProcessor->process(or1, or2); // trim in head and tail, and apply quality cut in sliding window - Read* r1 = 
mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1); - Read* r2 = mFilter->trimAndCut(or2, mOptions->trim.front2, mOptions->trim.tail2); + int frontTrimmed1 = 0; + int frontTrimmed2 = 0; + Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1, frontTrimmed1); + Read* r2 = mFilter->trimAndCut(or2, mOptions->trim.front2, mOptions->trim.tail2, frontTrimmed2); if(r1 != NULL && r2!=NULL) { if(mOptions->polyGTrim.enabled) PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen); - if(mOptions->polyXTrim.enabled) - PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen); } bool isizeEvaluated = false; if(r1 != NULL && r2!=NULL && (mOptions->adapter.enabled || mOptions->correction.enabled)){ - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); // we only use thread 0 to evaluae ISIZE if(config->getThreadId() == 0) { - statInsertSize(r1, r2, ov); + statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); isizeEvaluated = true; } if(mOptions->correction.enabled) { BaseCorrector::correctByOverlapAnalysis(r1, r2, config->getFilterResult(), ov); } if(mOptions->adapter.enabled) { - bool trimmed = AdapterTrimmer::trimByOverlapAnalysis(r1, r2, config->getFilterResult(), ov); + bool trimmed = AdapterTrimmer::trimByOverlapAnalysis(r1, r2, config->getFilterResult(), ov, frontTrimmed1, frontTrimmed2); + bool trimmed1 = trimmed; + bool trimmed2 = trimmed; if(!trimmed){ if(mOptions->adapter.hasSeqR1) - AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); + trimmed1 = AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); if(mOptions->adapter.hasSeqR2) - AdapterTrimmer::trimBySequence(r2, 
config->getFilterResult(), mOptions->adapter.sequenceR2, true); + trimmed2 = AdapterTrimmer::trimBySequence(r2, config->getFilterResult(), mOptions->adapter.sequenceR2, true); + } + if(mOptions->adapter.hasFasta) { + AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, !trimmed1); + AdapterTrimmer::trimByMultiSequences(r2, config->getFilterResult(), mOptions->adapter.seqsInFasta, true, !trimmed2); } } } if(config->getThreadId() == 0 && !isizeEvaluated && r1 != NULL && r2!=NULL) { - OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire); - statInsertSize(r1, r2, ov); + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); + statInsertSize(r1, r2, ov, frontTrimmed1, frontTrimmed2); isizeEvaluated = true; } if(r1 != NULL && r2!=NULL) { + if(mOptions->polyXTrim.enabled) + PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen); + } + + if(r1 != NULL && r2!=NULL) { if( mOptions->trim.maxLen1 > 0 && mOptions->trim.maxLen1 < r1->length()) r1->resize(mOptions->trim.maxLen1); if( mOptions->trim.maxLen2 > 0 && mOptions->trim.maxLen2 < r2->length()) r2->resize(mOptions->trim.maxLen2); } - int result1 = mFilter->passFilter(r1); - int result2 = mFilter->passFilter(r2); - - config->addFilterResult(max(result1, result2)); + Read* merged = NULL; + // merging mode + bool mergeProcessed = false; + if(mOptions->merge.enabled && r1 && r2) { + OverlapResult ov = OverlapAnalysis::analyze(r1, r2, mOptions->overlapDiffLimit, mOptions->overlapRequire, mOptions->overlapDiffPercentLimit/100.0); + if(ov.overlapped) { + merged = OverlapAnalysis::merge(r1, r2, ov); + int result = mFilter->passFilter(merged); + config->addFilterResult(result, 2); + if(result == PASS_FILTER) { + mergedOutput += merged->toString(); + config->getPostStats1()->statRead(merged); + readPassed++; + 
mergedCount++; + } + delete merged; + mergeProcessed = true; + } else if(mOptions->merge.includeUnmerged){ + int result1 = mFilter->passFilter(r1); + config->addFilterResult(result1, 1); + if(result1 == PASS_FILTER) { + mergedOutput += r1->toString(); + config->getPostStats1()->statRead(r1); + } - if( r1 != NULL && result1 == PASS_FILTER && r2 != NULL && result2 == PASS_FILTER ) { - - if(mOptions->outputToSTDOUT) { - interleaved += r1->toString() + r2->toString(); - } else { - outstr1 += r1->toString(); - outstr2 += r2->toString(); + int result2 = mFilter->passFilter(r2); + config->addFilterResult(result2, 1); + if(result2 == PASS_FILTER) { + mergedOutput += r2->toString(); + config->getPostStats1()->statRead(r2); + } + if(result1 == PASS_FILTER && result2 == PASS_FILTER ) + readPassed++; + mergeProcessed = true; } + } + + if(!mergeProcessed) { + + int result1 = mFilter->passFilter(r1); + int result2 = mFilter->passFilter(r2); + + config->addFilterResult(max(result1, result2), 2); + + if( r1 != NULL && result1 == PASS_FILTER && r2 != NULL && result2 == PASS_FILTER ) { + + if(mOptions->outputToSTDOUT && !mOptions->merge.enabled) { + singleOutput += r1->toString() + r2->toString(); + } else { + outstr1 += r1->toString(); + outstr2 += r2->toString(); + } - // stats the read after filtering - config->getPostStats1()->statRead(r1); - config->getPostStats2()->statRead(r2); + // stats the read after filtering + if(!mOptions->merge.enabled) { + config->getPostStats1()->statRead(r1); + config->getPostStats2()->statRead(r2); + } - readPassed++; + readPassed++; + } else if( r1 != NULL && result1 == PASS_FILTER) { + if(mUnpairedLeftWriter) { + unpairedOut1 += r1->toString(); + if(mFailedWriter) + failedOut += or2->toStringWithTag(FAILED_TYPES[result2]); + } else { + if(mFailedWriter) { + failedOut += or1->toStringWithTag("paired_read_is_failing"); + failedOut += or2->toStringWithTag(FAILED_TYPES[result2]); + } + } + } else if( r2 != NULL && result2 == PASS_FILTER) { + 
if(mUnpairedLeftWriter || mUnpairedRightWriter) { + unpairedOut2 += r2->toString(); + if(mFailedWriter) + failedOut += or1->toStringWithTag(FAILED_TYPES[result1]); + } else { + if(mFailedWriter) { + failedOut += or1->toStringWithTag(FAILED_TYPES[result1]); + failedOut += or2->toStringWithTag("paired_read_is_failing"); + } + } + } } delete pair; @@ -332,35 +492,73 @@ delete r2; } // if splitting output, then no lock is need since different threads write different files - if(!mOptions->split.enabled) + if(!mOptions->split.enabled) mOutputMtx.lock(); if(mOptions->outputToSTDOUT) { // STDOUT output - fwrite(interleaved.c_str(), 1, interleaved.length(), stdout); + // if it's merging mode, write the merged reads to STDOUT + // otherwise write interleaved single output + if(mOptions->merge.enabled) + fwrite(mergedOutput.c_str(), 1, mergedOutput.length(), stdout); + else + fwrite(singleOutput.c_str(), 1, singleOutput.length(), stdout); } else if(mOptions->split.enabled) { // split output by each worker thread if(!mOptions->out1.empty()) config->getWriter1()->writeString(outstr1); if(!mOptions->out2.empty()) config->getWriter2()->writeString(outstr2); - } else { - // normal output by left/right writer thread - if(mRightWriter && mLeftWriter) { + } + + if(mMergedWriter && !mergedOutput.empty()) { + // write merged data + char* mdata = new char[mergedOutput.size()]; + memcpy(mdata, mergedOutput.c_str(), mergedOutput.size()); + mMergedWriter->input(mdata, mergedOutput.size()); + } + + if(mFailedWriter && !failedOut.empty()) { + // write failed data + char* fdata = new char[failedOut.size()]; + memcpy(fdata, failedOut.c_str(), failedOut.size()); + mFailedWriter->input(fdata, failedOut.size()); + } + + // normal output by left/right writer thread + if(mRightWriter && mLeftWriter && (!outstr1.empty() || !outstr2.empty())) { + // write PE + char* ldata = new char[outstr1.size()]; + memcpy(ldata, outstr1.c_str(), outstr1.size()); + mLeftWriter->input(ldata, outstr1.size()); + + 
char* rdata = new char[outstr2.size()]; + memcpy(rdata, outstr2.c_str(), outstr2.size()); + mRightWriter->input(rdata, outstr2.size()); + } else if(mLeftWriter && !singleOutput.empty()) { + // write singleOutput + char* ldata = new char[singleOutput.size()]; + memcpy(ldata, singleOutput.c_str(), singleOutput.size()); + mLeftWriter->input(ldata, singleOutput.size()); + } + // output unpaired reads + if (!unpairedOut1.empty() || !unpairedOut2.empty()) { + if(mUnpairedLeftWriter && mUnpairedRightWriter) { // write PE - char* ldata = new char[outstr1.size()]; - memcpy(ldata, outstr1.c_str(), outstr1.size()); - mLeftWriter->input(ldata, outstr1.size()); - - char* rdata = new char[outstr2.size()]; - memcpy(rdata, outstr2.c_str(), outstr2.size()); - mRightWriter->input(rdata, outstr2.size()); - } else if(mLeftWriter) { - // write interleaved - char* ldata = new char[interleaved.size()]; - memcpy(ldata, interleaved.c_str(), interleaved.size()); - mLeftWriter->input(ldata, interleaved.size()); + char* unpairedData1 = new char[unpairedOut1.size()]; + memcpy(unpairedData1, unpairedOut1.c_str(), unpairedOut1.size()); + mUnpairedLeftWriter->input(unpairedData1, unpairedOut1.size()); + + char* unpairedData2 = new char[unpairedOut2.size()]; + memcpy(unpairedData2, unpairedOut2.c_str(), unpairedOut2.size()); + mUnpairedRightWriter->input(unpairedData2, unpairedOut2.size()); + } else if(mUnpairedLeftWriter) { + char* unpairedData = new char[unpairedOut1.size() + unpairedOut2.size() ]; + memcpy(unpairedData, unpairedOut1.c_str(), unpairedOut1.size()); + memcpy(unpairedData + unpairedOut1.size(), unpairedOut2.c_str(), unpairedOut2.size()); + mUnpairedLeftWriter->input(unpairedData, unpairedOut1.size() + unpairedOut2.size()); } } + if(!mOptions->split.enabled) mOutputMtx.unlock(); @@ -369,19 +567,23 @@ else config->markProcessed(pack->count); + if(mOptions->merge.enabled) { + config->addMergedPairs(mergedCount); + } + delete pack->data; delete pack; return true; } -void 
PairEndProcessor::statInsertSize(Read* r1, Read* r2, OverlapResult& ov) { +void PairEndProcessor::statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1, int frontTrimmed2) { int isize = mOptions->insertSizeMax; if(ov.overlapped) { if(ov.offset > 0) - isize = r1->length() + r2->length() - ov.overlap_len; + isize = r1->length() + r2->length() - ov.overlap_len + frontTrimmed1 + frontTrimmed2; else - isize = ov.overlap_len; + isize = ov.overlap_len + frontTrimmed1 + frontTrimmed2; } if(isize > mOptions->insertSizeMax) @@ -593,6 +795,14 @@ mLeftWriter->setInputCompleted(); if(mRightWriter) mRightWriter->setInputCompleted(); + if(mUnpairedLeftWriter) + mUnpairedLeftWriter->setInputCompleted(); + if(mUnpairedRightWriter) + mUnpairedRightWriter->setInputCompleted(); + if(mMergedWriter) + mMergedWriter->setInputCompleted(); + if(mFailedWriter) + mFailedWriter->setInputCompleted(); } if(mOptions->verbose) { diff -Nru fastp-0.19.6+dfsg/src/peprocessor.h fastp-0.20.0+dfsg/src/peprocessor.h --- fastp-0.19.6+dfsg/src/peprocessor.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/peprocessor.h 2019-04-17 03:23:22.000000000 +0000 @@ -57,7 +57,7 @@ void initConfig(ThreadConfig* config); void initOutput(); void closeOutput(); - void statInsertSize(Read* r1, Read* r2, OverlapResult& ov); + void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); int getPeakInsertSize(); void writeTask(WriterThread* config); @@ -77,6 +77,10 @@ long* mInsertSizeHist; WriterThread* mLeftWriter; WriterThread* mRightWriter; + WriterThread* mUnpairedLeftWriter; + WriterThread* mUnpairedRightWriter; + WriterThread* mMergedWriter; + WriterThread* mFailedWriter; Duplicate* mDuplicate; }; diff -Nru fastp-0.19.6+dfsg/src/polyx.cpp fastp-0.20.0+dfsg/src/polyx.cpp --- fastp-0.19.6+dfsg/src/polyx.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/polyx.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -1,4 +1,5 @@ #include 
"polyx.h" +#include "common.h" PolyX::PolyX(){ } @@ -53,8 +54,8 @@ int rlen = r->length(); + int atcgNumbers[4] = {0, 0, 0, 0}; - char atcgBases[4] = {'A', 'T', 'C', 'G'}; int pos = 0; for(pos=0; pos=0) pos--; r->resize(rlen - pos - 1); + if(fr) + fr->addPolyXTrimmed(poly, pos + 1); } } bool PolyX::test() { + Read r("@name", "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT", "+", "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); - PolyX::trimPolyX(&r, NULL, 10); + + FilterResult fr(NULL, false); + PolyX::trimPolyX(&r, &fr, 10); r.print(); - return r.mSeq.mStr == "ATTTT"; + + return r.mSeq.mStr == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51; } \ No newline at end of file diff -Nru fastp-0.19.6+dfsg/src/read.cpp fastp-0.20.0+dfsg/src/read.cpp --- fastp-0.19.6+dfsg/src/read.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/read.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -131,6 +131,10 @@ return mName + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n"; } +string Read::toStringWithTag(string tag) { + return mName + " " + tag + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n"; +} + bool Read::test(){ Read r("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA", "CTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTCCTTAGGAGGACATTTTTTACATGAAATTATTAACCTAAATAGAGTTGATC", diff -Nru fastp-0.19.6+dfsg/src/read.h fastp-0.20.0+dfsg/src/read.h --- fastp-0.19.6+dfsg/src/read.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/read.h 2019-04-17 03:23:22.000000000 +0000 @@ -27,6 +27,7 @@ int lowQualCount(int qual=20); int length(); string toString(); + string toStringWithTag(string tag); void resize(int len); void convertPhred64To33(); void trimFront(int len); diff -Nru fastp-0.19.6+dfsg/src/seprocessor.cpp fastp-0.20.0+dfsg/src/seprocessor.cpp --- fastp-0.19.6+dfsg/src/seprocessor.cpp 2019-01-02 02:56:02.000000000 
+0000 +++ fastp-0.20.0+dfsg/src/seprocessor.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -20,6 +20,7 @@ mZipFile = NULL; mUmiProcessor = new UmiProcessor(opt); mLeftWriter = NULL; + mFailedWriter = NULL; mDuplicate = NULL; if(mOptions->duplicate.enabled) { @@ -36,6 +37,8 @@ } void SingleEndProcessor::initOutput() { + if(!mOptions->failedOut.empty()) + mFailedWriter = new WriterThread(mOptions, mOptions->failedOut); if(mOptions->out1.empty()) return; mLeftWriter = new WriterThread(mOptions, mOptions->out1); @@ -46,6 +49,10 @@ delete mLeftWriter; mLeftWriter = NULL; } + if(mFailedWriter) { + delete mFailedWriter; + mFailedWriter = NULL; + } } void SingleEndProcessor::initConfig(ThreadConfig* config) { @@ -78,8 +85,11 @@ } std::thread* leftWriterThread = NULL; + std::thread* failedWriterThread = NULL; if(mLeftWriter) leftWriterThread = new std::thread(std::bind(&SingleEndProcessor::writeTask, this, mLeftWriter)); + if(mFailedWriter) + failedWriterThread = new std::thread(std::bind(&SingleEndProcessor::writeTask, this, mFailedWriter)); producer.join(); for(int t=0; tthread; t++){ @@ -89,6 +99,8 @@ if(!mOptions->split.enabled) { if(leftWriterThread) leftWriterThread->join(); + if(failedWriterThread) + failedWriterThread->join(); } if(mOptions->verbose) @@ -169,6 +181,8 @@ if(leftWriterThread) delete leftWriterThread; + if(failedWriterThread) + delete failedWriterThread; if(!mOptions->split.enabled) closeOutput(); @@ -178,6 +192,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){ string outstr; + string failedOut; int readPassed = 0; for(int p=0;pcount;p++){ @@ -201,18 +216,28 @@ if(mOptions->umi.enabled) mUmiProcessor->process(or1); + int frontTrimmed = 0; // trim in head and tail, and apply quality cut in sliding window - Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1); + Read* r1 = mFilter->trimAndCut(or1, mOptions->trim.front1, mOptions->trim.tail1, frontTrimmed); if(r1 != NULL) { 
if(mOptions->polyGTrim.enabled) PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen); - if(mOptions->polyXTrim.enabled) - PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen); } - if(r1 != NULL && mOptions->adapter.enabled && mOptions->adapter.hasSeqR1){ - AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence); + if(r1 != NULL && mOptions->adapter.enabled){ + bool trimmed = false; + if(mOptions->adapter.hasSeqR1) + trimmed = AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, false); + bool incTrimmedCounter = !trimmed; + if(mOptions->adapter.hasFasta) { + AdapterTrimmer::trimByMultiSequences(r1, config->getFilterResult(), mOptions->adapter.seqsInFasta, false, incTrimmedCounter); + } + } + + if(r1 != NULL) { + if(mOptions->polyXTrim.enabled) + PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen); } if(r1 != NULL) { @@ -222,7 +247,7 @@ int result = mFilter->passFilter(r1); - config->addFilterResult(result); + config->addFilterResult(result, 1); if( r1 != NULL && result == PASS_FILTER) { outstr += r1->toString(); @@ -230,6 +255,8 @@ // stats the read after filtering config->getPostStats1()->statRead(r1); readPassed++; + } else if(mFailedWriter) { + failedOut += or1->toStringWithTag(FAILED_TYPES[result]); } delete or1; @@ -247,12 +274,17 @@ if(!mOptions->out1.empty()) config->getWriter1()->writeString(outstr); } - else { - if(mLeftWriter) { - char* ldata = new char[outstr.size()]; - memcpy(ldata, outstr.c_str(), outstr.size()); - mLeftWriter->input(ldata, outstr.size()); - } + + if(mLeftWriter) { + char* ldata = new char[outstr.size()]; + memcpy(ldata, outstr.c_str(), outstr.size()); + mLeftWriter->input(ldata, outstr.size()); + } + if(mFailedWriter && !failedOut.empty()) { + // write failed data + char* fdata = new char[failedOut.size()]; + memcpy(fdata, failedOut.c_str(), failedOut.size()); + mFailedWriter->input(fdata, 
failedOut.size()); } if(!mOptions->split.enabled) mOutputMtx.unlock(); @@ -465,6 +497,8 @@ if(mFinishedThreads == mOptions->thread) { if(mLeftWriter) mLeftWriter->setInputCompleted(); + if(mFailedWriter) + mFailedWriter->setInputCompleted(); } if(mOptions->verbose) { diff -Nru fastp-0.19.6+dfsg/src/seprocessor.h fastp-0.20.0+dfsg/src/seprocessor.h --- fastp-0.19.6+dfsg/src/seprocessor.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/seprocessor.h 2019-04-17 03:23:22.000000000 +0000 @@ -68,6 +68,7 @@ ofstream* mOutStream; UmiProcessor* mUmiProcessor; WriterThread* mLeftWriter; + WriterThread* mFailedWriter; Duplicate* mDuplicate; }; diff -Nru fastp-0.19.6+dfsg/src/threadconfig.cpp fastp-0.20.0+dfsg/src/threadconfig.cpp --- fastp-0.19.6+dfsg/src/threadconfig.cpp 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/threadconfig.cpp 2019-04-17 03:23:22.000000000 +0000 @@ -77,8 +77,12 @@ mWriter2 = new Writer(gzfile2); } -void ThreadConfig::addFilterResult(int result) { - mFilterResult->addFilterResult(result); +void ThreadConfig::addFilterResult(int result, int readNum) { + mFilterResult->addFilterResult(result, readNum); +} + +void ThreadConfig::addMergedPairs(int pairs) { + mFilterResult->addMergedPairs(pairs); } void ThreadConfig::initWriterForSplit() { diff -Nru fastp-0.19.6+dfsg/src/threadconfig.h fastp-0.20.0+dfsg/src/threadconfig.h --- fastp-0.19.6+dfsg/src/threadconfig.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/threadconfig.h 2019-04-17 03:23:22.000000000 +0000 @@ -31,7 +31,8 @@ void initWriter(gzFile gzfile); void initWriter(gzFile gzfile1, gzFile gzfile2); - void addFilterResult(int result); + void addFilterResult(int result, int readNum); + void addMergedPairs(int pairs); int getThreadId() {return mThreadId;} // for splitting output diff -Nru fastp-0.19.6+dfsg/src/util.h fastp-0.20.0+dfsg/src/util.h --- fastp-0.19.6+dfsg/src/util.h 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/src/util.h 2019-04-17 
03:23:22.000000000 +0000 @@ -213,15 +213,22 @@ // Remove invalid sequence characters from a string -inline string str_keep_valid_sequence(const string& s) +inline void str_keep_valid_sequence( string& s, bool forceUpperCase = false) { - string new_str; + size_t total = 0; + const char case_gap = 'a' - 'A'; for( size_t it =0; it < s.size(); it++) { - if( isalpha(s[it]) || s[it] == '-' || s[it] == '*' ) { - new_str += s[it]; + char c = s[it]; + if(forceUpperCase && c>='a' && c<='z') { + c -= case_gap; + } + if( isalpha(c) || c == '-' || c == '*' ) { + s[total] = c; + total ++; } } - return new_str; + + s.resize(total); } inline int find_with_right_pos(const string& str, const string& pattern, int start=0) { diff -Nru fastp-0.19.6+dfsg/testdata/R1.fq fastp-0.20.0+dfsg/testdata/R1.fq --- fastp-0.19.6+dfsg/testdata/R1.fq 2019-01-02 02:56:02.000000000 +0000 +++ fastp-0.20.0+dfsg/testdata/R1.fq 2019-04-17 03:23:22.000000000 +0000 @@ -1,15 +1,15 @@ @AS500713:64:HFKJJBGXY:1:11101:1675:1101 1:A:0:TATAGCCT+GACCCCCA -TAGGAGGCTTGGAGTACCAATAATAAAGTGAGCCCACCTTCCTGGTACCCAGACATTTCAGGAGGTCGGGAAATTTTTAAACCCAGGCAGCTTCCTGGCAGTGACATTTGGAGCATCAAAGTGGTAAATAAAATTTCATTTACATTAATAT + + -////AAEEEEE/E/EA/E/AEA6EE//AEE66/AAE//ZZZ////E/AA/EEE/A/AEE/EEA//EEEEEEEE6EEAAA/E/A/6E/6//6