diff -Nru bowtie2-2.3.4.2/bowtie2-build bowtie2-2.3.4.3/bowtie2-build --- bowtie2-2.3.4.2/bowtie2-build 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/bowtie2-build 2018-09-17 04:22:55.000000000 +0000 @@ -29,15 +29,15 @@ from collections import deque def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(add_help = False) + + parser.add_argument('--large-index', action='store_true') + parser.add_argument('--verbose', action='store_true') group = parser.add_mutually_exclusive_group() group.add_argument('--debug', action='store_true') group.add_argument('--sanitized', action='store_true') - parser.add_argument('--verbose', action='store_true') - parser.add_argument('--large-index', action='store_true') - logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s' ) @@ -51,8 +51,12 @@ build_bin_spec = os.path.join(ex_path,build_bin_s) script_options, argv = parser.parse_known_args() + print_help = False argv = deque(argv) + if '-h' in argv or '--help' in argv: + print_help = True + if script_options.verbose: logging.getLogger().setLevel(logging.INFO) diff -Nru bowtie2-2.3.4.2/bowtie2-inspect bowtie2-2.3.4.3/bowtie2-inspect --- bowtie2-2.3.4.2/bowtie2-inspect 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/bowtie2-inspect 2018-09-17 04:22:55.000000000 +0000 @@ -22,6 +22,7 @@ import os import imp +import sys import inspect import logging import argparse @@ -29,7 +30,7 @@ from collections import deque def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(add_help = False) group = parser.add_mutually_exclusive_group() group.add_argument('--debug', action='store_true') diff -Nru bowtie2-2.3.4.2/bt2_build.cpp bowtie2-2.3.4.3/bt2_build.cpp --- bowtie2-2.3.4.2/bt2_build.cpp 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/bt2_build.cpp 2018-09-17 04:22:55.000000000 +0000 @@ -142,7 +142,10 @@ << " )" << endl; if(wrapper == "basic-0") { out << " --large-index force generated index to be 'large', even if ref" << endl - << " has fewer than 4 billion nucleotides" << endl; + << " has fewer than 4 billion nucleotides" << endl + << " --debug use the debug binary; slower, assertions enabled" << endl + << " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl + << " --verbose log the issued command" << endl; } out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl << " -p/--packed use packed strings internally; slower, less memory" << endl diff -Nru bowtie2-2.3.4.2/bt2_inspect.cpp bowtie2-2.3.4.3/bt2_inspect.cpp --- bowtie2-2.3.4.2/bt2_inspect.cpp 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/bt2_inspect.cpp 2018-09-17 04:22:55.000000000 +0000 @@ -76,14 +76,16 @@ << "Options:" << endl; if(wrapper == "basic-0") { out << " --large-index force inspection of the 'large' index, even if a" << endl - << " 'small' one is present." << endl; + << " 'small' one is present." << endl + << " --debug use the debug binary; slower, assertions enabled" << endl + << " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl + << " --verbose log the issued command" << endl; } out << " -a/--across Number of characters across in FASTA output (default: 60)" << endl << " -n/--names Print reference sequence names only" << endl << " -s/--summary Print summary incl. ref names, lengths, index properties" << endl << " -v/--verbose Verbose output (for debugging)" << endl << " -h/--help print detailed description of tool and its options" << endl - << " --help print this usage message" << endl ; if(wrapper.empty()) { cerr << endl diff -Nru bowtie2-2.3.4.2/bt2_search.cpp bowtie2-2.3.4.3/bt2_search.cpp --- bowtie2-2.3.4.2/bt2_search.cpp 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/bt2_search.cpp 2018-09-17 04:22:55.000000000 +0000 @@ -84,6 +84,7 @@ int gQuiet; // print nothing but the alignments static int sanityCheck; // enable expensive sanity checks static int format; // default read format is FASTQ +static bool interleaved; // reads are interleaved static string origString; // reference text, or filename(s) static int seed; // srandom() seed static int timing; // whether to report basic timing data @@ -278,6 +279,7 @@ gQuiet = false; sanityCheck = 0; // enable expensive sanity checks format = FASTQ; // default read format is FASTQ + interleaved = false; // reads are not interleaved by default origString = ""; // reference text, or filename(s) seed = 0; // srandom() seed timing = 0; // whether to report basic timing data @@ -999,7 +1001,12 @@ case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break; case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break; - case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break; + case ARG_INTERLEAVED_FASTQ: { + tokenize(arg, ",", mates12); + format = FASTQ; + interleaved = true; + break; + } case 'b': { format = BAM; saw_bam = true; @@ -1735,7 +1742,7 @@ int tid) { PatternSourcePerThreadFactory *patsrcFact; - patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp); + patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp, tid); assert(patsrcFact != NULL); return patsrcFact; } @@ -4782,6 +4789,7 @@ } PatternParams pp( format, // file format + interleaved, // some or all of the reads are interleaved fileParallel, // true -> wrap files with separate PairedPatternSources seed, // pseudo-random seed readsPerBatch, // # reads in a light parsing batch diff -Nru bowtie2-2.3.4.2/CMakeLists.txt bowtie2-2.3.4.3/CMakeLists.txt --- bowtie2-2.3.4.2/CMakeLists.txt 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/CMakeLists.txt 2018-09-17 04:22:55.000000000 +0000 @@ -3,7 +3,7 @@ cmake_policy(SET CMP0048 NEW) cmake_policy(SET CMP0005 NEW) -project(bowtie2 LANGUAGES CXX VERSION "2.3.4.2") +project(bowtie2 LANGUAGES CXX VERSION "2.3.4.3") enable_testing() diff -Nru bowtie2-2.3.4.2/debian/changelog bowtie2-2.3.4.3/debian/changelog --- bowtie2-2.3.4.2/debian/changelog 2018-08-14 11:31:56.000000000 +0000 +++ bowtie2-2.3.4.3/debian/changelog 2018-09-25 07:58:26.000000000 +0000 @@ -1,3 +1,19 @@ +bowtie2 (2.3.4.3-1) unstable; urgency=medium + + [ Steffen Möller ] + * Updated OMICtools ref in d/u/metadata + + [ Alexandre Mestiashvili ] + * New upstream version 2.3.4.3: + - Fixed an issue causing `bowtie2-build` and `bowtie2-inspect` + to output incomplete help text. + - Fixed an issue causing `bowtie2-inspect` to crash. + - Fixed an issue preventing `bowtie2` from processing paired and/or + unpaired FASTQ reads together with interleaved FASTQ reads. + * Bump Policy to 4.2.1 + + -- Alexandre Mestiashvili Tue, 25 Sep 2018 07:58:26 +0000 + bowtie2 (2.3.4.2-1) unstable; urgency=medium * New upstream version 2.3.4.2 diff -Nru bowtie2-2.3.4.2/debian/control bowtie2-2.3.4.3/debian/control --- bowtie2-2.3.4.2/debian/control 2018-08-14 11:31:56.000000000 +0000 +++ bowtie2-2.3.4.3/debian/control 2018-09-25 07:58:26.000000000 +0000 @@ -12,7 +12,7 @@ libtest-deep-perl, libclone-perl, zlib1g-dev -Standards-Version: 4.2.0 +Standards-Version: 4.2.1 Vcs-Browser: https://salsa.debian.org/med-team/bowtie2 Vcs-Git: https://salsa.debian.org/med-team/bowtie2.git Homepage: http://bowtie-bio.sourceforge.net/bowtie2 diff -Nru bowtie2-2.3.4.2/debian/upstream/metadata bowtie2-2.3.4.3/debian/upstream/metadata --- bowtie2-2.3.4.2/debian/upstream/metadata 2018-08-14 11:31:56.000000000 +0000 +++ bowtie2-2.3.4.3/debian/upstream/metadata 2018-09-25 07:58:26.000000000 +0000 @@ -17,6 +17,6 @@ - Name: SciCrunch Entry: SCR_005476 - Name: OMICtools - Entry: OMICS_00653 + Entry: OMICS_31633 - Name: bio.tools Entry: bowtie2 diff -Nru bowtie2-2.3.4.2/doc/website/manual.ssi bowtie2-2.3.4.3/doc/website/manual.ssi --- bowtie2-2.3.4.2/doc/website/manual.ssi 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/doc/website/manual.ssi 2018-09-17 04:22:55.000000000 +0000 @@ -152,7 +152,7 @@
yum check-update
-

yum search tbb

+
yum search tbb
yum install tbb-devel.x86_64
diff -Nru bowtie2-2.3.4.2/doc/website/recent_news.ssi bowtie2-2.3.4.3/doc/website/recent_news.ssi --- bowtie2-2.3.4.2/doc/website/recent_news.ssi 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/doc/website/recent_news.ssi 2018-09-17 04:22:55.000000000 +0000 @@ -1,17 +1,22 @@ -

Version 2.3.4.2 - August 07, 1987

+

Version 2.3.4.3 - September 17, 2018

    -
  • Fixed issue causing bowtie2 to fail in --fast-local mode.
  • -
  • Fixed issue causing --trim-to N causes bowtie2 to trim reads longer than N bases to exactly N bases. Can trim from either 3' or 5' end, e.g. --trim-to 5:30 trims reads to 30 bases, truncating at the 5' end.
  • -
  • Updated "Building from source" manual section with additional instructions on installing TBB.
  • +
  • Fixed an issue causing bowtie2-build and bowtie2-inspect to output incomplete help text.
  • +
  • Fixed an issue causing bowtie2-align to crash.
  • +
  • Fixed an issue preventing bowtie2 from processing paired and/or unpaired FASTQ reads together with interleaved FASTQ reads.
  • +
+ +

Version 2.3.4.2 - August 07, 2018

+
    +
  • Fixed issue causing bowtie2 to fail in --fast-local mode.
  • +
  • Fixed issue causing --soft-clipped-unmapped-tlen to be a positional argument.
  • +
  • New option --trim-to N causes bowtie2 to trim reads longer than N bases to exactly N bases. Can trim from either 3' or 5' end, e.g. --trim-to 5:30 trims reads to 30 bases, truncating at the 5' end.
  • +
  • Updated "Building from source" manual section with additional instructions on installing TBB.
  • Several other updates to manual, including new mentions of Bioconda and Biocontainers.
  • Fixed an issue preventing bowtie2 from processing more than one pattern source when running single threaded.
  • Fixed an issue causing bowtie2 and bowtie2-inspect to crash if the index contains a gap-only segment.
  • Added experimental BAM input mode -b. Works only with unpaired input reads and BAM files that are sorted by read name (samtools sort -n). BAM input mode also supports the following options:
  • -
      -
    • --preserve-sam-tags: Preserve any optional fields present in BAM record
    • -
    • --align-paired-reads: Paired-end mode for BAM files
    • -
    +     --preserve-sam-tags: Preserve any optional fields present in BAM record
  • +     --align-paired-reads: Paired-end mode for BAM files
  • Add experimental CMake support
diff -Nru bowtie2-2.3.4.2/doc/website/rhsidebar.ssi bowtie2-2.3.4.3/doc/website/rhsidebar.ssi --- bowtie2-2.3.4.2/doc/website/rhsidebar.ssi 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/doc/website/rhsidebar.ssi 2018-09-17 04:22:55.000000000 +0000 @@ -18,10 +18,10 @@ - Bowtie2 2.3.4.2 + Bowtie2 2.3.4.3 - 08/07/18  + 09/17/18  @@ -164,6 +164,7 @@

Related publications

diff -Nru bowtie2-2.3.4.2/example/reads/conversion_utilities.sh bowtie2-2.3.4.3/example/reads/conversion_utilities.sh --- bowtie2-2.3.4.2/example/reads/conversion_utilities.sh 1970-01-01 00:00:00.000000000 +0000 +++ bowtie2-2.3.4.3/example/reads/conversion_utilities.sh 2018-09-17 04:22:55.000000000 +0000 @@ -0,0 +1,7 @@ +# aliases for converting sample read files +# `source conversion_utilities.sh` to add them to your environment + +alias fastq_to_fasta="sed 'N;x;N;N;x;s/@/>/" +alias paired_to_tab5="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed -n 'n;h;n;g;N;s/\n/ /g;p' reads_2.fq) > reads_12.tab5" +alias paired_to_tab6="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed 'N;x;N;g;N;s/\n/ /g' reads_2.fq) > reads_12.tab6" +alias paired_to_interleaved="paste -d'\n' <(sed 'N;N;N;s/\n/ /g' reads_1.fq) <(sed 'N;N;N;s/\n/ /g' reads_2.fq) | tr '\t' '\n' > reads_12.fq" diff -Nru bowtie2-2.3.4.2/formats.h bowtie2-2.3.4.3/formats.h --- bowtie2-2.3.4.2/formats.h 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/formats.h 2018-09-17 04:22:55.000000000 +0000 @@ -30,7 +30,6 @@ FASTA = 1, FASTA_CONT, FASTQ, - INTERLEAVED, BAM, TAB_MATE5, TAB_MATE6, @@ -44,7 +43,6 @@ "FASTA", "FASTA sampling", "FASTQ", - "Interleaved FASTQ", "BAM", "Tabbed mated", "Raw", diff -Nru bowtie2-2.3.4.2/NEWS bowtie2-2.3.4.3/NEWS --- bowtie2-2.3.4.2/NEWS 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/NEWS 2018-09-17 04:22:55.000000000 +0000 @@ -19,6 +19,14 @@ Version Release History ======================= +Version 2.3.4.3 - Sep 17, 2018 + + * Fixed an issue causing `bowtie2-build` and `bowtie2-inspect` + to output incomplete help text. + * Fixed an issue causing `bowtie2-inspect` to crash. + * Fixed an issue preventing `bowtie2` from processing paired and/or + unpaired FASTQ reads together with interleaved FASTQ reads. + Version 2.3.4.2 - Aug 7, 2018 * Fixed issue causing `bowtie2` to fail in `--fast-local` mode. diff -Nru bowtie2-2.3.4.2/pat.cpp bowtie2-2.3.4.3/pat.cpp --- bowtie2-2.3.4.2/pat.cpp 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/pat.cpp 2018-09-17 04:22:55.000000000 +0000 @@ -100,9 +100,8 @@ case FASTA: return new FastaPatternSource(qs, p); case FASTA_CONT: return new FastaContinuousPatternSource(qs, p); case RAW: return new RawPatternSource(qs, p); - case FASTQ: return new FastqPatternSource(qs, p); + case FASTQ: return new FastqPatternSource(qs, p, p.interleaved); case BAM: return new BAMPatternSource(qs, p); - case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */); case TAB_MATE5: return new TabbedPatternSource(qs, p, false); case TAB_MATE6: return new TabbedPatternSource(qs, p, true); case CMDLINE: return new VectorPatternSource(qs, p); @@ -232,14 +231,13 @@ pt, true, // batch A (or pairs) true); // grab lock below - bool done = res.first; - if(!done && res.second == 0) { + if(res.second == 0 && cur < srca_->size() - 1) { ThreadSafe ts(mutex_m); if(cur + 1 > cur_) cur_++; cur = cur_; // Move on to next PatternSource continue; // on to next pair of PatternSources } - return make_pair(done, res.second); + return make_pair(res.first && cur == srca_->size() - 1, res.second); } else { pair resa, resb; // Lock to ensure that this thread gets parallel reads @@ -295,12 +293,11 @@ const EList& q, // qualities associated with singles const EList& q1, // qualities associated with m1 const EList& q2, // qualities associated with m2 - const PatternParams& p, // read-in parameters + PatternParams& p, // read-in parameters bool verbose) // be talkative? { EList* a = new EList(); EList* b = new EList(); - EList* ab = new EList(); // Create list of pattern sources for paired reads appearing // interleaved in a single file for(size_t i = 0; i < m12.size(); i++) { @@ -312,7 +309,9 @@ tmp.push_back(m12[i]); assert_eq(1, tmp.size()); } - ab->push_back(PatternSource::patsrcFromStrings(p, *qs)); + a->push_back(PatternSource::patsrcFromStrings(p, *qs)); + b->push_back(NULL); + p.interleaved = false; if(!p.fileParallel) { break; } @@ -376,16 +375,7 @@ } PatternComposer *patsrc = NULL; - if(m12.size() > 0) { - patsrc = new SoloPatternComposer(ab, p); - for(size_t i = 0; i < a->size(); i++) delete (*a)[i]; - for(size_t i = 0; i < b->size(); i++) delete (*b)[i]; - delete a; delete b; - } else { - patsrc = new DualPatternComposer(a, b, p); - for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i]; - delete ab; - } + patsrc = new DualPatternComposer(a, b, p); return patsrc; } @@ -471,9 +461,15 @@ } else { const char* filename = infiles_[filecur_].c_str(); - compressed_ = is_gzipped_file(filename); - if (compressed_) { + const char* ext = filename + infiles_[filecur_].length(); + + // Only temporary until we have a separate BGZFPatternSource + while (ext != filename && *--ext != '.') ; + bool bam_file = ext != filename && strncmp(ext, ".bam", 4) == 0; + + if (!bam_file && is_gzipped_file(filename)) { zfp_ = gzopen(filename, "rb"); + compressed_ = true; } else { fp_ = fopen(filename, "rb"); } @@ -1146,6 +1142,12 @@ 32, //read_name }; +const uint8_t BAMPatternSource::EOF_MARKER[] = { + 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, + 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + bool BAMPatternSource::parse_bam_header() { char magic[4]; @@ -1203,60 +1205,283 @@ return true; } -std::pair BAMPatternSource::nextBatchFromFile(PerThreadReadBuf& pt, - bool batch_a, unsigned readi) { - if (first_) { - first_ = false; - if (!parse_bam_header()) { - std::cerr << "Unable to parse BAM file" << std::endl; - return make_pair(true, 0); +uint16_t BAMPatternSource::nextBGZFBlockFromFile(BGZF& b) { + fread(&b.hdr, sizeof(b.hdr), 1, fp_); + if (feof_unlocked(fp_) || ferror_unlocked(fp_)) { + return 0; + } + + uint8_t *extra = new uint8_t[b.hdr.xlen]; + fread(extra, b.hdr.xlen, 1, fp_); + if (ferror_unlocked(fp_)) { + return 0; + } + + if (memcmp(EOF_MARKER, &b.hdr, sizeof(b.hdr)) == 0 + && memcmp(EOF_MARKER + sizeof(b.hdr), extra, 6 /* sizeof BAM subfield */) == 0) + { + delete[] extra; + fclose(fp_); + return 0; + } + + uint16_t bsize = 0; + for (uint16_t i = 0; i < b.hdr.xlen;) { + if (extra[0] == 66 && extra[1] == 67) { + bsize = *((uint16_t *)(extra + 4)); + bsize -= (b.hdr.xlen + 19); + break; } + i = i + 2; + uint16_t sub_field_len = *((uint16_t *)(extra + 2)); + i = i + 2 + sub_field_len; + } + + delete[] extra; + + if (bsize == 0) { + return 0; } + fread(b.cdata, bsize, 1, fp_); + if (ferror_unlocked(fp_)) { + return 0; + } + + fread(&b.ftr, sizeof b.ftr, 1, fp_); + if (ferror_unlocked(fp_)) { + return 0; + } + + return bsize; +} + +std::pair BAMPatternSource::nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock) { + uint16_t cdata_len; + + unsigned nread = 0; + bool done = false; + + pt.setReadId(readCnt_); + + do { + if (bam_batch_indexes_[pt.tid_] >= bam_batches_[pt.tid_].size()) { + BGZF& block = blocks_[pt.tid_]; + std::vector& batch = bam_batches_[pt.tid_]; + if (lock) { + ThreadSafe ts(mutex); + if (first_) { + nextBGZFBlockFromFile(block); + first_ = false; + } + cdata_len = nextBGZFBlockFromFile(block); + } else { + cdata_len = nextBGZFBlockFromFile(block); + } + + if (cdata_len == 0) { + ThreadSafe ts(orphan_mates_mutex_); + get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, nread); + return make_pair(nread == 0, nread); + } + + bam_batch_indexes_[pt.tid_] = 0; + + batch.resize(block.ftr.isize); + + int ret_code = decompress_bgzf_block(&batch[0], block.ftr.isize, block.cdata, cdata_len); + + if (ret_code != Z_OK) { + return make_pair(true, 0); + } + + uLong crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, &batch[0], batch.size()); + assert(crc == block.ftr.crc32); + } + + std::pair ret = get_alignments(pt, batch_a, nread); + + done = ret.first; + } while (!done && nread < pt.max_buf_); + + readCnt_ += 1; + + return make_pair(done, nread); +} + +std::pair BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi) { + size_t& i = bam_batch_indexes_[pt.tid_]; bool done = false; bool read1 = true; while (readi < pt.max_buf_) { - int r; + if (i >= bam_batches_[pt.tid_].size()) { + return make_pair(false, readi); + } + uint16_t flag; int32_t block_size; EList& readbuf = pp_.align_paired_reads && !read1 ? pt.bufb_ : pt.bufa_; - if ((r = zread(&block_size, sizeof(block_size))) != sizeof(block_size)) { - return make_pair(true, readi); + memcpy(&block_size, &bam_batches_[pt.tid_][0] + i, sizeof(block_size)); + if (block_size <= 0) { + return make_pair(done, readi); } - if (readbuf[readi].readOrigBuf.length() < block_size) { - readbuf[readi].readOrigBuf.resize(block_size); - } - if (zread(readbuf[readi].readOrigBuf.wbuf(), block_size) != block_size) { - done = true; - break; - } - memcpy(&flag, readbuf[readi].readOrigBuf.buf() + offset[BAMField::flag], sizeof(flag)); + i += sizeof(block_size); + + memcpy(&flag, &bam_batches_[pt.tid_][0] + i + offset[BAMField::flag], sizeof(flag)); if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) { readbuf[readi].readOrigBuf.clear(); + i += block_size; continue; } + if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) { readbuf[readi].readOrigBuf.clear(); + i += block_size; continue; } - if (pp_.align_paired_reads && read1 && (flag & 0x40) == 0) { - std::cerr << "Paired reads are out of order" << std::endl; - return make_pair(true, readi == 0 ? readi : readi-1); - } - if (pp_.align_paired_reads && !read1 && (flag & 0x80) == 0) { - std::cerr << "Paired reads are out of order" << std::endl; - return make_pair(true, readi == 0 ? readi : readi-1); - } - read1 = !read1; - readi = (pp_.align_paired_reads - && pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1; + if (pp_.align_paired_reads && (((flag & 0x40) != 0 + && i + block_size == bam_batches_[pt.tid_].size()) + || ((flag & 0x80) != 0 && i == sizeof(block_size)))) + { + ThreadSafe ts(orphan_mates_mutex_); + store_orphan_mate(&bam_batches_[pt.tid_][0] + i, block_size); + i += block_size; + get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, readi); + } else { + readbuf[readi].readOrigBuf.resize(block_size); + + memcpy(readbuf[readi].readOrigBuf.wbuf(), &bam_batches_[pt.tid_][0] + i, block_size); + i += block_size; + + read1 = !read1; + readi = (pp_.align_paired_reads + && pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1; + } } + return make_pair(done, readi); } +void BAMPatternSource::store_orphan_mate(const uint8_t* r, size_t read_len) { + uint8_t flag; + memcpy(&flag, r + offset[BAMField::flag], sizeof(flag)); + + std::vector& + orphan_mates = (flag & 0x40) != 0 ? orphan_mate1s : orphan_mate2s; + + size_t i; + for (i = 0; i < orphan_mates.size() && !orphan_mates[i].empty(); ++i) ; + + if (i == orphan_mates.size()) { + orphan_mates.resize(orphan_mates.size() * 2); + } + + orphan_mate_t& mate = orphan_mates[i]; + + if (mate.data == NULL || mate.cap < read_len) { + mate.data = new uint8_t[read_len]; + mate.cap = read_len; + } + + if (mate.cap < read_len) { + mate.data = new uint8_t[read_len]; + mate.cap = read_len; + } + + mate.size = read_len; + + memcpy(mate.data, r, read_len); +} + +int BAMPatternSource::compare_read_names(const void* m1, const void* m2) { + const orphan_mate_t* mate1 = (const orphan_mate_t*)m1; + const orphan_mate_t* mate2 = (const orphan_mate_t*)m2; + + const char* r1 = (const char *)(mate1->data + offset[BAMField::read_name]); + const char* r2 = (const char *)(mate2->data + offset[BAMField::read_name]); + + return strcmp(r1, r2); +} + +bool BAMPatternSource::compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2) { + if (m1.empty()) { + return false; + } + + if (m2.empty()) { + return true; + } + + const char* r1 = (const char *)(m1.data + offset[BAMField::read_name]); + const char* r2 = (const char *)(m2.data + offset[BAMField::read_name]); + + return strcmp(r1, r2); +} + +void BAMPatternSource::get_orphaned_pairs(EList& buf_a, EList& buf_b, const size_t max_buf, unsigned& readi) { + std::sort(orphan_mate1s.begin(), orphan_mate1s.end(), &BAMPatternSource::compare_read_names2); + std::sort(orphan_mate2s.begin(), orphan_mate2s.end(), &BAMPatternSource::compare_read_names2); + + size_t lim1, lim2; + for (lim1 = 0; lim1 < orphan_mate1s.size() && !orphan_mate1s[lim1].empty(); lim1++) ; + for (lim2 = 0; lim2 < orphan_mate2s.size() && !orphan_mate2s[lim2].empty(); lim2++) ; + + if (lim2 == 0) { + return; + } + + for (size_t i = 0; i < lim1 && readi < max_buf; ++i) { + orphan_mate_t* mate1 = &orphan_mate1s[i]; + orphan_mate_t* mate2 = (orphan_mate_t*)bsearch(mate1, &orphan_mate2s[0], lim2, + sizeof(orphan_mate2s[0]), compare_read_names); + + if (mate2 != NULL) { + Read& ra = buf_a[readi]; + Read& rb = buf_b[readi]; + + ra.readOrigBuf.resize(mate1->size); + rb.readOrigBuf.resize(mate2->size); + + memcpy(ra.readOrigBuf.wbuf(), mate1->data, mate1->size); + memcpy(rb.readOrigBuf.wbuf(), mate2->data, mate2->size); + + mate1->reset(); + mate2->reset(); + + readi++; + } + } +} + +int BAMPatternSource::decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len) { + z_stream strm; + + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + strm.avail_in = src_len; + strm.next_in = src; + strm.avail_out = dst_len; + strm.next_out = dst; + + int ret = inflateInit2(&strm, -8); + if (ret != Z_OK) { + return ret; + } + + ret = inflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END) { + return ret; + } + + return inflateEnd(&strm); +} + bool BAMPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const { uint8_t l_read_name; int32_t l_seq; diff -Nru bowtie2-2.3.4.2/pat.h bowtie2-2.3.4.3/pat.h --- bowtie2-2.3.4.2/pat.h 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/pat.h 2018-09-17 04:22:55.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include +#include #include "alphabet.h" #include "assert_helpers.h" #include "random_source.h" @@ -54,6 +55,7 @@ PatternParams( int format_, + bool interleaved_, bool fileParallel_, uint32_t seed_, size_t max_buf_, @@ -71,6 +73,7 @@ bool preserve_sam_tags_, bool align_paired_reads_) : format(format_), + interleaved(interleaved_), fileParallel(fileParallel_), seed(seed_), max_buf(max_buf_), @@ -89,6 +92,7 @@ align_paired_reads(align_paired_reads_) { } int format; // file format + bool interleaved; // some or all of the FASTQ reads are interleaved bool fileParallel; // true -> wrap files with separate PatternComposers uint32_t seed; // pseudo-random seed size_t max_buf; // number of reads to buffer in one read @@ -112,11 +116,12 @@ */ struct PerThreadReadBuf { - PerThreadReadBuf(size_t max_buf) : + PerThreadReadBuf(size_t max_buf, int tid) : max_buf_(max_buf), bufa_(max_buf), bufb_(max_buf), - rdid_() + rdid_(), + tid_(tid) { bufa_.resize(max_buf); bufb_.resize(max_buf); @@ -185,6 +190,7 @@ EList bufb_; // Read buffer for mate bs size_t cur_buf_; // Read buffer currently active TReadId rdid_; // index of read at offset 0 of bufa_/bufb_ + int tid_; }; extern void wrongQualityFormat(const BTString& read_name); @@ -682,7 +688,7 @@ FastqPatternSource( const EList& infiles, - const PatternParams& p, bool interleaved = false) : + const PatternParams& p, bool interleaved) : CFilePatternSource(infiles, p), first_(true), interleaved_(interleaved) { } @@ -719,6 +725,63 @@ }; class BAMPatternSource : public CFilePatternSource { + struct hdr_t { + uint8_t id1; + uint8_t id2; + uint8_t cm; + uint8_t flg; + uint32_t mtime; + uint8_t xfl; + uint8_t os; + uint16_t xlen; + }; + + struct ftr_t { + uint32_t crc32; + uint32_t isize; + }; + + struct BGZF { + hdr_t hdr; + uint8_t cdata[1 << 16]; + ftr_t ftr; + }; + + struct orphan_mate_t { + orphan_mate_t() : + data(NULL), + size(0), + cap(0) {} + + void reset() { + size = 0; + } + + bool empty() const { + return size == 0; + } + + uint8_t* data; + uint16_t size; + uint16_t cap; + }; + + struct BAMField { + enum aln_rec_field_name { + refID, + pos, + l_read_name, + mapq, + bin, + n_cigar_op, + flag, + l_seq, + next_refID, + next_pos, + tlen, + read_name, + }; + }; public: @@ -726,7 +789,19 @@ const EList& infiles, const PatternParams& p) : CFilePatternSource(infiles, p), - first_(true) {} + first_(true), + blocks_(p.nthreads), + bam_batches_(p.nthreads), + bam_batch_indexes_(p.nthreads), + orphan_mate1s(p.nthreads * 2), + orphan_mate2s(p.nthreads * 2), + orphan_mates_mutex_(), + pp_(p) { + // uncompressed size of BGZF block is limited to 2**16 bytes + for (size_t i = 0; i < bam_batches_.size(); ++i) { + bam_batches_[i].reserve(1 << 16); + } + } virtual void reset() { first_ = true; @@ -738,14 +813,27 @@ */ virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; + ~BAMPatternSource() { + // only temporary until c++11 + for (size_t i = 0; i < orphan_mate1s.size(); i++) { + if (orphan_mate1s[i].data != NULL) { + delete[] orphan_mate1s[i].data; + } + } + + for (size_t i = 0; i < orphan_mate2s.size(); i++) { + if (orphan_mate2s[i].data != NULL) { + delete[] orphan_mate2s[i].data; + } + } + } + + protected: - /** - * Light-parse a batch into the given buffer. - */ - virtual std::pair nextBatchFromFile(PerThreadReadBuf& pt, bool batch_a, - unsigned readi); + virtual std::pair nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock = true); + uint16_t nextBGZFBlockFromFile(BGZF& block); /** * Reset state to be ready for the next file. @@ -760,24 +848,36 @@ bool parse_bam_header(); - struct BAMField { - enum aln_rec_field_name { - refID, - pos, - l_read_name, - mapq, - bin, - n_cigar_op, - flag, - l_seq, - next_refID, - next_pos, - tlen, - read_name, - }; - }; + virtual std::pair nextBatchFromFile(PerThreadReadBuf&, bool, unsigned) { + return make_pair(true, 0); + } + + int decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len); + + std::pair get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi); + + void store_orphan_mate(const uint8_t* read, size_t read_len); + + void get_orphaned_pairs(EList& buf_a, EList& buf_b, const size_t max_buf, unsigned& readi); + + size_t get_matching_read(const uint8_t* rec); + + static int compare_read_names(const void* m1, const void* m2); + + static bool compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2); static const int offset[]; + static const uint8_t EOF_MARKER[]; + + std::vector blocks_; + std::vector > bam_batches_; + std::vector bam_batch_indexes_; + + std::vector orphan_mate1s; + std::vector orphan_mate2s; + MUTEX_T orphan_mates_mutex_; + + PatternParams pp_; }; /** @@ -861,7 +961,7 @@ const EList& q, // qualities associated with singles const EList& q1, // qualities associated with m1 const EList& q2, // qualities associated with m2 - const PatternParams& p, // read-in params + PatternParams& p, // read-in params bool verbose); // be talkative? protected: @@ -1017,9 +1117,9 @@ PatternSourcePerThread( PatternComposer& composer, - const PatternParams& pp) : + const PatternParams& pp, int tid) : composer_(composer), - buf_(pp.max_buf), + buf_(pp.max_buf, tid), pp_(pp), last_batch_(false), last_batch_size_(0) { } @@ -1107,15 +1207,16 @@ public: PatternSourcePerThreadFactory( PatternComposer& composer, - const PatternParams& pp) : + const PatternParams& pp, int tid) : composer_(composer), - pp_(pp) { } + pp_(pp), + tid_(tid) { } /** * Create a new heap-allocated PatternSourcePerThreads. */ virtual PatternSourcePerThread* create() const { - return new PatternSourcePerThread(composer_, pp_); + return new PatternSourcePerThread(composer_, pp_, tid_); } /** @@ -1125,7 +1226,7 @@ virtual EList* create(uint32_t n) const { EList* v = new EList; for(size_t i = 0; i < n; i++) { - v->push_back(new PatternSourcePerThread(composer_, pp_)); + v->push_back(new PatternSourcePerThread(composer_, pp_, tid_)); assert(v->back() != NULL); } return v; @@ -1135,6 +1236,7 @@ /// Container for obtaining paired reads from PatternSources PatternComposer& composer_; const PatternParams& pp_; + int tid_; }; #endif /*PAT_H_*/ diff -Nru bowtie2-2.3.4.2/VERSION bowtie2-2.3.4.3/VERSION --- bowtie2-2.3.4.2/VERSION 2018-08-07 19:24:25.000000000 +0000 +++ bowtie2-2.3.4.3/VERSION 2018-09-17 04:22:55.000000000 +0000 @@ -1 +1 @@ -2.3.4.2 +2.3.4.3