diff -Nru blasr-0~20151014+git8e668be/Blasr.cpp blasr-5.3/Blasr.cpp --- blasr-0~20151014+git8e668be/Blasr.cpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/Blasr.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -1,44 +1,9 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - // Author: Mark Chaisson -#include "BlasrMiscs.hpp" -#include "BlasrUtils.hpp" -#include "BlasrAlign.hpp" -#include "RegisterBlasrOptions.h" +#include "iblasr/BlasrMiscs.hpp" +#include "iblasr/BlasrUtils.hpp" +#include "iblasr/BlasrAlign.hpp" +#include "iblasr/RegisterBlasrOptions.h" //#define USE_GOOGLE_PROFILER #ifdef USE_GOOGLE_PROFILER @@ -51,23 +16,33 @@ MappingSemaphores semaphores; ostream *outFilePtr = NULL; #ifdef USE_PBBAM -PacBio::BAM::BamWriter * bamWriterPtr = NULL; +PacBio::BAM::IRecordWriter * bamWriterPtr = NULL; // use IRecordWriter for both SAM and BAM #endif HDFRegionTableReader *regionTableReader = NULL; ReaderAgglomerate *reader = NULL; +// Add comment to version history for each version change ! 
+// +// Version history +// +// 5.0 - a new major version number +// 5.1 - transition to POSIX notation - double dashes before multi-character flags +// 5.2 - --sam no longer supported +// 5.3 - --sam supported via pbbam/IRecordWriter +// const string GetMajorVersion() { - return "2.0.0"; + return "5.3"; } +// version format is 3 numbers separated by dots : Version.Subversion.SHA1 const string GetVersion(void) { - string perforceVersionString("$Change$"); + string gitVersionString(SHA1_7); // gitVersionString is first 7 characters of SHA1 string version = GetMajorVersion(); - if (perforceVersionString.size() > 12) { - version.insert(version.size(), "."); - version.insert(version.size(), perforceVersionString, 9, perforceVersionString.size() - 11); - } + // if (gitVersionString.size() == 7) { + version.append("."); + version.append(gitVersionString); + // } return version; } @@ -99,7 +74,7 @@ // if (smrtRead.highQualityRegionScore < params.minRawSubreadScore or (params.maxReadLength != 0 and smrtRead.length > UInt(params.maxReadLength)) or - (smrtRead.length < params.minReadLength)) { + (int(smrtRead.length) < params.minReadLength)) { return false; } @@ -196,16 +171,15 @@ void MakePrimaryIntervals(vector & subreads, vector & subreadIntervals, vector & subreadDirections, - int & bestSubreadIndex, - MappingParameters & params) + int & bestSubreadIndex) { MakeSubreadIntervals(subreads, subreadIntervals); CreateDirections(subreadDirections, subreadIntervals.size()); - bestSubreadIndex = GetIndexOfMedian(subreadIntervals); + bestSubreadIndex = GetIndexOfConcordantTemplate(subreadIntervals); } -/// Scan the next read from input. This may either be a CCS read, +/// Scan the next read from input. This may either be a CCS read, unrolled (Polymerase) read, /// or regular read (though this may be aligned in whole, or by /// subread). 
/// \params[in] reader: FASTA/FASTQ/BAX.H5/CCS.H5/BAM file reader @@ -231,9 +205,9 @@ int & associatedRandInt, bool & stop) { - if (reader->GetFileType() != BAM or not params.concordant) { - if (reader->GetFileType() == HDFCCS || - reader->GetFileType() == HDFCCSONLY) { + if ((reader->GetFileType() != FileType::PBBAM and reader->GetFileType() != FileType::PBDATASET) or not params.concordant) { + if (reader->GetFileType() == FileType::HDFCCS || + reader->GetFileType() == FileType::HDFCCSONLY) { if (GetNextReadThroughSemaphore(*reader, params, ccsRead, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; @@ -299,7 +273,7 @@ } } if (subreads.size() != 0) { - MakeVirtualRead(smrtRead, subreads); + smrtRead.MadeFromSubreadsAsPolymerase(subreads); return true; } else { @@ -312,7 +286,6 @@ MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, SMRTSequence & smrtReadRC, - CCSSequence & ccsRead, vector & subreads, MappingParameters & params, const int & associatedRandInt, @@ -333,19 +306,18 @@ bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); - vector subreadIntervals; vector subreadDirections; int bestSubreadIndex; - if (mapData->reader->GetFileType() != BAM or not params.concordant) { + if ((mapData->reader->GetFileType() != FileType::PBBAM and mapData->reader->GetFileType() != FileType::PBDATASET) or not params.concordant) { MakePrimaryIntervals(mapData->regionTablePtr, smrtRead, subreadIntervals, subreadDirections, bestSubreadIndex, params); } else { MakePrimaryIntervals(subreads, subreadIntervals, subreadDirections, - bestSubreadIndex, params); + bestSubreadIndex); } // Flop all directions if direction of the longest subread is 1. @@ -360,8 +332,14 @@ if (params.concordant) { // Only the longest subread will be aligned in the first round. 
+ // VR , change the comment startIndex = max(startIndex, bestSubreadIndex); endIndex = min(endIndex, bestSubreadIndex + 1); + + if (params.verbosity >= 1) { + cout << "Concordant template subread index: " << bestSubreadIndex << ", " + << smrtRead.HoleNumber() << "/" << subreadIntervals[bestSubreadIndex] << endl; + } } // @@ -370,8 +348,7 @@ allReadAlignments.Resize(subreadIntervals.size()); allReadAlignments.alignMode = Subread; - DNALength intvIndex; - for (intvIndex = startIndex; intvIndex < endIndex; intvIndex++) { + for (int intvIndex = startIndex; intvIndex < endIndex; intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; MakeSubreadOfInterval(subreadSequence, smrtRead, subreadIntervals[intvIndex], params); @@ -473,10 +450,10 @@ // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) - for (int ii = 0; ii < alignmentPtrs.size(); ii++) + for (size_t ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; - for (int jj = 0; jj < selectedAlignmentPtrs.size(); jj++) + for (size_t jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { @@ -489,10 +466,13 @@ subreadSequence.Free(); subreadSequenceRC.Free(); } // End of looping over subread intervals within [startIndex, endIndex). 
+ if (params.verbosity >= 3) allReadAlignments.Print(threadOut); + // If not concordant , all done + if (params.concordant) { allReadAlignments.read = smrtRead; allReadAlignments.alignMode = ZmwSubreads; @@ -507,7 +487,7 @@ seqdb, genome, params.flankSize); } - for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { + for (int intvIndex = 0; intvIndex < int(subreadIntervals.size()); intvIndex++) { if (intvIndex == startIndex) continue; int passDirection = subreadDirections[intvIndex]; int passStartBase = subreadIntervals[intvIndex].start; @@ -529,7 +509,7 @@ allReadAlignments.SetSequence(intvIndex, smrtRead); } - for (int alnIndex = 0; alnIndex < selectedAlignmentPtrs.size(); alnIndex++) { + for (size_t alnIndex = 0; alnIndex < selectedAlignmentPtrs.size(); alnIndex++) { T_AlignmentCandidate * alignment = selectedAlignmentPtrs[alnIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, @@ -553,7 +533,7 @@ } // End of aligning this subread to each selected alignment. subread.Free(); } // End of aligning each subread to where the template subread aligned to. - for(int alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); + for(size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { if (selectedAlignmentPtrs[alignmentIndex]) delete selectedAlignmentPtrs[alignmentIndex]; @@ -562,6 +542,11 @@ } // End of if params.concordant } +// +// invoked for mapping entire ZMW as a single entity +// either for CCS reads : all subreads of a ZMW collapsed/merged into a single read +// or Polymerase reads : all subreads of a ZMW stitched into a single read +// void MapReadsCCS(MappingData *mapData, MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, @@ -619,6 +604,9 @@ // all other are secondary. // + // + // Here unrolled reads are aligned + // if (readIsCCS == false or params.useCcsOnly) { // if -noSplitSubreads or -useccsdenovo. 
// @@ -634,9 +622,12 @@ } allReadAlignments.SetSequence(0, smrtRead); } + // + // Here CCS reads are aligned + // else if (readIsCCS) { // if -useccsall or -useccs // Flank alignment candidates to both ends. - for(int alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); + for(size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { FlankTAlignedSeq(selectedAlignmentPtrs[alignmentIndex], seqdb, genome, params.flankSize); @@ -698,12 +689,11 @@ // The unrolled alignment should be relative to the entire read. allReadAlignments.SetSequence(subreadIndex, ccsRead.unrolledRead); - int alignmentIndex; // // Align this subread to all the positions that the de novo // sequence has aligned to. // - for (alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { + for (size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { T_AlignmentCandidate *alignment = selectedAlignmentPtrs[alignmentIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, @@ -722,10 +712,10 @@ // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) - for (int ii = 0; ii < alignmentPtrs.size(); ii++) + for (size_t ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; - for (int jj = 0; jj < selectedAlignmentPtrs.size(); jj++) + for (size_t jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { @@ -750,14 +740,12 @@ TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; - BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); - bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); int numAligned = 0; @@ -804,6 +792,10 @@ smrtRead.MakeRC(smrtReadRC); + 
// important + // 1. CCS and unrolled mode are mutually exclusive + // 2. Reverse Complement Read is generated for CCS only + // if (readIsCCS) { ccsRead.unrolledRead.MakeRC(unrolledReadRC); } @@ -815,10 +807,18 @@ ReadAlignments allReadAlignments; allReadAlignments.read = smrtRead; + // currently 3 ways of mapping + // regular, CCS , and Polymerase (unrolled) + // + // for regular subreads MapReadsNonCCS + // for mapping ZMW as a whole (CCS or Polymerase) MapReadsCCS + // For the future , change the name of functions to be more descriptive + // noSplitSubreads is in essence unrolled - Polymerase read mode + // if (readIsCCS == false and params.mapSubreadsSeparately) { // (not readIsCCS and not -noSplitSubreads) MapReadsNonCCS(mapData, mappingBuffers, - smrtRead, smrtReadRC, ccsRead, subreads, + smrtRead, smrtReadRC, subreads, params, associatedRandInt, allReadAlignments, threadOut); } // End of if (readIsCCS == false and params.mapSubreadsSeparately). @@ -877,10 +877,7 @@ // Configure parameters for refining alignments. // MappingParameters params; - ReverseCompressIndex index; - pid_t parentPID; - pid_t *pids; - + CommandLineParser clp; clp.SetHelp(BlasrHelp(params)); clp.SetConciseHelp(BlasrConciseHelp()); @@ -909,7 +906,7 @@ cout << BlasrConciseHelp(); exit(1); // A failure. } - + int a, b; for (a = 0; a < 5; a++ ) { for (b = 0; b < 5; b++ ){ @@ -921,7 +918,7 @@ } } } - + if (params.scoreMatrixString != "") { if (StringToScoreMatrix(params.scoreMatrixString, SMRTDistanceMatrix) == false) { cout << "ERROR. The string " << endl @@ -938,7 +935,7 @@ exit(1); } } - + cerr << "[INFO] " << GetTimestamp() << " [blasr] started." << endl; params.MakeSane(); @@ -953,7 +950,7 @@ else { InitializeRandomGeneratorWithTime(); } - + // // Various aspects of timing are stored here. However this isn't // quite finished. 
@@ -979,13 +976,13 @@ } } - if (params.regionTableFileNames.size() != 0 and + if (params.regionTableFileNames.size() != 0 and params.regionTableFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of region table files as input files." << endl; exit(1); } - // If reading a separate ccs fofn, there is a 1-1 corresponence + // If reading a separate ccs fofn, there is a 1-1 corresponence // between ccs fofn and base file. if (params.readSeparateCcsFofn) { if (FileOfFileNames::IsFOFN(params.ccsFofnFileName)) { @@ -995,13 +992,11 @@ params.ccsFofnFileNames.push_back(params.ccsFofnFileName); } } - if (params.ccsFofnFileNames.size() != 0 and + if (params.ccsFofnFileNames.size() != 0 and params.ccsFofnFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of ccs files as input files." << endl; exit(1); } - - parentPID = getpid(); SequenceIndexDatabase seqdb; SeqBoundaryFtr seqBoundary(&seqdb); @@ -1021,14 +1016,14 @@ // Make sure the reads file exists and can be opened before // trying to read any of the larger data structures. // - + FASTASequence fastaGenome; T_Sequence genome; FASTAReader genomeReader; - // - // The genome is in normal FASTA, or condensed (lossy homopolymer->unipolymer) + // + // The genome is in normal FASTA, or condensed (lossy homopolymer->unipolymer) // format. Both may be read in using a FASTA reader. // if (!genomeReader.Init(params.genomeFileName)) { @@ -1054,8 +1049,7 @@ // // The genome may have extra spaces in the fasta name. Get rid of those. 
// - VectorIndex t; - for (t = 0; t < fastaGenome.titleLength; t++ ){ + for (int t = 0; t < fastaGenome.titleLength; t++ ){ if (fastaGenome.title[t] == ' ') { fastaGenome.titleLength = t; fastaGenome.title[t] = '\0'; @@ -1073,8 +1067,6 @@ DNASuffixArray sarray; TupleCountTable ct; - int listTupleSize; - ofstream outFile; outFile.exceptions(ostream::failbit); ofstream unalignedOutFile; @@ -1115,7 +1107,7 @@ else { params.listTupleSize = sarray.lookupPrefixLength; } - if (params.minMatchLength < sarray.lookupPrefixLength) { + if (params.minMatchLength < int(sarray.lookupPrefixLength)) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } @@ -1128,7 +1120,7 @@ } } - if (params.minMatchLength < sarray.lookupPrefixLength) { + if (params.minMatchLength < int(sarray.lookupPrefixLength)) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } @@ -1136,12 +1128,11 @@ // // It is required to have a tuple count table // for estimating the background frequencies - // for word matching. + // for word matching. // If one is specified on the command line, simply read - // it in. If not, this is operating under the mode + // it in. If not, this is operating under the mode // that everything is computed from scratch. // - long l; TupleMetrics saLookupTupleMetrics; if (params.useCountTable) { ifstream ctIn; @@ -1160,8 +1151,8 @@ ofstream titleTableOut; CrucialOpen(params.titleTableName, titleTableOut); // - // When using a sequence index database, the title table is simply copied - // from the sequencedb. 
+ // When using a sequence index database, the title table is simply copied + // from the sequencedb. // if (params.useSeqDB) { titleTable.Copy(seqdb.names, seqdb.nSeqPos-1); @@ -1170,7 +1161,7 @@ else { // // No seqdb, so there is just one sequence. Still the user specified a title - // table, so just the first sequence in the fasta file should be used. + // table, so just the first sequence in the fasta file should be used. // titleTable.Copy(&fastaGenome.title, 1); titleTable.ResetTableToIntegers(&genome.title, &genome.titleLength, 1); @@ -1198,7 +1189,7 @@ ofstream metricsOut, lcpBoundsOut; ofstream anchorFileStrm; ofstream clusterOut, *clusterOutPtr; - + if (params.anchorFileName != "") { CrucialOpen(params.anchorFileName, anchorFileStrm, std::ios::out); } @@ -1217,7 +1208,7 @@ CrucialOpen(params.outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } // otherwise, use bamWriter and initialize it later - } + } if (params.printHeader) { switch(params.printFormat) { @@ -1237,7 +1228,7 @@ CrucialOpen(params.unalignedFileName, unalignedFile, std::ios::out); unalignedFilePtr = &unalignedFile; } - + if (params.metricsFileName != "") { CrucialOpen(params.metricsFileName, metricsOut); } @@ -1246,7 +1237,7 @@ CrucialOpen(params.lcpBoundsFileName, lcpBoundsOut); // lcpBoundsOut << "pos depth width lnwidth" << endl; } - + // // Configure the mapping database. // @@ -1271,7 +1262,6 @@ // // Start the mapping jobs. // - int readsFileIndex = 0; if (params.subsample < 1) { InitializeRandomGeneratorWithTime(); reader = new ReaderAgglomerate(params.subsample); @@ -1282,7 +1272,7 @@ // In case the input is fasta, make all bases in upper case. reader->SetToUpper(); - + regionTableReader = new HDFRegionTableReader; RegionTable regionTable; // @@ -1295,47 +1285,77 @@ string commandLineString; // Restore command. 
clp.CommandLineToString(argc, argv, commandLineString); - + if (params.printSAM or params.printBAM) { string so = "UNKNOWN"; // sorting order; string version = GetVersion(); //blasr version; - SAMHeaderPrinter shp(so, seqdb, - params.queryFileNames, params.queryReadType, - params.samQVList, "BLASR", version, - commandLineString); + SAMHeaderPrinter shp(so, seqdb, + params.queryFileNames, params.queryReadType, + params.samQVList, "BLASR", version, + commandLineString); string headerString = shp.ToString();// SAM/BAM header if (params.printSAM) { + // this is not going to be executed since sam is printed via bam *outFilePtr << headerString; } else if (params.printBAM) { + // here both bam and sam are handled #ifdef USE_PBBAM PacBio::BAM::BamHeader header = PacBio::BAM::BamHeader(headerString); - // Both file name and SAMHeader are required in order to create a BamWriter. - bamWriterPtr = new PacBio::BAM::BamWriter(params.outFileName, header); + // Create bam header + // Both file name and SAMHeader are required in order to create a BamWriter. + // sam_via_bam changes + if (params.sam_via_bam) + { + bamWriterPtr = new PacBio::BAM::SamWriter(params.outFileName, header); + } + else + { + bamWriterPtr = new PacBio::BAM::BamWriter(params.outFileName, header); + } #else REQUIRE_PBBAM_ERROR(); #endif - } + } } - for (readsFileIndex = 0; readsFileIndex < params.queryFileNames.size(); readsFileIndex++ ){ + for (size_t readsFileIndex = 0; readsFileIndex < params.queryFileNames.size(); readsFileIndex++ ){ params.readsFileIndex = readsFileIndex; // // Configure the reader to use the correct read and region // file names. - // + // reader->SetReadFileName(params.queryFileNames[params.readsFileIndex]); + // if PBBAM , need to construct scrap file name and check if exist // // Initialize using already set file names. 
// - int initReturnValue = reader->Initialize(); + + + + // unrolled Need to pass unrolled option + // unrolled If not PBDATASET also need to construct scrap file name and + // test if it exists in the same directory, if not exit with error message + // + int initReturnValue; + + if ( ( (reader->GetFileType() == FileType::PBDATASET) || (reader->GetFileType() == FileType::PBBAM)) and not params.mapSubreadsSeparately) { + + if ( reader->GetFileType() == FileType::PBBAM ) { + reader->SetScrapsFileName(params.scrapsFileNames[params.readsFileIndex]); + } + initReturnValue = reader->Initialize(true); + } + else { + initReturnValue = reader->Initialize(); + } if (initReturnValue <= 0) { cerr << "WARNING! Could not open file " << params.queryFileNames[params.readsFileIndex] << endl; continue; } // Check whether use ccs only. - if (reader->GetFileType() == HDFCCSONLY) { + if (reader->GetFileType() == FileType::HDFCCSONLY) { params.useAllSubreadsInCcs = false; params.useCcs = params.useCcsOnly = true; } @@ -1373,7 +1393,7 @@ // Check to see if there is a region table. If there is a separate // region table, use that (over the region table in the bas // file). If there is a region table in the bas file, use that, - // without having to specify a region table on the command line. + // without having to specify a region table on the command line. // if (params.useRegionTable) { regionTable.Reset(); @@ -1383,23 +1403,25 @@ // // Check to see if there is a separate ccs fofn. If there is a separate - // ccs fofn, use that over the one in the bas file. + // ccs fofn, use that over the one in the bas file. // //if (params.readSeparateCcsFofn and params.useCcs) { // if (reader->SetCCS(params.ccsFofnFileNames[params.readsFileIndex]) == 0) { - // cout << "ERROR! Could not read the ccs file " + // cout << "ERROR! 
Could not read the ccs file " // << params.ccsFofnFileNames[params.readsFileIndex] << endl; // exit(1); // } // } - if (reader->GetFileType() != HDFCCS and - reader->GetFileType() != HDFBase and - reader->GetFileType() != HDFPulse and - reader->GetFileType() != BAM and + if (reader->GetFileType() != FileType::HDFCCS and + reader->GetFileType() != FileType::HDFBase and + reader->GetFileType() != FileType::HDFPulse and + reader->GetFileType() != FileType::PBBAM and + reader->GetFileType() != FileType::PBDATASET and params.concordant) { cerr << "WARNING! Option concordant is only enabled when " - << "input reads are in PacBio bax/pls.h5 or bam format." << endl; + << "input reads are in PacBio bax/pls.h5, bam or " + << "dataset xml format." << endl; params.concordant = false; } @@ -1415,7 +1437,7 @@ assert (initReturnValue > 0); if (params.nProc == 1) { - mapdb[0].Initialize(&sarray, &genome, &seqdb, &ct, &index, params, reader, &regionTable, + mapdb[0].Initialize(&sarray, &genome, &seqdb, &ct, params, reader, &regionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[0].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { @@ -1433,12 +1455,12 @@ } else { pthread_t *threads = new pthread_t[params.nProc]; - for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ + for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ // // Initialize thread-specific parameters. 
// + - - mapdb[procIndex].Initialize(&sarray, &genome, &seqdb, &ct, &index, params, reader, &regionTable, + + mapdb[procIndex].Initialize(&sarray, &genome, &seqdb, &ct, params, reader, &regionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[procIndex].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { @@ -1477,7 +1499,7 @@ } reader->Close(); } - + if (!reader) {delete reader; reader = NULL;} fastaGenome.Free(); @@ -1506,7 +1528,9 @@ #ifdef USE_PBBAM assert(bamWriterPtr); try { - bamWriterPtr->TryFlush(); + if (!params.sam_via_bam) { // no need to flush for SAM , but need to understand why + bamWriterPtr->TryFlush(); + } delete bamWriterPtr; bamWriterPtr = NULL; } catch (std::exception e) { diff -Nru blasr-0~20151014+git8e668be/configure.py blasr-5.3/configure.py --- blasr-0~20151014+git8e668be/configure.py 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/configure.py 2016-09-12 08:56:50.000000000 +0000 @@ -78,7 +78,7 @@ if key in envin: envout[key] = envin[key] def compose_defs_env(env): - # We disallow env overrides for anything with a default from GNU make. + # We disallow env overrides for some things with defaults from GNU make. nons = ['CXX', 'CC', 'AR'] # 'SHELL'? 
ovr = ['%-20s ?= %s' %(k, v) for k,v in env.items() if k not in nons] nonovr = ['%-20s := %s' %(k, v) for k,v in env.items() if k in nons] @@ -93,7 +93,8 @@ #setifenvf(env, envin, 'PREBUILT', get_PREBUILT) nondefaults = set([ 'CXX', - 'BLASR_INC', + 'CXXFLAGS', + 'NOPBBAM', 'LIBPBDATA_INC', 'LIBPBDATA_LIB', 'LIBPBDATA_LIBFLAGS', 'LIBPBIHDF_INC', 'LIBPBIHDF_LIB', 'LIBPBIHDF_LIBFLAGS', 'LIBBLASR_INC', 'LIBBLASR_LIB', 'LIBBLASR_LIBFLAGS', @@ -140,7 +141,6 @@ def set_defs_defaults(env, nopbbam, with_szlib): defaults = { - 'BLASR_INC': os.path.join(ROOT, 'include'), 'LIBBLASR_INC': os.path.join(ROOT, 'libcpp', 'alignment'), 'LIBPBDATA_INC': os.path.join(ROOT, 'libcpp', 'pbdata'), 'LIBPBIHDF_INC': os.path.join(ROOT, 'libcpp', 'hdf'), @@ -172,6 +172,8 @@ } if not nopbbam: defaults.update(pbbam_defaults) + else: + defaults['NOPBBAM'] = 1 szlib_defaults = { 'SZLIB_LIBFLAGS': '-lsz', #'ZLIB_LIBFLAGS': '-lz', # probably needed, but provided elsewhere @@ -196,7 +198,7 @@ parser.add_option('--no-pbbam', action='store_true', help='Avoid compiling anything which would need pbbam.') parser.add_option('--with-szlib', action='store_true', - help='If HDF5 was built with --with-szlib, then -lz is needed for static binaries.') + help='If HDF5 was built with --with-szlib, then -lsz is needed for static binaries.') parser.add_option('--submodules', action='store_true', help='Set variables to use our git-submodules, which must be pulled and built first. 
(Implies --no-pbbam.)') parser.add_option('--shared', action='store_true', diff -Nru blasr-0~20151014+git8e668be/cram.mk blasr-5.3/cram.mk --- blasr-0~20151014+git8e668be/cram.mk 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/cram.mk 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,39 @@ +FAST_CTESTS := \ +ctest/affineAlign.t ctest/bamOut.t ctest/ccsH5.t ctest/filtercriteria.t ctest/m0-5.t \ +ctest/aggressiveIntervalCut.t ctest/fofn.t ctest/multipart.t \ +ctest/alignScore.t ctest/hitpolicy.t ctest/noSplitSubreads.t \ +ctest/bamIn.t ctest/fastMaxInterval.t ctest/open_fail.t ctest/verbose.t ctest/deterministic.t + + +MILD_CTESTS := \ + ctest/bug25766.t ctest/holeNumbers.t + +SLOW_CTESTS := ctest/bug25328.t ctest/useccsallLargeGenome.t + +# XXX: following tests sidelined, needs bam input after --sam option removed +# FAST: ctest/ecoli.t +# MILD: ctest/useccsallBestN1.t ctest/concordant.t + +#BLASR_PATH=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/private/otherbins/internalall/bin/ +#export BLASR_PATH + + +cramfast: + cram -v --shell=/bin/bash ${FAST_CTESTS} + +crammild: + cram -v --shell=/bin/bash ${MILD_CTESTS} + +cramslow: + cram -v --shell=/bin/bash ${SLOW_CTESTS} + +cramtests: + cram -v --shell=/bin/bash ${FAST_CTESTS} ${MILD_CTESTS} ${SLOW_CTESTS} + +cramqu: + for test in ${FAST_CTESTS}; do \ + qsub -pe smp 15 -V -cwd -b y -N cramqu $@cram -v --shell=bin/bash $$test;\ + done + +clean: + rm -f cramqu.* ctest/*.err diff -Nru blasr-0~20151014+git8e668be/ctest/affineAlign.t blasr-5.3/ctest/affineAlign.t --- blasr-0~20151014+git8e668be/ctest/affineAlign.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/affineAlign.t 2016-09-12 08:56:50.000000000 +0000 @@ -3,15 +3,15 @@ Test affineAlign $ rm -rf $OUTDIR/affineAlign.m0 - $ $EXEC $DATDIR/affineAlign.fofn $DATDIR/substr_with_ins.fasta -m 0 -out $OUTDIR/affineAlign.m0 -affineAlign -holeNumbers 493 -insertion 100 -deletion 100 + $ $EXEC $DATDIR/affineAlign.fofn $DATDIR/substr_with_ins.fasta -m 0 
--out $OUTDIR/affineAlign.m0 --affineAlign --holeNumbers 493 --insertion 100 --deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/affineAlign.m0 $STDDIR/affineAlign_2014_06_10.m0 $ rm -rf $OUTDIR/ecoli_affine.m0 - $ $EXEC $DATDIR/ecoli_affine.fasta $DATDIR/ecoli_reference.fasta -m 0 -out $OUTDIR/ecoli_affine.m0 -affineAlign -insertion 100 -deletion 100 + $ $EXEC $DATDIR/ecoli_affine.fasta $DATDIR/ecoli_reference.fasta -m 0 --out $OUTDIR/ecoli_affine.m0 --affineAlign --insertion 100 --deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ecoli_affine.m0 $STDDIR/ecoli_affine_2014_06_10.m0 -# Note that MapQV for -affineAlign has been fixed in 2014 04 18, bug 24363 +# Note that MapQV for --affineAlign has been fixed in 2014 04 18, bug 24363 diff -Nru blasr-0~20151014+git8e668be/ctest/aggressiveIntervalCut.t blasr-5.3/ctest/aggressiveIntervalCut.t --- blasr-0~20151014+git8e668be/ctest/aggressiveIntervalCut.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/aggressiveIntervalCut.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,11 +1,11 @@ Set up $ . $TESTDIR/setup.sh -Test -aggressiveIntervalCut. +Test --aggressiveIntervalCut. $ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta - $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 -aggressiveIntervalCut + $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 --aggressiveIntervalCut [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 
diff -Nru blasr-0~20151014+git8e668be/ctest/alignScore.t blasr-5.3/ctest/alignScore.t --- blasr-0~20151014+git8e668be/ctest/alignScore.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/alignScore.t 2016-09-12 08:56:50.000000000 +0000 @@ -3,7 +3,7 @@ Test alignment score $ rm -rf $OUTDIR/testscore.m0 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -minReadLength 1 -m 0 -out $OUTDIR/testscore.m0 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta --minReadLength 1 -m 0 --out $OUTDIR/testscore.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/testscore.m0 $STDDIR/testscore.m0 diff -Nru blasr-0~20151014+git8e668be/ctest/bamConcordant.t blasr-5.3/ctest/bamConcordant.t --- blasr-0~20151014+git8e668be/ctest/bamConcordant.t 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/bamConcordant.t 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,33 @@ +Set up + $ . $TESTDIR/setup.sh + +Test using bam as input, use --concordant + $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/bamConcordantRef.fasta --bam --concordant --refineConcordantAlignments --bestn 1 --out $OUTDIR/bamConcordant.bam + [INFO]* (glob) + [INFO]* (glob) + +Check whether sam out and bam out have identical alignments, not checking qvs + $ $SAMTOOLS view $OUTDIR/bamConcordant.bam |cut -f 4 + 1 + 1 + 8??? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? (glob) + 86?? 
(glob) + + $ $EXEC /pbi/dept/secondary/siv/testdata/SA3-RS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam $DATDIR/lambda_ref.fasta -m 4 --concordant --bestn 1 --holeNumbers 17417 --out $OUTDIR/tmp.m4 -V 2 > $OUTDIR/bamConcordant.log + [INFO]* (glob) + [INFO]* (glob) + + $ grep "Concordant template" $OUTDIR/bamConcordant.log + Concordant template subread index: 8, 17417/14708_16595 diff -Nru blasr-0~20151014+git8e668be/ctest/bamIn.t blasr-5.3/ctest/bamIn.t --- blasr-0~20151014+git8e668be/ctest/bamIn.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/bamIn.t 2016-09-12 08:56:50.000000000 +0000 @@ -2,51 +2,20 @@ $ . $TESTDIR/setup.sh Test using bam as input - $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/tiny_bam_in.m4 + $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/tiny_bam_in.m4 [INFO]* (glob) [INFO]* (glob) Check whether blasr produces identical results taking fasta sequences of the bam as input - $ $EXEC $DATDIR/test_bam/tiny_fasta.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/tiny_fasta_in.m4 + $ $EXEC $DATDIR/test_bam/tiny_fasta.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/tiny_fasta_in.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/tiny_bam_in.m4 $OUTDIR/tiny_fasta_in.m4 -Test bam in, sam out - $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -sam -out $OUTDIR/tiny_bam_in.sam -printSAMQV -clipping subread -cigarUseSeqMatch - [INFO]* (glob) - [INFO]* (glob) - Test bam in, bam out - $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in.bam -clipping subread - [INFO]* (glob) - [INFO]* (glob) - -Check whether sam out and bam out have identical alignments, not checking qvs - $ $SAMTOOLS view -h $OUTDIR/tiny_bam_in.bam -o $OUTDIR/tiny_bam_in.bam.sam - $ cut -f 2-11 $OUTDIR/tiny_bam_in.bam.sam |sed -n '6,$p' > $TMP1.aln - $ cut -f 2-11 
$OUTDIR/tiny_bam_in.sam |sed -n '6,$p' > $TMP2.aln - $ diff $TMP1.aln $TMP2.aln - -Check whether sam out and bam out have identical read groups @RG - $ awk '/^@RG/' $OUTDIR/tiny_bam_in.bam.sam > $TMP1.rg - $ awk '/^@RG/' $OUTDIR/tiny_bam_in.sam > $TMP2.rg - $ diff $TMP1.rg $TMP2.rg - -Compare iq produced with stdout - $ sed -n '6,$p' $OUTDIR/tiny_bam_in.bam.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP1.iq - $ sed -n '6,$p' $STDDIR/$UPDATEDATE/tiny_bam_in.bam.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP2.iq - $ diff $TMP1.iq $TMP2.iq - -TODO:Check whether sam out and bam out have identical insertion qvs -Currently QVs in bam are in 'native' orientation, and QVs in sam are in 'genomic' orientation. This needs to be fixed. -$ sed -n '6,$p' $OUTDIR/tiny_bam_in.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP2.iq - -Test with multiple nproc - $ $EXEC $DATDIR/test_bam/two_bam.fofn $DATDIR/lambda_ref.fasta -bam -nproc 15 -out $OUTDIR/two_bam_in.bam + $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in.bam --clipping subread [INFO]* (glob) [INFO]* (glob) - $ $SAMTOOLS view -h $OUTDIR/two_bam_in.bam -o $OUTDIR/two_bam_in.bam.sam -TODO: test -concordant, when pbbam API to query over ZMWs is available. +TODO: test --concordant, when pbbam API to query over ZMWs is available. 
TODO: test bam with ccs reads diff -Nru blasr-0~20151014+git8e668be/ctest/bamOut.t blasr-5.3/ctest/bamOut.t --- blasr-0~20151014+git8e668be/ctest/bamOut.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/bamOut.t 2016-09-12 08:56:50.000000000 +0000 @@ -4,11 +4,11 @@ Test generating bam output Input is bam, clipping=soft and subread should produce identical results - $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in_soft.bam -clipping soft + $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in_soft.bam --clipping soft [INFO]* (glob) [INFO]* (glob) - $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in_subread.bam -clipping subread + $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in_subread.bam --clipping subread [INFO]* (glob) [INFO]* (glob) diff -Nru blasr-0~20151014+git8e668be/ctest/bug25328.t blasr-5.3/ctest/bug25328.t --- blasr-0~20151014+git8e668be/ctest/bug25328.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/bug25328.t 2016-09-12 08:56:50.000000000 +0000 @@ -5,7 +5,7 @@ $ INFA=$DATDIR/bug_25328_zmw_38131.fasta $ REF=$DATDIR/All4mers_circular_72x_l50256.fasta $ OUTFA=$OUTDIR/bug_25328.m4 - $ $EXEC $INFA $REF -bestn 1 -nCandidates 1 -forwardOnly -maxMatch 14 -m 4 -out $OUTFA + $ $EXEC $INFA $REF --bestn 1 --nCandidates 1 --forwardOnly --maxMatch 14 -m 4 --out $OUTFA [INFO]* (glob) [INFO]* (glob) diff -Nru blasr-0~20151014+git8e668be/ctest/bug25741.t blasr-5.3/ctest/bug25741.t --- blasr-0~20151014+git8e668be/ctest/bug25741.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/bug25741.t 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -Set up - $ . $TESTDIR/setup.sh - -bug_25741, if input bas.h5 does not contain mergeQV, blasr with -printSAMQV, -nproc>1 should not write garbage 'mq' values to output. 
- $ $EXEC $DATDIR/bas_wo_mergeQV.fofn $DATDIR/lambda_ref.fasta -printSAMQV -sam -clipping subread -out $OUTDIR/out_printSAMQV.sam -nproc 12 - [INFO]* (glob) - [INFO]* (glob) - $ grep 'mq' $OUTDIR/out_printSAMQV.sam |wc -l - 1 diff -Nru blasr-0~20151014+git8e668be/ctest/bug25766.t blasr-5.3/ctest/bug25766.t --- blasr-0~20151014+git8e668be/ctest/bug25766.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/bug25766.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,10 +1,10 @@ Set up $ . $TESTDIR/setup.sh -bug_25766, added an option -minRawSubreadScore +bug_25766, added an option --minRawSubreadScore $ BASFILE=$DATDIR/lambda_bax.fofn $ REF=$DATDIR/lambda_ref.fasta - $ $EXEC $BASFILE $REF -out $TMP1 -minRawSubreadScore 700 -nproc 18 + $ $EXEC $BASFILE $REF --out $TMP1 --minRawSubreadScore 700 --nproc 18 [INFO]* (glob) [INFO]* (glob) $ echo $? diff -Nru blasr-0~20151014+git8e668be/ctest/ccsH5.t blasr-5.3/ctest/ccsH5.t --- blasr-0~20151014+git8e668be/ctest/ccsH5.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/ccsH5.t 2016-09-12 08:56:50.000000000 +0000 @@ -3,9 +3,9 @@ Test using *.ccs.h5 as input # The results should be exactly the same as -# blasr $DATDIR/ccsasinput_bas.fofn $DATDIR/ccsasinput.fasta -m 4 -out tmp.m4 -useccsdenovo +# blasr $DATDIR/ccsasinput_bas.fofn $DATDIR/ccsasinput.fasta -m 4 --out tmp.m4 --useccsdenovo $ rm -rf $OUTDIR/ccsasinput.m4 - $ $EXEC $DATDIR/ccsasinput.fofn $DATDIR/ccsasinput.fasta -m 4 -out $OUTDIR/ccsasinput.m4 + $ $EXEC $DATDIR/ccsasinput.fofn $DATDIR/ccsasinput.fasta -m 4 --out $OUTDIR/ccsasinput.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ccsasinput.m4 $STDDIR/ccsasinput_2014_06_10.m4 diff -Nru blasr-0~20151014+git8e668be/ctest/cigarAdjecentIndels.t blasr-5.3/ctest/cigarAdjecentIndels.t --- blasr-0~20151014+git8e668be/ctest/cigarAdjecentIndels.t 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/cigarAdjecentIndels.t 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,22 @@ +Set up + $ . 
$TESTDIR/setup.sh + +Without --allowAdjacentIndels, adjacent indels should not exist in SAM/BAM CIGAR strings + $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/noAdjacentIndels.bam --concordant --refineConcordantAlignments --bestn 1 && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + + $ $SAMTOOLS view $OUTDIR/noAdjacentIndels.bam |cut -f 6 > $TMP1 + + $ grep 'ID' $TMP1 |wc -l + 0 + + $ grep 'DI' $TMP1 |wc -l + 0 + +With --allowAdjacentIndels + $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/allowAdjacentIndels.bam --concordant --bestn 1 --allowAdjacentIndels && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 diff -Nru blasr-0~20151014+git8e668be/ctest/concordant.t blasr-5.3/ctest/concordant.t --- blasr-0~20151014+git8e668be/ctest/concordant.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/concordant.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,11 +1,13 @@ Set up $ . $TESTDIR/setup.sh -Test -concordant +Test --concordant + $ rm -rf $OUTDIR/concordant_subset.bam $ rm -rf $OUTDIR/concordant_subset.sam - $ $EXEC $DATDIR/ecoli_lp.fofn $DATDIR/ecoli_reference.fasta -concordant -sam -out $OUTDIR/concordant_subset.sam -nproc 12 -holeNumbers 1-10000 -sa $DATDIR/ecoli_reference.sa + $ $EXEC $DATDIR/ecoli_lp.fofn $DATDIR/ecoli_reference.fasta --concordant --refineConcordantAlignments --bam --out $OUTDIR/concordant_subset.bam --nproc 12 --holeNumbers 1--10000 --sa $DATDIR/ecoli_reference.sa [INFO]* (glob) [INFO]* (glob) + $ $SAMTOOLS view $OUTDIR/concordant_subset.bam > $OUTDIR/concordant_subset.sam $ sed -n 6,110864p $OUTDIR/concordant_subset.sam > $OUTDIR/tmp1 $ sort $OUTDIR/tmp1 > $OUTDIR/tmp11 $ sed -n 6,110864p $STDDIR/$UPDATEDATE/concordant_subset.sam > $OUTDIR/tmp2 @@ -15,18 +17,19 @@ #2014_05_28 --> changelist 135254, use MAX_BAND_SIZE to contrain GuidedAlign #2014_08_21 --> changelist 138516, added YS, YE, ZM tags. 
#2014_08_28 --> changelist 139176, update SAM MD5 -#2014_09_12 --> changelist 140410, changed the default value of '-concordantTemplate' from 'longestsubread' to 'typicalsubread' +#2014_09_12 --> changelist 140410, changed the default value of '--concordantTemplate' from 'longestsubread' to 'typicalsubread' #2014_09_17 --> changelist 140573, changed SDPFragment LessThan to make sure blasr compiled with gcc 4.4 and 4.8 can produce identical results. -#2014_10_16 --> changelist 141378, changed the default value of '-concordantTemplate' from 'typicalsubread' to 'mediansubread' +#2014_10_16 --> changelist 141378, changed the default value of '--concordantTemplate' from 'typicalsubread' to 'mediansubread' #2015_03_01 --> changelist 146599, reads from the same movie should have unique readGroupId #2015_03_28 --> changelist 148101, 148080 updated read group id, 148100 updated TLEN #2015_04_09 --> changelist 148796, updated read group id #2015_04_25 --> changelist 149721, update CIGAR string, replace M with X=. +#2015_04_25 --> changelist ?, force refine all concordant alignments -Test -concordant FMR1 case (the 'typical subread' is selected as template for concordant mapping) +Test --concordant FMR1 case (the 'typical subread' is selected as template for concordant mapping) $ FOFN=$DATDIR/FMR1_concordant.fofn $ REF=$DATDIR/FMR1_130CGG.fasta - $ $EXEC $FOFN $REF -concordant -out $OUTDIR/FMR1_zmw_37927.m4 -m 4 -holeNumbers 37927 + $ $EXEC $FOFN $REF --concordant --refineConcordantAlignments --out $OUTDIR/FMR1_zmw_37927.m4 -m 4 --holeNumbers 37927 [INFO]* (glob) [INFO]* (glob) - $ diff $OUTDIR/FMR1_zmw_37927.m4 $STDDIR/FMR1_zmw_37927.m4 + $ diff $OUTDIR/FMR1_zmw_37927.m4 $STDDIR/$UPDATEDATE/FMR1_zmw_37927.m4 diff -Nru blasr-0~20151014+git8e668be/ctest/dataset.t blasr-5.3/ctest/dataset.t --- blasr-0~20151014+git8e668be/ctest/dataset.t 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/dataset.t 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,64 @@ +Set up + $ . 
$TESTDIR/setup.sh + +Test dataset.xml as input + $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta -m 4 --out $OUTDIR/chunking.m4 --bestn 1 && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 +Test filters in dataset.xml are respected. + $ cat $OUTDIR/chunking.m4 | wc -l + 9 + +Test dataset.xml --bam output + $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/chunking.bam && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + +Test dataset.xml --concordant + $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/chunking.concordant.bam --concordant && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + +Test dataset with no filters (to make sure that an empty filter does not discard all bam records.) + $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/nofilter.bam --concordant --bestn 1 && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + + $ $SAMTOOLS view $OUTDIR/nofilter.bam|wc -l + 131 + + +Test dataset with --concordant is on + $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/bamConcordantRef.fasta --bam --concordant --refineConcordantAlignments --bestn 1 --out $OUTDIR/datasetConcordant.bam --holeNumbers 1898 && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + + $ $SAMTOOLS view $OUTDIR/datasetConcordant.bam | cut -f 4 + ??? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? (glob) + 3?? 
(glob) diff -Nru blasr-0~20151014+git8e668be/ctest/deterministic.t blasr-5.3/ctest/deterministic.t --- blasr-0~20151014+git8e668be/ctest/deterministic.t 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/deterministic.t 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,47 @@ +Set up + $ . $TESTDIR/setup.sh + +Test blasr with input bam which has: +(1) insertionQV, deletionQV, deletionTag, substitutionQV, substitutionTag +(2) insertionQV, deletionQV, deletionTag +(3) no QV +and then check if output is deterministic. + +(1) + $ name=iq-dq-sub + $ infile=$DATDIR/test_bam/$name.subreads.bam + $ outfile=$OUTDIR/$name.m4 + $ stdfile=$STDDIR/$name.m4 + $ rm -f $outfile + $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile + $ diff $outfile $stdfile + +(2) + $ name=iq-dq + $ infile=$DATDIR/test_bam/$name.subreads.bam + $ outfile=$OUTDIR/$name.m4 + $ stdfile=$STDDIR/$name.m4 + $ rm -f $outfile + $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile + $ diff $outfile $stdfile + +(3) + $ name=no-iq-dq + $ infile=$DATDIR/test_bam/$name.subreads.bam + $ outfile=$OUTDIR/$name.m4 + $ stdfile=$STDDIR/$name.m4 + $ rm -f $outfile + $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile + $ diff $outfile $stdfile diff -Nru blasr-0~20151014+git8e668be/ctest/ecoli.t blasr-5.3/ctest/ecoli.t --- blasr-0~20151014+git8e668be/ctest/ecoli.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/ecoli.t 2016-09-12 08:56:50.000000000 +0000 @@ -2,18 +2,20 @@ $ . $TESTDIR/setup.sh Test blasr on ecoli. 
-Test blasr with -sam +Test blasr with --bam # The following job takes a very long time to finish, let us use a subset of reads instead #See $STDOUT/ecoli_v1.4.sam for 1.4 output. # $STDOUT/ecoli_2014_03_28.sam for bug before mapQV for affineAlign/align without QV is fixed. + $ rm -rf $OUTDIR/ecoli_subset.bam $ rm -rf $OUTDIR/ecoli_subset.sam - $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta -sam -out $OUTDIR/ecoli_subset.sam -nproc 15 + $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/ecoli_subset.bam --nproc 15 [INFO]* (glob) [INFO]* (glob) - $ sed -n '5,$ p' $OUTDIR/ecoli_subset.sam | sort | cut -f 1-11 > $TMP1 - $ sed -n '5,$ p' $STDDIR/$UPDATEDATE/ecoli_subset.sam | sort | cut -f 1-11 > $TMP2 + $ $SAMTOOLS view $OUTDIR/ecoli_subset.bam > $OUTDIR/ecoli_subset.sam + $ sed -n '5,$ p' $OUTDIR/ecoli_subset.sam | sort | cut -f 1--11 > $TMP1 + $ sed -n '5,$ p' $STDDIR/$UPDATEDATE/ecoli_subset.sam | sort | cut -f 1--11 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 # 2015_03_08 --> changelist 148101, 148080 updated read group id; 148100 updated TLEN diff -Nru blasr-0~20151014+git8e668be/ctest/fastMaxInterval.t blasr-5.3/ctest/fastMaxInterval.t --- blasr-0~20151014+git8e668be/ctest/fastMaxInterval.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/fastMaxInterval.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,11 +1,11 @@ Set up $ . $TESTDIR/setup.sh -Test -fastMaxInterval. +Test --fastMaxInterval. $ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta - $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 -fastMaxInterval + $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 --fastMaxInterval [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 
diff -Nru blasr-0~20151014+git8e668be/ctest/filtercriteria.t blasr-5.3/ctest/filtercriteria.t --- blasr-0~20151014+git8e668be/ctest/filtercriteria.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/filtercriteria.t 2016-09-12 08:56:50.000000000 +0000 @@ -7,12 +7,12 @@ $ STDDIR=$STDDIR/$NAME $ mkdir -p $OUTDIR -Test -minPctSimilarity +Test --minPctSimilarity $ I=$DATDIR/tiny_bam.fofn $ R=$DATDIR/lambdaNEB.fa $ O=$OUTDIR/min_pct_similarity_90.m4 - $ $EXEC $I $R -out $O -m 4 -minPctSimilarity 90 + $ $EXEC $I $R --out $O -m 4 --minPctSimilarity 90 [INFO]* (glob) [INFO]* (glob) $ echo $? @@ -21,10 +21,10 @@ 0 $ O=$OUTDIR/min_aln_len_1000.m4 - $ $EXEC $I $R -out $O -m 4 -minAlnLength 1000 + $ $EXEC $I $R --out $O -m 4 --minAlnLength 1000 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ wc -l $O |cut -f 1 -d ' ' - 12 + 14 diff -Nru blasr-0~20151014+git8e668be/ctest/fofn.t blasr-5.3/ctest/fofn.t --- blasr-0~20151014+git8e668be/ctest/fofn.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/fofn.t 2016-09-12 08:56:50.000000000 +0000 @@ -3,7 +3,7 @@ Test blasr with *.fofn input # $ rm -rf $OUTDIR/lambda_bax.m4 -# $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out lambda_bax_tmp.m4 -nproc 15 -minMatch 14 +# $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out lambda_bax_tmp.m4 --nproc 15 --minMatch 14 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_tmp.m4 > $OUTDIR/lambda_bax.m4 @@ -11,7 +11,7 @@ # This test takes a long time, use a subset instad. 
$ rm -rf $OUTDIR/lambda_bax_subset.m4 - $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/lambda_bax_tmp_subset.m4 -nproc 15 -minMatch 14 -holeNumbers 1-1000 -sa $DATDIR/lambda_ref.sa + $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/lambda_bax_tmp_subset.m4 --nproc 15 --minMatch 14 --holeNumbers 1--1000 --sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_tmp_subset.m4 > $OUTDIR/lambda_bax_subset.m4 diff -Nru blasr-0~20151014+git8e668be/ctest/.gitignore blasr-5.3/ctest/.gitignore --- blasr-0~20151014+git8e668be/ctest/.gitignore 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/.gitignore 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1 @@ +/out/ diff -Nru blasr-0~20151014+git8e668be/ctest/hitpolicy.t blasr-5.3/ctest/hitpolicy.t --- blasr-0~20151014+git8e668be/ctest/hitpolicy.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/hitpolicy.t 2016-09-12 08:56:50.000000000 +0000 @@ -13,7 +13,7 @@ $ X=$STDDIR/hitpolicy_all.m4 Test hitpolicy all - $ $EXEC $I $R -out $O -m 4 -hitPolicy all + $ $EXEC $I $R --out $O -m 4 --hitPolicy all [INFO]* (glob) [INFO]* (glob) $ echo $? @@ -24,7 +24,7 @@ Test hitpolicy allbest $ O=$OUTDIR/hitpolicy_allbest.m4 $ X=$STDDIR/hitpolicy_allbest.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy allbest && sort $O > $TMP1 && mv $TMP1 $O + $ $EXEC $I $R --out $O -m 4 --hitPolicy allbest && sort $O > $TMP1 && mv $TMP1 $O [INFO]* (glob) [INFO]* (glob) $ echo $? 
@@ -37,10 +37,10 @@ $ O=$OUTDIR/hitpolicy_random.m4 $ O2=$OUTDIR/hitpolicy_random_2.m4 $ X=$STDDIR/hitpolicy_random.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy random -randomSeed 1 + $ $EXEC $I $R --out $O -m 4 --hitPolicy random --randomSeed 1 [INFO]* (glob) [INFO]* (glob) - $ $EXEC $I $R -out $O2 -m 4 -hitPolicy random -randomSeed 1 + $ $EXEC $I $R --out $O2 -m 4 --hitPolicy random --randomSeed 1 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O @@ -52,10 +52,10 @@ $ O=$OUTDIR/hitpolicy_randombest_bam_in.m4 $ O2=$OUTDIR/hitpolicy_randombest_bam_in_2.m4 $ X=$STDDIR/hitpolicy_randombest_bam_in.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 + $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) - $ $EXEC $I $R -out $O2 -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 + $ $EXEC $I $R --out $O2 -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O @@ -67,7 +67,7 @@ $ I=$DATDIR/tiny_bax.fofn $ O=$OUTDIR/hitpolicy_randombest_bax_in.m4 $ X=$STDDIR/hitpolicy_randombest_bax_in.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 + $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O @@ -78,7 +78,7 @@ $ I=$DATDIR/tiny_fasta.fofn $ O=$OUTDIR/hitpolicy_randombest_fasta_in.m4 $ X=$STDDIR/hitpolicy_randombest_fasta_in.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 + $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O @@ -88,7 +88,7 @@ Test hitpolicy leftmost $ O=$OUTDIR/hitpolicy_leftmost.m4 $ X=$STDDIR/hitpolicy_leftmost.m4 - $ $EXEC $I $R -out $O -m 4 -hitPolicy leftmost -nproc 10 + $ $EXEC $I $R --out $O -m 4 --hitPolicy leftmost --nproc 10 [INFO]* (glob) [INFO]* (glob) $ # target is 
lambda x 6, leftmost -> only map to the very first x. diff -Nru blasr-0~20151014+git8e668be/ctest/holeNumbers.t blasr-5.3/ctest/holeNumbers.t --- blasr-0~20151014+git8e668be/ctest/holeNumbers.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/holeNumbers.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,9 +1,9 @@ Set up $ . $TESTDIR/setup.sh -Test -holeNumbers +Test --holeNumbers $ rm -f $OUTDIR/holeNumbers.m4 - $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/holeNumbers.m4 -holeNumbers 14798,55000-55100 -nproc 8 + $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/holeNumbers.m4 --holeNumbers 14798,55000--55100 --nproc 8 [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/holeNumbers.m4 > $TMP1 diff -Nru blasr-0~20151014+git8e668be/ctest/m0-5.t blasr-5.3/ctest/m0-5.t --- blasr-0~20151014+git8e668be/ctest/m0-5.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/m0-5.t 2016-09-12 08:56:50.000000000 +0000 @@ -3,31 +3,31 @@ Test blasr with -m 0 ~ 5 $ rm -rf $OUTDIR/read.m0 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 0 -out $OUTDIR/read.m0 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 0 --out $OUTDIR/read.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m0 $STDDIR/read.m0 $ rm -rf $OUTDIR/read.m1 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 1 -out $OUTDIR/read.m1 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 1 --out $OUTDIR/read.m1 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m1 $STDDIR/read_2014_05_29.m1 $ rm -rf $OUTDIR/read.m2 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 2 -out $OUTDIR/read.m2 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 2 --out $OUTDIR/read.m2 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m2 $STDDIR/read.m2 $ rm -rf $OUTDIR/read.m3 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 3 -out $OUTDIR/read.m3 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 3 --out $OUTDIR/read.m3 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m3 $STDDIR/read.m3 $ rm -rf 
$OUTDIR/read.m4 - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 4 -out $OUTDIR/read.m4 + $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 4 --out $OUTDIR/read.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m4 $STDDIR/read.m4 diff -Nru blasr-0~20151014+git8e668be/ctest/multipart.t blasr-5.3/ctest/multipart.t --- blasr-0~20151014+git8e668be/ctest/multipart.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/multipart.t 2016-09-12 08:56:50.000000000 +0000 @@ -6,7 +6,7 @@ $ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta - $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 + $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? diff -Nru blasr-0~20151014+git8e668be/ctest/noSplitSubreads.t blasr-5.3/ctest/noSplitSubreads.t --- blasr-0~20151014+git8e668be/ctest/noSplitSubreads.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/noSplitSubreads.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,9 +1,9 @@ Set up $ . $TESTDIR/setup.sh -Test blasr with -noSplitSubreads +Test blasr with --noSplitSubreads # $ rm -rf $OUTDIR/lambda_bax_noSplitSubreads.m4 -# $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -noSplitSubreads -m 4 -out lambda_bax_noSplitSubreads_tmp.m4 -nproc 15 +# $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --noSplitSubreads -m 4 --out lambda_bax_noSplitSubreads_tmp.m4 --nproc 15 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_noSplitSubreads_tmp.m4 > $OUTDIR/lambda_bax_noSplitSubreads.m4 @@ -11,7 +11,7 @@ # This test takes a long time, use a subset instad. 
$ rm -rf $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 - $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -noSplitSubreads -m 4 -out $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 -nproc 15 -holeNumbers 1-1000 -sa $DATDIR/lambda_ref.sa + $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --noSplitSubreads -m 4 --out $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 --nproc 15 --holeNumbers 1--1000 --sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 > $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 diff -Nru blasr-0~20151014+git8e668be/ctest/open_fail.t blasr-5.3/ctest/open_fail.t --- blasr-0~20151014+git8e668be/ctest/open_fail.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/open_fail.t 2016-09-12 08:56:50.000000000 +0000 @@ -4,5 +4,5 @@ If fail to open an bax/bas.h5 file because of unable to initialize required dataset, give an warning. $ $EXEC $DATDIR/open_fail_no_dyset.fofn $DATDIR/lambda_ref.fasta -m 4 [INFO]* (glob) - Could not open /mnt/secondary-siv/testdata/BlasrTestData/ctest/data/open_fail_no_dyset.fofn + Could not open /pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/open_fail_no_dyset.fofn [1] diff -Nru blasr-0~20151014+git8e668be/ctest/samNM.t blasr-5.3/ctest/samNM.t --- blasr-0~20151014+git8e668be/ctest/samNM.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/samNM.t 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -Set up - $ . 
$TESTDIR/setup.sh - -Test Sam out nm tag - $ rm -rf $OUTDIR/read.sam - $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -sam -out $OUTDIR/read.sam - [INFO]* (glob) - [INFO]* (glob) - $ tail -n+5 $OUTDIR/read.sam |cut -f 21 - NM:i:2 - NM:i:3 - NM:i:2 - NM:i:4 diff -Nru blasr-0~20151014+git8e668be/ctest/setup.sh blasr-5.3/ctest/setup.sh --- blasr-0~20151014+git8e668be/ctest/setup.sh 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/setup.sh 2016-09-12 08:56:50.000000000 +0000 @@ -1,12 +1,13 @@ # Set up directories CURDIR=$TESTDIR -REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest +REMOTEDIR=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest DATDIR=$REMOTEDIR/data OUTDIR=$CURDIR/out STDDIR=$REMOTEDIR/stdout # Set up the executable: blasr. -EXEC=$TESTDIR/../blasr +#EXEC=${BLASR_PATH}/blasr +EXEC=blasr # Define tmporary files TMP1=$OUTDIR/$$.tmp.out @@ -19,7 +20,7 @@ SAMTOOLS=/mnt/secondary/Smrtpipe/builds/Internal_Mainline_Nightly_LastSuccessfulBuild/analysis/bin/samtools #Update date -UPDATEDATE=2015_04_27 +UPDATEDATE=2015_11_09 # 2014_08_21 --> change 138516: added YS, YE, ZM tags # 2014_08_28 --> change 139176: Update SAM MD5 @@ -27,3 +28,4 @@ # 2015_04_09 --> change 148796: update read group id # 2015_04_25 --> change 149721, update CIGAR string, replace M with X= # 2015_04_26 --> change 149749, add opiton -cigarUseSeqMatch (default: false). If -cigarUseSeqMatch is turned on, CIGAR strings use '=' and 'X' to represent sequence match and mismatch instead of 'M'. +# 2015_11_05 --> change 166177, update CIGAR string, DO NOT allow adjacent indels unless -allowAdjacentIndels is ON. diff -Nru blasr-0~20151014+git8e668be/ctest/unaligned.t blasr-5.3/ctest/unaligned.t --- blasr-0~20151014+git8e668be/ctest/unaligned.t 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/ctest/unaligned.t 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,25 @@ +Set up + $ . 
$TESTDIR/setup.sh + +Test dataset.xml as input + $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --unaligned $OUTDIR/unaligned.txt --noPrintUnalignedSeqs --concordant 1>/dev/null && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + $ head -5 $OUTDIR/unaligned.txt + m150404_101626_42267_c100807920800000001823174110291514_s1_p0/55/0_1380 + m150404_101626_42267_c100807920800000001823174110291514_s1_p0/55/1432_3136 + m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/11699_11988 + m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/12033_13456 + m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/13519_14067 + + $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta --unaligned $OUTDIR/unaligned.txt --noPrintUnalignedSeqs 1>/dev/null && echo $? + [INFO]* (glob) + [INFO]* (glob) + 0 + $ head -5 $OUTDIR/unaligned.txt + m121004_000921_42130_c100440700060000001523060402151341_s1_p0/8/2724_3021 + m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/0_278 + m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/327_954 + m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/1004_1580 + m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/1625_2202 diff -Nru blasr-0~20151014+git8e668be/ctest/useccsallBestN1.t blasr-5.3/ctest/useccsallBestN1.t --- blasr-0~20151014+git8e668be/ctest/useccsallBestN1.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/useccsallBestN1.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,11 +1,12 @@ Set up $ . 
$TESTDIR/setup.sh -Test -useccsall with bestn = 1 - $ $EXEC $DATDIR/ccstest.fofn $DATDIR/ccstest_ref.fasta -bestn 1 -useccsall -sam -out $OUTDIR/useccsall.sam -holeNumbers 76772 +Test --useccsall with bestn = 1 + $ $EXEC $DATDIR/ccstest.fofn $DATDIR/ccstest_ref.fasta --bestn 1 --useccsall --bam --out $OUTDIR/useccsall.bam --holeNumbers 76772 [INFO]* (glob) [INFO]* (glob) - $ sed -n '9,$ p' $OUTDIR/useccsall.sam > $TMP1 - $ sed -n '9,$ p' $STDDIR/$UPDATEDATE/useccsall.sam > $TMP2 + $ $SAMTOOLS view $OUTDIR/useccsall.bam > $OUTDIR/useccsall.sam + $ sed -n '9,$ p' $OUTDIR/useccsall.sam |cut -f 1-4 > $TMP1 + $ sed -n '9,$ p' $STDDIR/$UPDATEDATE/useccsall.sam | cut -f 1-4 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 diff -Nru blasr-0~20151014+git8e668be/ctest/useccsallLargeGenome.t blasr-5.3/ctest/useccsallLargeGenome.t --- blasr-0~20151014+git8e668be/ctest/useccsallLargeGenome.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/useccsallLargeGenome.t 2016-09-12 08:56:50.000000000 +0000 @@ -1,13 +1,13 @@ Set up $ . $TESTDIR/setup.sh -Test -useccsall with Large genome. +Test --useccsall with Large genome. $ BASFILE=/mnt/data3/vol53/2450530/0014/Analysis_Results/m130507_052228_42161_c100519212550000001823079909281305_s1_p0.3.bax.h5 $ REFDIR=/mnt/secondary/Smrtpipe/repository/hg19_M_sorted/sequence $ REFFA=$REFDIR/hg19_M_sorted.fasta $ REFSA=$REFDIR/hg19_M_sorted.fasta.sa $ OUTFILE=$OUTDIR/intflow.m4 - $ $EXEC $BASFILE $REFFA -out $OUTFILE -m 4 -sa $REFSA -holeNumbers 109020 + $ $EXEC $BASFILE $REFFA --out $OUTFILE -m 4 --sa $REFSA --holeNumbers 109020 [INFO]* (glob) [INFO]* (glob) $ sort $OUTFILE > $TMP1 && sort $STDDIR/intflow_2014_06_10.m4 > $TMP2 && diff $TMP1 $TMP2 && echo $? diff -Nru blasr-0~20151014+git8e668be/ctest/verbose.t blasr-5.3/ctest/verbose.t --- blasr-0~20151014+git8e668be/ctest/verbose.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/ctest/verbose.t 2016-09-12 08:56:50.000000000 +0000 @@ -2,7 +2,7 @@ $ . 
$TESTDIR/setup.sh Test alignment score - $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -holeNumbers 1-200 -V 3 > $TMP1 + $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --holeNumbers 1--200 -V 3 > $TMP1 [INFO]* (glob) [INFO]* (glob) $ echo $? diff -Nru blasr-0~20151014+git8e668be/debian/changelog blasr-5.3/debian/changelog --- blasr-0~20151014+git8e668be/debian/changelog 2015-12-15 03:52:44.000000000 +0000 +++ blasr-5.3/debian/changelog 2016-11-29 17:33:15.000000000 +0000 @@ -1,3 +1,25 @@ +blasr (5.3-1build1) zesty; urgency=medium + + * Rebuild against new hdf5 + + -- Iain Lane Tue, 29 Nov 2016 17:33:15 +0000 + +blasr (5.3-1) unstable; urgency=medium + + * Imported Upstream version 5.3 + (upstream git revision 1baabc8 from 2016-09-12) + * Find PB libraries in the default LD path + * Bump Standards-Version to 3.9.8 + * Use encrypted protocols for VCS + * Update email address and copyright year + * Update paths to pbseqlib headers + * Remove unused lintian overrides + * Update debian/TODO + * Add d/NEWS to explain interface change + * d/rules: Minor clean-ups + + -- Afif Elghraoui Sun, 23 Oct 2016 18:21:55 -0700 + blasr (0~20151014+git8e668be-1) unstable; urgency=high * Don't install LICENSES.txt file diff -Nru blasr-0~20151014+git8e668be/debian/control blasr-5.3/debian/control --- blasr-0~20151014+git8e668be/debian/control 2015-12-13 21:59:18.000000000 +0000 +++ blasr-5.3/debian/control 2016-10-24 00:40:38.000000000 +0000 @@ -2,7 +2,7 @@ Section: science Priority: optional Maintainer: Debian Med Packaging Team -Uploaders: Afif Elghraoui +Uploaders: Afif Elghraoui Build-Depends: debhelper (>= 9), python, @@ -11,10 +11,10 @@ libblasr-dev, libpbdata-dev, libpbihdf-dev, -Standards-Version: 3.9.6 +Standards-Version: 3.9.8 Homepage: https://github.com/PacificBiosciences/blasr -Vcs-Git: git://anonscm.debian.org/debian-med/blasr.git -Vcs-Browser: http://anonscm.debian.org/cgit/debian-med/blasr.git +Vcs-Git: 
https://anonscm.debian.org/git/debian-med/blasr.git +Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/blasr.git Package: blasr Architecture: any diff -Nru blasr-0~20151014+git8e668be/debian/copyright blasr-5.3/debian/copyright --- blasr-0~20151014+git8e668be/debian/copyright 2015-12-13 20:42:12.000000000 +0000 +++ blasr-5.3/debian/copyright 2016-07-24 23:35:30.000000000 +0000 @@ -4,11 +4,11 @@ Source: https://github.com/PacificBiosciences/blasr Files: * -Copyright: 2011-2015 Pacific Biosciences of California, Inc. +Copyright: 2011-2016 Pacific Biosciences of California, Inc. License: PacBio-BSD-3-Clause Files: debian/* -Copyright: 2015 Afif Elghraoui +Copyright: 2015-2016 Afif Elghraoui License: PacBio-BSD-3-Clause License: PacBio-BSD-3-Clause diff -Nru blasr-0~20151014+git8e668be/debian/lintian-overrides blasr-5.3/debian/lintian-overrides --- blasr-0~20151014+git8e668be/debian/lintian-overrides 2015-12-15 03:51:33.000000000 +0000 +++ blasr-5.3/debian/lintian-overrides 2016-10-24 00:26:17.000000000 +0000 @@ -1,6 +1,2 @@ -# libblasr/libpbdata/libpbihdf (pbseqlib) are internal libraries -# used only by blasr and pbdagcon -binary-or-shlib-defines-rpath usr/bin/* /usr/lib/*/pbdata -binary-or-shlib-defines-rpath usr/bin/* /usr/lib/*/pbihdf # False positive - CPPFLAGS are passed hardening-no-fortify-functions usr/bin/blasr diff -Nru blasr-0~20151014+git8e668be/debian/NEWS blasr-5.3/debian/NEWS --- blasr-0~20151014+git8e668be/debian/NEWS 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/debian/NEWS 2016-10-24 02:19:09.000000000 +0000 @@ -0,0 +1,6 @@ +blasr (5.3-1) unstable; urgency=medium + + The blasr command line interface has changed. Long options are now + specified using double dashes. 
+ + -- Afif Elghraoui Sun, 24 Jul 2016 16:34:19 -0700 diff -Nru blasr-0~20151014+git8e668be/debian/rules blasr-5.3/debian/rules --- blasr-0~20151014+git8e668be/debian/rules 2015-12-15 03:29:44.000000000 +0000 +++ blasr-5.3/debian/rules 2016-10-24 02:19:28.000000000 +0000 @@ -1,19 +1,15 @@ #!/usr/bin/make -f -DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk export HDF5_INC=/usr/include/hdf5/serial export HDF5_LIB=/usr/lib/$(DEB_HOST_MULTIARCH)/hdf5/serial -export LIBBLASR_INC=/usr/include/blasr -export LIBBLASR_LIB=/usr/lib/$(DEB_HOST_MULTIARCH)/blasr -export LIBPBDATA_INC=/usr/include/pbdata -export LIBPBDATA_LIB=/usr/lib/$(DEB_HOST_MULTIARCH)/pbdata -export LIBPBIHDF_INC=/usr/include/pbihdf -export LIBPBIHDF_LIB=/usr/lib/$(DEB_HOST_MULTIARCH)/pbihdf +PBSEQ_BASE=/usr/include/pbseq +export LIBBLASR_INC=$(PBSEQ_BASE)/alignment +export LIBPBDATA_INC=$(PBSEQ_BASE)/pbdata +export LIBPBIHDF_INC=$(PBSEQ_BASE)/hdf -export LD_RUN_PATH=$(LIBBLASR_LIB):$(LIBPBDATA_LIB):$(LIBPBIHDF_LIB) %: dh $@ --parallel @@ -22,7 +18,7 @@ ./configure.py --no-pbbam --shared # Tests require data not available in the source distribution -override_dh_auto_test: +override_dh_auto_test: ; override_dh_auto_clean: override_dh_auto_configure $(MAKE) clean diff -Nru blasr-0~20151014+git8e668be/debian/TODO blasr-5.3/debian/TODO --- blasr-0~20151014+git8e668be/debian/TODO 2015-07-31 04:46:08.000000000 +0000 +++ blasr-5.3/debian/TODO 2016-10-24 00:40:13.000000000 +0000 @@ -1,4 +1,7 @@ * Test the get-orig-source script -* Package cram to be able to run tests - https://bitbucket.org/brodie/cram +* Update manpages + +* build bax2bam and bam2bax + +* Look into building with pbbam diff -Nru blasr-0~20151014+git8e668be/extrautils/BasH5Simulator.cpp blasr-5.3/extrautils/BasH5Simulator.cpp --- blasr-0~20151014+git8e668be/extrautils/BasH5Simulator.cpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/BasH5Simulator.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,524 +0,0 @@ -#include 
-#include -#include -#include "utils.hpp" -#include "Enumerations.h" -#include "DNASequence.hpp" -#include "FASTAReader.hpp" -#include "CommandLineParser.hpp" -#include "metagenome/TitleTable.hpp" -#include "metagenome/FindRandomSequence.hpp" -#include "statistics/StatUtils.hpp" -#include "HDFBasWriter.hpp" -#include "HDFRegionTableWriter.hpp" -#include "simulator/LengthHistogram.hpp" -#include "simulator/OutputSampleListSet.hpp" -#include "simulator/ContextOutputList.hpp" - -using namespace std; - -void SetHelp(string & str) { - stringstream helpStream; - helpStream - << "usage: alchemy outputModel [ options ]" << endl - << " options: " << endl - << " -genome genome.fasta" << endl - << " Simulate reads from the reference genome 'genome.fasta'." << endl << endl - << " -numBasesPerFile numBasesPerFile" << endl - << " Limit the number of bases per output file to this." << endl << endl - << " -sourceReads filename " << endl - << " When set, simulate reads by reading from 'filename', " << endl - << " rather than simulating from a genome." << endl - << " The format of the fasta titles should be >read_index|chr|start_pos|end_pos" << endl << endl - << " -lengthModel" << endl - << " Use lengths from the alchemy model, rather than the read length. This " << endl - << " is used in conjunction with the sourceReadsFile, to modulate the lenghts" << endl - << " of the reads." << endl << endl - << " -fixedLength length " << endl - << " Set simulated read length to a fixed value of 'length', rather than " << endl - << " sampling from a length mode." << endl - << " -movieName name (\"simulated_movie\")" << endl - << " Use 'name' for movies rather than m000_000..." 
<< endl << endl - << " -titleTable name" < movieNames; - bool useLengthModel = false; - bool useFixedLength = false; - ofstream posMapFile; - int scaledLength = 0; - int fixedLength = 0; - int nBasFiles = 1; - bool useLengthsModel = true; - bool printHelp = false; - - - // Look to see if the refAsReads flag is specified anywhere before - // parsing the command line. - - CommandLineParser clp; - string commandLine; - string helpString; - SetHelp(helpString); - vector fns; - - clp.RegisterStringOption("genome", &refGenomeFileName, ""); - clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "", - CommandLineParser::PositiveInteger); - clp.RegisterStringOption("sourceReads", &sourceReadsFileName, ""); - clp.RegisterStringOption("lengthModel", &lengthModelFileName, ""); - clp.RegisterIntOption("fixedLength", &fixedLength, "", - CommandLineParser::PositiveInteger); - clp.RegisterFlagOption("lengthModel", &useLengthModel, ""); - clp.RegisterStringOption("movieName", &movieName, ""); - clp.RegisterStringOption("titleTable", &titleTableFileName, ""); - clp.RegisterStringOption("baseFileName", &basH5BaseFileName, ""); - clp.RegisterIntOption("nFiles", &nBasFiles, "", - CommandLineParser::PositiveInteger); - clp.RegisterIntOption("meanLength", &scaledLength, "", - CommandLineParser::PositiveInteger); - clp.RegisterStringOption("posMap", &posMapFileName, ""); - clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, ""); - clp.RegisterFlagOption("h", &printHelp, ""); - - clp.SetHelp(helpString); - clp.ParseCommandLine(argc, argv, fns); - clp.CommandLineToString(argc, argv, commandLine); - - clp.SetProgramName("alchemy"); - - outputModelFileName = fns[0]; - if (argc <= 1 or printHelp or outputModelFileName == "") { - cout << helpString << endl; - exit(0); - } - - if (usePosMap) { - CrucialOpen(posMapFileName, posMapFile, std::ios::out); - } - - if (sourceReadsFileName == "" and fixedLength == 0) { - useLengthModel = true; - } - - if (useLengthModel and 
fixedLength != 0) { - cout << "ERROR! You must either use a length model or a fixed length." << endl; - exit(1); - } - - if (sourceReadsFileName == "" and numBasesPerFile == 0) { - cout << "ERROR! You must specify either a set of read to use as " << endl - << "original reads for simulation or the total number of bases " << endl - << "to simulate in each bas.h5 file." << endl; - exit(1); - } - - if (sourceReadsFileName == "" and refGenomeFileName == "") { - cout << "ERROR! You must specify a genome to sample reads from or a set of read "< alignmentLengths; - int meanAlignmentLength; - - - if (scaledLength != 0 and useLengthModel) { - // - // Scale the histogram so that the average length is 'scaledLength'. - // - - // 1. Integrate histogram - long totalLength = 0; - long totalSamples = 0; - int hi; - for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) { - int ni; - ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi]; - totalLength += ni * lengthHistogram.lengthHistogram.data[hi]; - } - totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1]; - - float meanSampleLength = totalLength / (1.0*totalSamples); - float fractionIncrease = scaledLength / meanSampleLength; - - for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) { - lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease; - } - } - - FASTAReader inReader, seqReader; - vector reference; - DNALength refLength = 0; - int i; - if (refGenomeFileName != "") { - inReader.Init(refGenomeFileName); - inReader.ReadAllSequences(reference); - - for (i = 0; i < reference.size(); i++) { - refLength += reference[i].length; - } - } - - if (sourceReadsFileName != "") { - seqReader.Init(sourceReadsFileName); - } - - ofstream readsFile; - - // - // Create and simulate bas.h5 files. 
- // - int baseFileIndex; - bool readsRemain = true; - for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles) // case 1 is reads are generated by file - or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file. - baseFileIndex++) { - // - // Prep the base file for writing. - // - stringstream fileNameStrm, movieNameStrm; - //string movieName = "m000000_000000_00000_cSIMULATED_s"; - movieNameStrm << movieName << baseFileIndex << "_p0"; - string fullMovieName = movieNameStrm.str(); - fileNameStrm << fullMovieName << ".bas.h5"; - - - HDFBasWriter basWriter; - HDFRegionTableWriter regionWriter; - // - // This is mainly used to create the atributes. - // - RegionTable regionTable; - regionTable.CreateDefaultAttributes(); - - basWriter.SetPlatform(Springfield); - // - // Use a fixed set of fields for now. - // - - // These are all pulled from the outputModel. - basWriter.IncludeField("Basecall"); - basWriter.IncludeField("QualityValue"); - basWriter.IncludeField("SubstitutionQV"); - basWriter.IncludeField("SubstitutionTag"); - basWriter.IncludeField("InsertionQV"); - basWriter.IncludeField("DeletionQV"); - basWriter.IncludeField("DeletionTag"); - basWriter.IncludeField("WidthInFrames"); - basWriter.IncludeField("PreBaseFrames"); - basWriter.IncludeField("PulseIndex"); - - vector qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag; - vector widthInFrames, preBaseFrames, pulseIndex; - - // Just go from 0 .. hole Number - basWriter.IncludeField("HoleNumber"); - // Fixed to 0. 
- basWriter.IncludeField("HoleXY"); - if (usePosMap == false) { - basWriter.IncludeField("SimulatedSequenceIndex"); - basWriter.IncludeField("SimulatedCoordinate"); - } - basWriter.SetChangeListID("1.3.0.50.104380"); - - - DNALength numSimulatedBases = 0; - FASTASequence sampleSeq; - //sampleSeq.length = readLength; - int maxRetry = 10000000; - int retryNumber = 0; - int numReads = 0; - int readLength = 0; - - while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) { - DNALength seqIndex, seqPos; - if (useLengthModel or fixedLength) { - if (useLengthModel) { - lengthHistogram.GetRandomLength(readLength); - } - else { - readLength = fixedLength; - } - } - if (refGenomeFileName != "") { - FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1)); - sampleSeq.seq = &reference[seqIndex].seq[seqPos]; - sampleSeq.length = readLength + (outputModel.keyLength - 1); - assert(reference[seqIndex].length >= sampleSeq.length); - } - else if (sourceReadsFileName != "") { - if (seqReader.GetNext(sampleSeq) == false) { - readsRemain = false; - break; - } - if (sampleSeq.length < outputModel.keyLength) { - continue; - } - // - // Now attempt to parse the position from the fasta title. - // - - if (useLengthModel) { - int tryNumber = 0; - readLength = 0; - int maxNTries = 1000; - int tryBuffer[5] = {-1,-1,-1,-1,-1}; - while (tryNumber < maxNTries and readLength < outputModel.keyLength) { - lengthHistogram.GetRandomLength(readLength); - readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength); - tryBuffer[tryNumber%5] = readLength; - tryNumber++; - } - if (tryNumber >= maxNTries) { - cout << "ERROR. 
Could not generate a read length greater than the " << outputModel.keyLength << " requried " < tokens; - Splice(sampleSeq.title, "|", tokens); - if (tokens.size() == 4) { - seqPos = atoi(tokens[2].c_str()); - if (titleTableFileName == "") { - seqIndex = 0; - } - else { - int index; - titleTable.Lookup(tokens[1], index); - seqIndex = index; - } - } - else { - seqPos = 0; - } - } - - // - // If this is the first read printed to the base file, initialize it. - // - if (numSimulatedBases == 0) { - basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield); - regionWriter.Initialize(basWriter.pulseDataGroup); - } - - numSimulatedBases += readLength; - - int p; - // create the sample sequence - int contextLength = outputModel.keyLength; - int contextMiddle = contextLength / 2; - string outputString; - - int nDel = 0; - int nIns = 0; - - // - // Simulate to beyond the sample length. - // - qualityValue.clear(); - substitutionQV.clear(); - substitutionTag.clear(); - insertionQV.clear(); - deletionQV.clear(); - deletionTag.clear(); - pulseIndex.clear(); - widthInFrames.clear(); - preBaseFrames.clear(); - assert(sampleSeq.length > contextMiddle + 1); - for (p = contextMiddle; - p < sampleSeq.length - contextMiddle - 1; p++) { - string refContext; - refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength); - - string outputContext; - int contextWasFound; - OutputSample sample; - int i; - for (i = 0; i < refContext.size(); i++) { refContext[i] = toupper(refContext[i]);} - outputModel.SampleRandomSample(refContext, sample); - - if (sample.type == OutputSample::Deletion ) { - // - // There was a deletion. Advance in reference, then output - // the base after the deletion. - // - p++; - ++nDel; - } - - int cp; - // - // Add the sampled context, possibly multiple characters because of an insertion. 
- // - for (i = 0; i < sample.nucleotides.size(); i++) { - outputString.push_back(sample.nucleotides[i]); - qualityValue.push_back(sample.qualities[i].qv[0]); - deletionQV.push_back(sample.qualities[i].qv[1]); - insertionQV.push_back(sample.qualities[i].qv[2]); - substitutionQV.push_back(sample.qualities[i].qv[3]); - deletionTag.push_back(sample.qualities[i].tags[0]); - substitutionTag.push_back(sample.qualities[i].tags[1]); - pulseIndex.push_back(sample.qualities[i].frameValues[0]); - preBaseFrames.push_back(sample.qualities[i].frameValues[1]); - widthInFrames.push_back(sample.qualities[i].frameValues[2]); - } - nIns += sample.qualities.size() - 1; - } - if (outputString.find('N') != outputString.npos or - outputString.find('n') != outputString.npos) { - cout << "WARNING! The sampled string " << endl << outputString << endl - << "should not contain N's, but it seems to. This is being ignored "<(4)); - string seqString; - seqString.assign((char*)ccsRead.seq, ccsRead.length); - - bamRecord.Impl().SetSequenceAndQualities(seqString, ccsRead.qual.ToString()); - // bamRecord.Impl().CigarData(Cigar::FromStdString("*")); - bamRecord.Impl().Bin(0); - bamRecord.Impl().InsertSize(0); - bamRecord.Impl().MatePosition(static_cast(-1)); - bamRecord.Impl().MateReferenceId(static_cast(-1)); - bamRecord.Impl().Position(static_cast(-1)); - bamRecord.Impl().ReferenceId(static_cast(-1)); - TagCollection tags; - tags["RG"] = ccsRead.GetReadGroupId(); - tags["np"] = ccsRead.numPasses; - tags["zm"] = ccsRead.zmwData.holeNumber; - tags["qs"] = 0; - tags["qe"] = ccsRead.length; - - samQVList.FormatQVOptionalFields(ccsRead); - // Add QVs to BamRecordImpl. 
- string insertionQVs, deletionQVs, substitutionQVs, mergeQVs, substitutionTags, deletionTags; - if (ccsRead.GetQVs("InsertionQV", insertionQVs)) { - tags["iq"] = insertionQVs; - } - if (ccsRead.GetQVs("DeletionQV", deletionQVs)) { - tags["dq"] = deletionQVs; - } - if (ccsRead.GetQVs("SubstitutionQV", substitutionQVs)) { - tags["sq"] = substitutionQVs; - } - if (ccsRead.GetQVs("MergeQV", mergeQVs)) { - tags["mq"] = mergeQVs; - } - // substitutionTag is not included by default - if (ccsRead.GetQVs("DeletionTag", deletionTags)) { - tags["dt"] = deletionTags; - } - bamRecord.Impl().Tags(tags); -} - -int main(int argc, char* argv[]) { - string progName = "ccsh5tobam"; - CommandLineParser clp; - clp.SetHelp("Convert ccs.h5 to bam.\n" + DISCLAIM); - clp.SetConciseHelp("ccsh5tobam ccs.h5|fofn out.bam\n" + DISCLAIM); - clp.SetProgramName(progName); - clp.SetVersion(GetVersion()); - string fofn, bamOutName; - clp.RegisterStringOption("in.ccs.h5", &fofn, "Input ccs.h5|fofn file.", true); - clp.RegisterStringOption("out.bam", &bamOutName, "Output bam file.", true); - clp.RegisterPreviousFlagsAsHidden(); - clp.ParseCommandLine(argc, argv); - - //cerr << "[INFO] " << GetTimestamp() << " [" << progName << "] started." << endl; - - vector ccsFileNames; - FileOfFileNames::StoreFileOrFileList(fofn, ccsFileNames); - - string so = "UNKNOWN"; // sorting order; - string version = GetVersion(); - string commandLineString; - clp.CommandLineToString(argc, argv, commandLineString); - - SupplementalQVList samQVList; - samQVList.SetDefaultQV(); - SequenceIndexDatabase seqdb; - SAMHeaderPrinter shp(so, seqdb, - ccsFileNames, ReadType::ReadTypeEnum::CCS, - samQVList, "ccsh52bam", version, - commandLineString); - string headerString = shp.ToString();// SAM/BAM header - - BamHeader header = BamHeader(headerString); - // Both file name and SAMHeader are required in order to create a BamWriter. 
- BamWriter * bamWriterPtr = new BamWriter(bamOutName, header); - - for (string ccsFileName: ccsFileNames) { - ReaderAgglomerate reader; - reader.SetReadFileName(ccsFileName); - reader.SetReadType(ReadType::ReadTypeEnum::CCS); - - // Initialize using already set file names. - int initReturnValue = reader.Initialize(); - if (initReturnValue <= 0) { - cerr << "WARNING! Could not open file " << ccsFileName << endl; - continue; - } - - // Check whether use ccs only. - assert (reader.GetFileType() == HDFCCSONLY); - int randint = 0; - CCSSequence ccsRead; - while(reader.GetNext(ccsRead, randint) != 0) { - if (ccsRead.length > 0) { - BamRecord bamRecord; - CCSReadToBamRecord(ccsRead, bamRecord, samQVList); - bamWriterPtr->Write(bamRecord); - } - } - } - - try { - bamWriterPtr->TryFlush(); - delete bamWriterPtr; - bamWriterPtr = NULL; - } catch (std::exception e) { - cout << "Error, could not flush bam records to bam file." << endl; - exit(1); - } - - //cerr << "[INFO] " << GetTimestamp() << " [" << progName << "] ended." << endl; - return 0; -} diff -Nru blasr-0~20151014+git8e668be/extrautils/ctest/alchemy.t blasr-5.3/extrautils/ctest/alchemy.t --- blasr-0~20151014+git8e668be/extrautils/ctest/alchemy.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/ctest/alchemy.t 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ -Set up - $ . $TESTDIR/setup.sh - -Set up the executable: alchemy. - $ EXEC=$TESTDIR/../alchemy - -test_alchemy.cmp.h5 was generated by -pbalign.py $DATDIR/test_alchemy_read.fa $DATDIR/test_alchemy_ref.fa test_alchemy.cmp.h5 - -$ ./cmpH5StoreQualityByContext $DATDIR/test_alchemy.cmp.h5 $OUTDIR/test_alchemy.qbc -contextLength 3 - - $ $EXEC $DATDIR/ecoli_out.qbc -genome $DATDIR/ecoli_reference.fasta -numBasesPerFile 100000 -baseFileName 'this_bas_file' -movieName $OUTDIR/alchemy_ - $ echo $? - 0 - -pls2fasta can be successfully applied to the simulated bas.h5 file. 
-$ pls2fasta *.bas.h5 $OUTDIR/test_alchemy_pls2fasta.fa -[INFO] * [pls2fasta] started. (glob) -[INFO] * [pls2fasta] ended. (glob) - diff -Nru blasr-0~20151014+git8e668be/extrautils/ctest/ccsh5tobam.t blasr-5.3/extrautils/ctest/ccsh5tobam.t --- blasr-0~20151014+git8e668be/extrautils/ctest/ccsh5tobam.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/ctest/ccsh5tobam.t 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -Set up - $ . $TESTDIR/setup.sh - -Set up the executable: ccsh5tobam - $ EXEC=$TESTDIR/../ccsh5tobam - $ SMRTWRAP=/mnt/secondary/Smrtpipe/builds/Internal_Mainline_Nightly_LastSuccessfulBuild/smrtcmds/bin/smrtwrap - - $ $SMRTWRAP python $SCRIPTDIR/test_ccsh5tobam.py $EXEC $DATDIR/test_ccsh5tobam/input.fofn $OUTDIR/test_ccsh5tobam.bam - $ echo $? - 0 diff -Nru blasr-0~20151014+git8e668be/extrautils/ctest/printTupleCountTable.t blasr-5.3/extrautils/ctest/printTupleCountTable.t --- blasr-0~20151014+git8e668be/extrautils/ctest/printTupleCountTable.t 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/ctest/printTupleCountTable.t 2016-09-12 08:56:50.000000000 +0000 @@ -11,7 +11,7 @@ Make OUTDIR $ mkdir -p $OUTDIR - $ $EXEC $OUTDIR/ecoli_tuple.table $DATDIR/ecoli_reference.fasta + $ $EXEC $OUTDIR/ecoli_tuple.table 8 $DATDIR/ecoli_reference.fasta $ echo $? 
0 diff -Nru blasr-0~20151014+git8e668be/extrautils/makefile blasr-5.3/extrautils/makefile --- blasr-0~20151014+git8e668be/extrautils/makefile 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/makefile 2016-09-12 08:56:50.000000000 +0000 @@ -5,12 +5,12 @@ include ${SRCDIR}/../rules.mk CXXOPTS := -std=c++0x -pedantic \ - -Wall -Wuninitialized -Wno-div-by-zero \ - -MMD -MP -w -fpermissive + -Wall -Wextra -Wno-overloaded-virtual \ + -MMD -MP CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} -EXE = sa2bwt bwt2sa alchemy excrep evolve bsdb simpleShredder swMatcher \ - samodify sals printTupleCountTable cmpH5StoreQualityByContext ccsh5tobam +EXE = sa2bwt bwt2sa excrep evolve bsdb simpleShredder swMatcher \ + samodify sals printTupleCountTable cmpH5StoreQualityByContext LD_LIBRARY_PATH=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB} export LD_LIBRARY_PATH @@ -24,7 +24,6 @@ sa2bwt: SuffixArrayToBWT.o bwt2sa: BwtToSuffixArray.o -alchemy: BasH5Simulator.o excrep: ExciseRepeats.o evolve: Evolve.o bsdb: BuildSequenceDB.o @@ -34,10 +33,9 @@ sals: SALS.o printTupleCountTable: PrintTupleCountTable.o cmpH5StoreQualityByContext: StoreQualityByContextFromCmpH5.o -ccsh5tobam: CCSH5ToBam.o CTESTS := \ -ctest/alchemy.t ctest/ccsh5tobam.t ctest/printTupleCountTable.t ctest/sals.t ctest/swmatcher.t \ +ctest/printTupleCountTable.t ctest/sals.t ctest/swmatcher.t \ ctest/bwt2sa.t ctest/cmpH5StoreQualityByContext.t ctest/sa2bwt.t ctest/samodify.t diff -Nru blasr-0~20151014+git8e668be/extrautils/SimpleShredder.cpp blasr-5.3/extrautils/SimpleShredder.cpp --- blasr-0~20151014+git8e668be/extrautils/SimpleShredder.cpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/extrautils/SimpleShredder.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -25,7 +25,7 @@ clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate. 
The length is fixed.", - CommandLineParser::PositiveInteger, "Length of every read.", 0); + CommandLineParser::PositiveInteger, 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); diff -Nru blasr-0~20151014+git8e668be/.gitignore blasr-5.3/.gitignore --- blasr-0~20151014+git8e668be/.gitignore 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/.gitignore 2016-09-12 08:56:50.000000000 +0000 @@ -1,4 +1,13 @@ defines.mk *.d *.o +*.err blasr +/utils/bam2bax/build/ +/utils/bam2bax/bin/ +/utils/bam2bax/tests/bin/ +/utils/bam2bax/tests/src/TestData.h +/utils/bax2bam/build/ +/utils/bax2bam/bin/ +/utils/bax2bam/tests/bin/ +/utils/bax2bam/tests/src/TestData.h diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrAlign.hpp blasr-5.3/iblasr/BlasrAlign.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrAlign.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrAlign.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,85 @@ +// Author: Mark Chaisson +#pragma once + +#include "BlasrHeaders.h" +#include "BlasrMiscs.hpp" + +//------------------MAP READS---------------------------------// +template +void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, + T_SuffixArray &sarray, + BWT &bwt, + SeqBoundaryFtr &seqBoundary, + T_TupleCountTable &ct, + SequenceIndexDatabase &seqdb, + MappingParameters &params, + MappingMetrics &metrics, + vector &alignmentPtrs, + MappingBuffers &mappingBuffers, + MappingIPC *mapData, + MappingSemaphores & semaphores); + +template +void MapRead(T_Sequence &read, T_Sequence &readRC, + vector &alignmentPtrs, + MappingBuffers &mappingBuffers, + MappingIPC *mapData, + MappingSemaphores & semaphores); + +/* +void MapReads(MappingData *mapData); +*/ + +//------------------MAKE ALIGNMENTS---------------------------// +template +void
AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, + WeightedIntervalSet &weightedIntervals, + int mutationCostMatrix[][5], + int ins, int del, int sdpTupleSize, + int useSeqDB, SequenceIndexDatabase &seqDB, + vector &alignments, + MappingParameters &params, + MappingBuffers &mappingBuffers, + int procId=0); + +template +void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, + int k, + MappingParameters &params, T_AlignmentCandidate &alignment, + MappingBuffers &mappingBuffers, + AlignmentType alignType=Global); + +// Extend target aligned sequence of the input alignment to both ends +// by flankSize bases. Update alignment->tAlignedSeqPos, +// alignment->tAlignedSeqLength and alignment->tAlignedSeq. +void FlankTAlignedSeq(T_AlignmentCandidate * alignment, + SequenceIndexDatabase &seqdb, + DNASequence & genome, + int flankSize); + +// Align a subread of a SMRT sequence to target sequence of an alignment. +// Input: +// subread - a subread of a SMRT sequence. +// unrolledRead - the full SMRT sequence. +// alignment - an alignment. +// passDirection - whether or not the subread has the +// same direction as query of the alignment. +// 0 = true, 1 = false. +// subreadInterval - [start, end) interval of the subread in the +// SMRT read. +// subreadIndex - index of the subread in allReadAlignments. +// params - mapping parameters. +// Output: +// allReadAlignments - where the sequence and alignments of the +// subread are saved. +// threadOut - an out stream for debugging the current thread.
+void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, + SMRTSequence & subread, SMRTSequence & unrolledRead, + T_AlignmentCandidate * alignment, + int passDirection, ReadInterval & subreadInterval, + int subreadIndex, + MappingParameters & params, + MappingBuffers & mappingBuffers, + ostream & threadOut); + +#include "BlasrAlignImpl.hpp" diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrAlignImpl.hpp blasr-5.3/iblasr/BlasrAlignImpl.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrAlignImpl.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrAlignImpl.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,1527 @@ +// Author: Mark Chaisson +#pragma once + +template +void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, + T_SuffixArray &sarray, + BWT &bwt, + SeqBoundaryFtr &seqBoundary, + T_TupleCountTable &ct, + SequenceIndexDatabase &seqdb, + MappingParameters &params, + MappingMetrics &metrics, + vector &alignmentPtrs, + MappingBuffers &mappingBuffers, + MappingIPC *mapData, + MappingSemaphores & semaphores) +{ + bool matchFound; + WeightedIntervalSet topIntervals(params.nCandidates); + int numKeysMatched=0, rcNumKeysMatched=0; (void)(numKeysMatched); (void)(rcNumKeysMatched); + int expand = params.minExpand; + metrics.clocks.total.Tick(); + int forwardNumBasesMatched = 0, reverseNumBasesMatched = 0; + do { + matchFound = false; + mappingBuffers.matchPosList.clear(); + mappingBuffers.rcMatchPosList.clear(); + alignmentPtrs.clear(); + topIntervals.clear(); + params.anchorParameters.expand = expand; + + metrics.clocks.mapToGenome.Tick(); + + if (params.useSuffixArray) { + params.anchorParameters.lcpBoundsOutPtr = mapData->lcpBoundsOutPtr; + numKeysMatched = + MapReadToGenome(genome, sarray, read, params.lookupTableLength, mappingBuffers.matchPosList, + params.anchorParameters); + + // + // Only print values for the read in forward direction (and only + // the first read).
+ // + mapData->lcpBoundsOutPtr = NULL; + if (!params.forwardOnly) { + rcNumKeysMatched = + MapReadToGenome(genome, sarray, readRC, params.lookupTableLength, mappingBuffers.rcMatchPosList, + params.anchorParameters); + } + } + else if (params.useBwt){ + numKeysMatched = MapReadToGenome(bwt, read, read.SubreadStart(), read.SubreadEnd(), + mappingBuffers.matchPosList, params.anchorParameters, forwardNumBasesMatched); + if (!params.forwardOnly) { + rcNumKeysMatched = MapReadToGenome(bwt, readRC, readRC.SubreadStart(), readRC.SubreadEnd(), + mappingBuffers.rcMatchPosList, params.anchorParameters, reverseNumBasesMatched); + } + } + + // + // Look to see if only the anchors are printed. + if (params.anchorFileName != "") { + size_t i; + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_wait(semaphores.writer); +#else + sem_wait(&semaphores.writer); +#endif + } + *mapData->anchorFilePtr << read.title << endl; + for (i = 0; i < mappingBuffers.matchPosList.size(); i++) { + *mapData->anchorFilePtr << mappingBuffers.matchPosList[i] << endl; + } + *mapData->anchorFilePtr << readRC.title << " (RC) " << endl; + for (i = 0; i < mappingBuffers.rcMatchPosList.size(); i++) { + *mapData->anchorFilePtr << mappingBuffers.rcMatchPosList[i] << endl; + } + + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_post(semaphores.writer); +#else + sem_post(&semaphores.writer); +#endif + } + } + + metrics.totalAnchors += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); + metrics.clocks.mapToGenome.Tock(); + + metrics.clocks.sortMatchPosList.Tick(); + SortMatchPosList(mappingBuffers.matchPosList); + SortMatchPosList(mappingBuffers.rcMatchPosList); + metrics.clocks.sortMatchPosList.Tock(); + + PValueWeightor lisPValue(read, genome, ct.tm, &ct); + MultiplicityPValueWeightor lisPValueByWeight(genome); + + LISSumOfLogPWeightor > lisPValueByLogSum(genome); + + LISSizeWeightor > lisWeightFn; + + IntervalSearchParameters intervalSearchParameters; + 
intervalSearchParameters.globalChainType = params.globalChainType; + intervalSearchParameters.advanceHalf = params.advanceHalf; + intervalSearchParameters.warp = params.warp; + intervalSearchParameters.fastMaxInterval = params.fastMaxInterval; + intervalSearchParameters.aggressiveIntervalCut = params.aggressiveIntervalCut; + intervalSearchParameters.verbosity = params.verbosity; + + // + // If specified, only align a band from the anchors. + // + DNALength squareRefLength = read.length * 1.25 + params.limsAlign; + if (params.limsAlign != 0) { + size_t fi; + for (fi = 0; fi < mappingBuffers.matchPosList.size(); fi++) { + if (mappingBuffers.matchPosList[fi].t >= squareRefLength) { break; } + } + if (fi < mappingBuffers.matchPosList.size()) { + mappingBuffers.matchPosList.resize(fi); + } + } + + metrics.clocks.findMaxIncreasingInterval.Tick(); + + // + // For now say that something that has a 50% chance of happening + // by chance is too high of a p value. This is probably many times + // the size. + // + intervalSearchParameters.maxPValue = log(0.5); + intervalSearchParameters.aboveCategoryPValue = -300; + VarianceAccumulator accumPValue; + VarianceAccumulator accumWeight; + VarianceAccumulator accumNBases; + + mappingBuffers.clusterList.Clear(); + mappingBuffers.revStrandClusterList.Clear(); + + // + // Remove anchors that are fully encompassed by longer ones. This + // speeds up limstemplate a lot. 
+ // + + RemoveOverlappingAnchors(mappingBuffers.matchPosList); + RemoveOverlappingAnchors(mappingBuffers.rcMatchPosList); + + if (params.pValueType == 0) { + if (params.printDotPlots) { + ofstream dotPlotOut; + string dotPlotName = string(read.title) + ".anchors"; + CrucialOpen(dotPlotName, dotPlotOut, std::ios::out); + for (size_t mp = 0; mp < mappingBuffers.matchPosList.size(); mp++ ){ + dotPlotOut << mappingBuffers.matchPosList[mp].q << " " << mappingBuffers.matchPosList[mp].t << " " << mappingBuffers.matchPosList[mp].l << " " << endl; + } + dotPlotOut.close(); + } + /* + This is an optimization that is being tested out that places a grid over the + area where there are anchors, and then finds an increasing maximally weighted + path through the grid. The weight of a cell in the grid is the sum of the + number of anchors in it. All other anchors are to be removed. This will likely + only work for LIMSTemplate sequences, or other sequences with little structural + variation. + FindBand(mappingBuffers.matchPosList, + refCopy, read, 100); + */ + FindMaxIncreasingInterval(Forward, + mappingBuffers.matchPosList, + // allow for indels to stretch out the mapping of the read. + (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValue,//lisPValue2, + lisWeightFn, + topIntervals, genome, read, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.clusterList, + accumPValue, accumWeight, accumNBases); + // Uncomment when the version of the weight functor needs the sequence. 
+ + mappingBuffers.clusterList.ResetCoordinates(); + + FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, + (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValue,//lisPValue2 + lisWeightFn, + topIntervals, genome, readRC, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.revStrandClusterList, + accumPValue, accumWeight, accumNBases); + } + else if (params.pValueType == 1) { + FindMaxIncreasingInterval(Forward, + mappingBuffers.matchPosList, + // allow for indels to stretch out the mapping of the read. + (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValueByWeight, // different from pvaltype == 2 and 0 + lisWeightFn, + topIntervals, genome, read, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.clusterList, + accumPValue, accumWeight, accumNBases); + + + mappingBuffers.clusterList.ResetCoordinates(); + FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, + (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValueByWeight, // different from pvaltype == 2 and 0 + lisWeightFn, + topIntervals, genome, readRC, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.revStrandClusterList, + accumPValue, accumWeight, accumNBases); + } + else if (params.pValueType == 2) { + FindMaxIncreasingInterval(Forward, + mappingBuffers.matchPosList, + // allow for indels to stretch out the mapping of the read. 
+ (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValueByLogSum, // different from pvaltype == 1 and 0 + lisWeightFn, + topIntervals, genome, read, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.clusterList, + accumPValue, accumWeight, accumNBases); + + mappingBuffers.clusterList.ResetCoordinates(); + FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, + (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, + seqBoundary, + lisPValueByLogSum, // different from pvaltype == 1 and 0 + lisWeightFn, + topIntervals, genome, readRC, intervalSearchParameters, + &mappingBuffers.globalChainEndpointBuffer, + mappingBuffers.revStrandClusterList, + accumPValue, accumWeight, accumNBases); + } + + mappingBuffers.clusterList.numBases.insert(mappingBuffers.clusterList.numBases.end(), + mappingBuffers.revStrandClusterList.numBases.begin(), + mappingBuffers.revStrandClusterList.numBases.end()); + + mappingBuffers.clusterList.numAnchors.insert(mappingBuffers.clusterList.numAnchors.end(), + mappingBuffers.revStrandClusterList.numAnchors.begin(), + mappingBuffers.revStrandClusterList.numAnchors.end()); + + metrics.clocks.findMaxIncreasingInterval.Tock(); + + // + // Print verbose output. 
+ // + WeightedIntervalSet::iterator topIntIt, topIntEnd; + topIntEnd = topIntervals.end(); + if (params.verbosity > 0) { + int topintind = 0; + cout << " intv: index start end qstart qend seq_boundary_start seq_boundary_end pvalue " << endl; + for (topIntIt = topIntervals.begin();topIntIt != topIntEnd ; ++topIntIt) { + cout << " intv: " << topintind << " " << (*topIntIt).start << " " + << (*topIntIt).end << " " + << (*topIntIt).qStart << " " << (*topIntIt).qEnd << " " + << seqBoundary((*topIntIt).start) << " " << seqBoundary((*topIntIt).end) << " " + << (*topIntIt).pValue << endl; + if (params.verbosity > 2) { + for (size_t m = 0; m < (*topIntIt).matches.size(); m++) { + cout << " (" << (*topIntIt).matches[m].q << ", " << (*topIntIt).matches[m].t << ", " << (*topIntIt).matches[m].l << ") "; + } + cout << endl; + } + ++topintind; + } + } + + // + // Allocate candidate alignments on the stack. Each interval is aligned. + // + alignmentPtrs.resize(topIntervals.size()); + UInt i; + for (i = 0; i < alignmentPtrs.size(); i++ ) { + alignmentPtrs[i] = new T_AlignmentCandidate; + } + metrics.clocks.alignIntervals.Tick(); + AlignIntervals( genome, read, readRC, + topIntervals, + SMRTDistanceMatrix, + params.indel, params.indel, + params.sdpTupleSize, + params.useSeqDB, seqdb, + alignmentPtrs, + params, + mappingBuffers, + params.startRead ); + + /* cout << read.title << endl; + for (i = 0; i < alignmentPtrs.size(); i++) { + cout << alignmentPtrs[i]->clusterScore << " " << alignmentPtrs[i]->score << endl; + } + */ + StoreRankingStats(alignmentPtrs, accumPValue, accumWeight); + + std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); + metrics.clocks.alignIntervals.Tock(); + + // + // Evalutate the matches that are found for 'good enough'. + // + + matchFound = CheckForSufficientMatch(read, alignmentPtrs, params); + + // + // When no proper alignments are found, the loop will resume. + // Delete all alignments because they are bad. 
+ // + if (expand < params.maxExpand and matchFound == false) { + DeleteAlignments(alignmentPtrs, 0); + } + + // + // Record some metrics that show how long this took to run per base. + // + + if (alignmentPtrs.size() > 0) { + metrics.RecordNumAlignedBases(read.length); + metrics.RecordNumCells(alignmentPtrs[0]->nCells); + } + + if (matchFound == true) { + metrics.totalAnchorsForMappedReads += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); + } + ++expand; + } while ( expand <= params.maxExpand and matchFound == false); + metrics.clocks.total.Tock(); + UInt i; + int totalCells = 0; + for (i = 0; i< alignmentPtrs.size(); i++) { + totalCells += alignmentPtrs[i]->nCells; + } + metrics.clocks.AddCells(totalCells); + int totalBases = 0; + for (i = 0; i < alignmentPtrs.size(); i++) { + totalBases += alignmentPtrs[i]->qLength; + } + metrics.clocks.AddBases(totalBases); + // + // Some of the alignments are to spurious regions. Delete the + // references that have too small of a score. + // + + int effectiveReadLength = 0; + for (i = 0; i< read.length; i++) { + if (read.seq[i] != 'N') effectiveReadLength++; + } + if (params.sdpFilterType == 0) { + RemoveLowQualityAlignments(read, alignmentPtrs, params); + } + else if (params.sdpFilterType == 1) { + RemoveLowQualitySDPAlignments(effectiveReadLength, alignmentPtrs, params); + } + + // + // Now remove overlapping alignments. + // + + vector bothQueryStrands; + bothQueryStrands.resize(2); + bothQueryStrands[Forward] = &read; + bothQueryStrands[Reverse] = &readRC; + + + // + // Possibly use banded dynamic programming to refine the columns + // of an alignment and the alignment score. 
+ // + if (params.refineAlignments) { + RefineAlignments(bothQueryStrands, genome, alignmentPtrs, params, mappingBuffers); + RemoveLowQualityAlignments(read,alignmentPtrs,params); + RemoveOverlappingAlignments(alignmentPtrs, params); + } + + // + // Look to see if the number of anchors found for this read match + // what is expected given the expected distribution of number of + // anchors. + // + + if (alignmentPtrs.size() > 0) { + size_t clusterIndex; + // + // Compute some stats on the read. For now this is fixed but will + // be updated on the fly soon. + // + float meanAnchorBasesPerRead, sdAnchorBasesPerRead; + float meanAnchorsPerRead, sdAnchorsPerRead; + + int lookupValue; + // + // If a very short anchor size was used, or very long min match + // size there may be no precomputed distributions for it. + // Handle this by bounding the min match by the smallest and + // largest values for which there are precomputed statistics. + + int boundedMinWordMatchLength = min(max(params.minMatchLength, PacBio::AnchorDistributionTable::anchorMinKValues[0]), PacBio::AnchorDistributionTable::anchorMinKValues[1]); + + // + // Do a similar bounding for match length and accuracy. + // + int boundedMatchLength = min(max((int) alignmentPtrs[0]->qAlignedSeq.length, PacBio::AnchorDistributionTable::anchorReadLengths[0]), PacBio::AnchorDistributionTable::anchorReadLengths[1]); + int boundedPctSimilarity = min(max((int)alignmentPtrs[0]->pctSimilarity, PacBio::AnchorDistributionTable::anchorReadAccuracies[0]), PacBio::AnchorDistributionTable::anchorReadAccuracies[1]); + + lookupValue = LookupAnchorDistribution(boundedMatchLength, boundedMinWordMatchLength, boundedPctSimilarity, + meanAnchorsPerRead, sdAnchorsPerRead, meanAnchorBasesPerRead, sdAnchorBasesPerRead); + + float minExpAnchors = meanAnchorsPerRead - sdAnchorsPerRead; + // + // The number of standard deviations is just trial and error. 
+ float minExpAnchorBases = meanAnchorBasesPerRead - 2 * sdAnchorBasesPerRead; + if (lookupValue < 0 or minExpAnchorBases < 0) { + minExpAnchorBases = 0; + } + int numSignificantClusters = 0; + int totalSignificantClusterSize = 0; + int maxClusterSize = 0; + int numAlnAnchorBases, numAlnAnchors; + alignmentPtrs[0]->ComputeNumAnchors(boundedMinWordMatchLength, numAlnAnchors, numAlnAnchorBases); + int totalAnchorBases = 0; + if (numAlnAnchorBases > meanAnchorBasesPerRead + sdAnchorBasesPerRead) { + numSignificantClusters = 1; + } + else { + if (alignmentPtrs[0]->score < params.maxScore) { + for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { + if (mappingBuffers.clusterList.numBases[clusterIndex] > maxClusterSize) { + maxClusterSize = mappingBuffers.clusterList.numBases[clusterIndex]; + } + } + int scaledExpectedClusterSize = maxClusterSize / ((float)numAlnAnchorBases) * minExpAnchorBases; + for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { + if (mappingBuffers.clusterList.numBases[clusterIndex] >= scaledExpectedClusterSize) { + // cout << mappingBuffers.clusterList.numBases[clusterIndex] << " " << scaledExpectedClusterSize << " " << meanAnchorBasesPerRead << " " << sdAnchorBasesPerRead << endl; + ++numSignificantClusters; + totalSignificantClusterSize += meanAnchorBasesPerRead; + } + // + // The following output block is useful in debugging mapqv + // calculation. It should be uncommented and examined when + // mapqvs do not look correct. 
+ // + totalAnchorBases += mappingBuffers.clusterList.numBases[clusterIndex]; + } + } + + if (lookupValue == 0) { + alignmentPtrs[0]->ComputeNumAnchors(params.minMatchLength, numAlnAnchors, numAlnAnchorBases); + } + } + + for (i = 0; i < alignmentPtrs.size(); i++) { + alignmentPtrs[i]->numSignificantClusters = numSignificantClusters; + } + if (mapData->clusterFilePtr != NULL and topIntervals.size() > 0 and alignmentPtrs.size() > 0) { + WeightedIntervalSet::iterator intvIt = topIntervals.begin(); + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_wait(semaphores.hitCluster); +#else + sem_wait(&semaphores.hitCluster); +#endif + } + + *mapData->clusterFilePtr << (*intvIt).size << " " << (*intvIt).pValue << " " << (*intvIt).nAnchors << " " + << read.length << " " << alignmentPtrs[0]->score << " " << alignmentPtrs[0]->pctSimilarity << " " + << " " << minExpAnchors << " " << alignmentPtrs[0]->qAlignedSeq.length << endl; + + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_post(semaphores.hitCluster); +#else + sem_post(&semaphores.hitCluster); +#endif + } + } + + } + + // + // Assign the query name and strand for each alignment. 
+ // + + for (i = 0; i < alignmentPtrs.size(); i++) { + T_AlignmentCandidate *aref = alignmentPtrs[i]; + if (aref->tStrand == 0) { + aref->qName = read.GetName(); + } + else { + aref->qName = readRC.GetName(); + } + } + + AssignRefContigLocations(alignmentPtrs, seqdb, genome); +} + +template +void MapRead(T_Sequence &read, T_Sequence &readRC, + vector &alignmentPtrs, + MappingBuffers &mappingBuffers, + MappingIPC *mapData, + MappingSemaphores & semaphores) +{ + DNASuffixArray sarray; + TupleCountTable ct; + SequenceIndexDatabase seqdb; + T_GenomeSequence genome; + BWT *bwtPtr = mapData->bwtPtr; + mapData->ShallowCopySuffixArray(sarray); + mapData->ShallowCopyReferenceSequence(genome); + mapData->ShallowCopySequenceIndexDatabase(seqdb); + mapData->ShallowCopyTupleCountTable(ct); + SeqBoundaryFtr seqBoundary(&seqdb); + + return + MapRead(read, readRC, + genome, // possibly multi fasta file read into one sequence + sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures + seqBoundary, // Boundaries of contigs in the + // genome, alignments do not span + // the ends of boundaries. + ct, // Count table to use word frequencies in the genome to weight matches. + seqdb, // Information about the names of + // chromosomes in the genome, and + // where their sequences are in the genome. + mapData->params,// A huge list of parameters for + // mapping, only compile/command + // line values set. + mapData->metrics, // Keep track of time/ hit counts, + // etc.. Not fully developed, but + // should be. + alignmentPtrs, // Where the results are stored. + mappingBuffers, // A class of buffers for structurs + // like dyanmic programming + // matrices, match lists, etc., that are not + // reallocated between calls to + // MapRead. They are cleared though. + mapData, // Some values that are shared + // across threads. 
+ semaphores); +} + +template +void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, + WeightedIntervalSet &weightedIntervals, + int mutationCostMatrix[][5], + int ins, int del, int sdpTupleSize, + int useSeqDB, SequenceIndexDatabase &seqDB, + vector &alignments, + MappingParameters ¶ms, + MappingBuffers &mappingBuffers, + int procId) { + (void)(mutationCostMatrix); (void)(ins); (void)(del); (void)(procId); + + vector forrev; + forrev.resize(2); + forrev[Forward] = &read; + forrev[Reverse] = &rcRead; + + // + // Use an edit distance scoring function instead of IDS. Although + // the IDS should be more accurate, it is more slow, and it is more + // important at this stage to have faster alignments than accurate, + // since all alignments are rerun using GuidedAlignment later on. + // + DistanceMatrixScoreFunction distScoreFn(SMRTDistanceMatrix, params.insertion, params.deletion); + DistanceMatrixScoreFunction distScoreFn2(SMRTDistanceMatrix, ins, ins); + + // + // Assume there is at least one interval. + // + if (weightedIntervals.size() == 0) + return; + + WeightedIntervalSet::iterator intvIt = weightedIntervals.begin(); + int alignmentIndex = 0; + + + do { + + T_AlignmentCandidate *alignment = alignments[alignmentIndex]; + alignment->clusterWeight= (*intvIt).size; // totalAnchorSize == size + alignment->clusterScore = (*intvIt).pValue; + + // + // Advance references. Intervals are stored in reverse order, so + // go backwards in the list, and alignments are in forward order. + // That should probably be changed. + // + ++alignmentIndex; + + // + // Try aligning the read to the genome. 
+ // + DNALength matchIntervalStart, matchIntervalEnd; + matchIntervalStart = (*intvIt).start; + matchIntervalEnd = (*intvIt).end; + bool readOverlapsContigStart = false; + bool readOverlapsContigEnd = false; + int startOverlappedContigIndex = 0; + int endOverlappedContigIndex = 0; + (void)(readOverlapsContigStart); (void)(readOverlapsContigEnd); (void)(startOverlappedContigIndex); (void)(endOverlappedContigIndex); + + if (params.verbosity > 0) { + cout << "aligning interval : " << read.length << " " << (*intvIt).start << " " + << (*intvIt).end << " " << (*intvIt).qStart << " " << (*intvIt).qEnd + << " " << matchIntervalStart << " to " << matchIntervalEnd << " " + << params.approximateMaxInsertionRate << " " << endl; + } + assert(matchIntervalEnd >= matchIntervalStart); + + // + // If using a sequence database, check to make sure that the + // boundaries of the sequence windows do not overlap with + // the boundaries of the reads. If the beginning is before + // the boundary, move the beginning up to the start of the read. + // If the end is past the end boundary of the read, similarly move + // the window boundary to the end of the read boundary. + + int seqDBIndex = 0; + + // + // Stretch the alignment interval so that it is close to where + // the read actually starts. 
+ // + DNALength subreadStart = read.SubreadStart(); + DNALength subreadEnd = read.SubreadEnd(); + if ((*intvIt).GetStrandIndex() == Reverse) { + subreadEnd = read.MakeRCCoordinate(read.SubreadStart()) + 1; + subreadStart = read.MakeRCCoordinate(read.SubreadEnd()-1); + } + + DNALength lengthBeforeFirstMatch = ((*intvIt).qStart - subreadStart) * params.approximateMaxInsertionRate ; + DNALength lengthAfterLastMatch = (subreadEnd - (*intvIt).qEnd) * params.approximateMaxInsertionRate; + if (matchIntervalStart < lengthBeforeFirstMatch or params.doGlobalAlignment) { + matchIntervalStart = 0; + } + else { + matchIntervalStart -= lengthBeforeFirstMatch; + } + + if (genome.length < matchIntervalEnd + lengthAfterLastMatch or params.doGlobalAlignment) { + matchIntervalEnd = genome.length; + } + else { + matchIntervalEnd += lengthAfterLastMatch; + } + + DNALength intervalContigStartPos, intervalContigEndPos; + if (useSeqDB) { + // + // The sequence db index is the one where the actual match is + // contained. The matchIntervalStart might be before the sequence + // index boundary due to the extrapolation of alignment start by + // insertion rate. If this is the case, bump up the + // matchIntervalStart to be at the beginning of the boundary. + // Modify bounds similarly for the matchIntervalEnd and the end + // of a boundary. + // + seqDBIndex = seqDB.SearchForIndex((*intvIt).start); + intervalContigStartPos = seqDB.seqStartPos[seqDBIndex]; + if (intervalContigStartPos > matchIntervalStart) { + matchIntervalStart = intervalContigStartPos; + } + intervalContigEndPos = seqDB.seqStartPos[seqDBIndex+1] - 1; + if (intervalContigEndPos < matchIntervalEnd) { + matchIntervalEnd = intervalContigEndPos; + } + alignment->tName = seqDB.GetSpaceDelimitedName(seqDBIndex); + alignment->tLength = intervalContigEndPos - intervalContigStartPos; + // + // When there are multiple sequences in the database, store the + // index of this sequence. 
This lets one compare the contigs + // that reads are mapped to, for instance. + // + alignment->tIndex = seqDBIndex; + } + else { + alignment->tLength = genome.length; + alignment->tName = genome.GetName(); + intervalContigStartPos = 0; + intervalContigEndPos = genome.length; + // + // When there are multiple sequences in the database, store the + // index of this sequence. This lets one compare the contigs + // that reads are mapped to, for instance. + // + } + alignment->qName = read.title; + // + // Look to see if a read overhangs the beginning of a contig. + // + if (params.verbosity > 2) { + cout << "Check for prefix/suffix overlap on interval: " << (*intvIt).qStart << " ?> " << (*intvIt).start - intervalContigStartPos < (*intvIt).start - intervalContigStartPos) { + readOverlapsContigStart = true; + startOverlappedContigIndex = seqDBIndex; + } + + // + // Look to see if the read overhangs the end of a contig. + // + if (params.verbosity > 2) { + cout << "Check for suffix/prefix overlap on interval, read overhang: " << read.length - (*intvIt).qEnd << " ?> " << matchIntervalEnd - (*intvIt).end < matchIntervalEnd - (*intvIt).end) { + if (params.verbosity > 2) { + cout << "read overlaps genome end." << endl; + } + readOverlapsContigEnd = true; + endOverlappedContigIndex = seqDBIndex; + } + int alignScore; + alignScore = 0; + + alignment->tAlignedSeqPos = matchIntervalStart; + alignment->tAlignedSeqLength = matchIntervalEnd - matchIntervalStart; + if ((*intvIt).GetStrandIndex() == Forward) { + alignment->tAlignedSeq.Copy(genome, alignment->tAlignedSeqPos, alignment->tAlignedSeqLength); + alignment->tStrand = Forward; + } + else { + DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); + genome.CopyAsRC(alignment->tAlignedSeq, rcAlignedSeqPos, alignment->tAlignedSeqLength); + // Map forward coordinates into reverse complement. 
+ + intervalContigStartPos = genome.MakeRCCoordinate(intervalContigStartPos) + 1; + intervalContigEndPos = genome.MakeRCCoordinate(intervalContigEndPos - 1); + swap(intervalContigStartPos, intervalContigEndPos); + alignment->tAlignedSeqPos = rcAlignedSeqPos; + alignment->tStrand = Reverse; + } + + // Configure the part of the query that is aligned. The entire + // query should always be aligned. + alignment->qAlignedSeqPos = 0; + alignment->qAlignedSeq.ReferenceSubstring(read); + alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; + alignment->qLength = read.length; + alignment->qStrand = 0; + + if (params.verbosity > 1) { + cout << "aligning read " << endl; + static_cast(&(alignment->qAlignedSeq))->PrintSeq(cout); + cout << endl << "aligning reference" << endl; + static_cast(&(alignment->tAlignedSeq))->PrintSeq(cout); + cout << endl; + } + + // + // The type of alignment that is performed depends on the mode + // blasr is running in. If it is running in normal mode, local + // aligment is performed and guided by SDP alignment. When + // running in overlap mode, the alignments are forced to the ends + // of reads. + // + + int intervalSize = 0; + // + // Check to see if the matches to the genome are sufficiently + // dense to allow them to be used instead of having to redo + // sdp alignment. + // + + // First count how much of the read matches the genome exactly. + for (size_t m = 0; m < intvIt->matches.size(); m++) { intervalSize += intvIt->matches[m].l;} + + int subreadLength = forrev[(*intvIt).GetStrandIndex()]->SubreadEnd() - forrev[(*intvIt).GetStrandIndex()]->SubreadStart(); + if ((1.0*intervalSize) / subreadLength < params.sdpBypassThreshold and !params.emulateNucmer) { + // + // Not enough of the read maps to the genome, need to use + // sdp alignment to define the regions of the read that map. 
+ // + if (params.refineBetweenAnchorsOnly) { + + // + // Run SDP alignment only between the genomic anchors, + // including the genomic anchors as part of the alignment. + // + size_t m; + + vector *matches; + vector rcMatches; + Alignment anchorsOnly; + DNASequence tAlignedSeq; + FASTQSequence qAlignedSeq; + // + // The strand bookkeeping is a bit confusing, so hopefully + // this will set things straight. + // + // If the alignment is forward strand, the coordinates of the + // blocks are relative to the forward read, starting at 0, not + // the subread start. + // If the alignment is reverse strand, the coordinates of the + // blocks are relative to the reverse strand, starting at the + // position of the subread on the reverse strand. + // + // The coordinates of the blocks in the genome are always + // relative to the forward strand on the genome, starting at + // 0. + // + + // + // The first step to refining between anchors only is to make + // the anchors relative to the tAlignedSeq. + + matches = (vector*) &(*intvIt).matches; + tAlignedSeq = alignment->tAlignedSeq; + qAlignedSeq = alignment->qAlignedSeq; + + if (alignment->tStrand == 0) { + for (m = 0; m < matches->size(); m++) { + (*matches)[m].t -= alignment->tAlignedSeqPos; + (*matches)[m].q -= alignment->qAlignedSeqPos; + } + } + else { + // + // Flip the entire alignment if it is on the reverse strand. + DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); + for (m = 0; m < matches->size(); m++) { + (*matches)[m].t -= rcAlignedSeqPos; + (*matches)[m].q -= alignment->qAlignedSeqPos; + } + + alignment->tAlignedSeq.CopyAsRC(tAlignedSeq); + rcMatches.resize((*intvIt).matches.size()); + // + // Make the reverse complement of the match list. + // + + // 1. Reverse complement the coordinates. 
+ for (m = 0; m < (*intvIt).matches.size(); m++) { + int revCompIndex = rcMatches.size() - m - 1; + rcMatches[revCompIndex].q = read.MakeRCCoordinate((*intvIt).matches[m].q + (*intvIt).matches[m].l - 1); + rcMatches[revCompIndex].t = tAlignedSeq.MakeRCCoordinate((*intvIt).matches[m].t + (*intvIt).matches[m].l - 1); + rcMatches[revCompIndex].l = (*intvIt).matches[m].l; + } + matches = &rcMatches; + } + + /* + Uncomment to get a dot plot + ofstream matchFile; + matchFile.open("matches.txt"); + matchFile << "q t l " << endl; + for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { + matchFile << (*matches)[m].q << " " << (*matches)[m].t << " " << (*matches)[m].l << endl; + } + */ + DNASequence tSubSeq; + FASTQSequence qSubSeq; + for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { + Block block; + block.qPos = (*matches)[m].q; + block.tPos = (*matches)[m].t; + block.length = (*matches)[m].l; + + // + // Find the lengths of the gaps between anchors. + // + int tGap, qGap; + tGap = (*matches)[m+1].t - ((*matches)[m].t + (*matches)[m].l); + qGap = (*matches)[m+1].q - ((*matches)[m].q + (*matches)[m].l); + + if (tGap > 0 and qGap > 0) { + DNALength tPos, qPos; + tPos = block.tPos + block.length; + qPos = block.qPos + block.length; + tSubSeq.ReferenceSubstring(tAlignedSeq, tPos, tGap); + qSubSeq.ReferenceSubstring(alignment->qAlignedSeq, qPos, qGap); + Alignment alignmentInGap; + + /* + The following code is experimental code for trying to do + something like affine gap alignment in long gaps. It + would eventually be used in cDNA alignment to align + between exons, but for now is being tested here by using + it to align when there is a big gap between anchors. 
+ */ + if (params.separateGaps == true and + qSubSeq.length > 0 and tSubSeq.length > 0 and + ( (1.0*qSubSeq.length)/tSubSeq.length < 0.25 )) { + OneGapAlign(qSubSeq, tSubSeq, distScoreFn, mappingBuffers, alignmentInGap); + } + else { + /* + This is the 'normal/default' way to align between + gaps. It is more well tested than OneGapAlign. + */ + SDPAlign(qSubSeq, tSubSeq, distScoreFn, params.sdpTupleSize, + params.sdpIns, params.sdpDel, params.indelRate*2, + alignmentInGap, mappingBuffers, Global, + params.detailedSDPAlignment, + params.extendFrontAlignment, + params.recurseOver, + params.fastSDP); + } + + // + // Now, splice the fragment alignment into the current + // alignment. + // + if (alignmentInGap.blocks.size() > 0) { + size_t b; + // + // Configure this block to be relative to the beginning + // of the aligned substring. + // + for (b = 0; b < alignmentInGap.size(); b++) { + alignmentInGap.blocks[b].tPos += tPos + alignmentInGap.tPos; + alignmentInGap.blocks[b].qPos += qPos + alignmentInGap.qPos; + assert(alignmentInGap.blocks[b].tPos < alignment->tAlignedSeq.length); + assert(alignmentInGap.blocks[b].qPos < alignment->qAlignedSeq.length); + } + } + // Add the original block + alignment->blocks.push_back(block); + anchorsOnly.blocks.push_back(block); + // Add the blocks for the refined alignment + alignment->blocks.insert(alignment->blocks.end(), + alignmentInGap.blocks.begin(), + alignmentInGap.blocks.end()); + } + } + + // Add the last block + m = (*matches).size() - 1; + Block block; + block.qPos = (*matches)[m].q; + block.tPos = (*matches)[m].t; + + assert(block.tPos <= alignment->tAlignedSeq.length); + assert(block.qPos <= alignment->qAlignedSeq.length); + + block.length = (*matches)[m].l; + alignment->blocks.push_back(block); + anchorsOnly.blocks.push_back(block); + + // + // By convention, blocks start at 0, and the + // alignment->tPos,qPos give the start of the alignment. + // Modify the block positions so that they are offset by 0. 
+ alignment->tPos = alignment->blocks[0].tPos; + alignment->qPos = alignment->blocks[0].qPos; + size_t b; + size_t blocksSize = alignment->blocks.size(); + for (b = 0; b < blocksSize ; b++) { + assert(alignment->tPos <= alignment->blocks[b].tPos); + assert(alignment->qPos <= alignment->blocks[b].qPos); + alignment->blocks[b].tPos -= alignment->tPos; + alignment->blocks[b].qPos -= alignment->qPos; + } + for (b = 0; b < anchorsOnly.blocks.size(); b++) { + anchorsOnly.blocks[b].tPos -= alignment->tPos; + anchorsOnly.blocks[b].qPos -= alignment->qPos; + } + anchorsOnly.tPos = alignment->tPos; + anchorsOnly.qPos = alignment->qPos; + ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, + distScoreFn); + + tAlignedSeq.Free(); + qAlignedSeq.Free(); + tSubSeq.Free(); + qSubSeq.Free(); + } + else { + alignScore = SDPAlign(alignment->qAlignedSeq, alignment->tAlignedSeq, distScoreFn, + sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*3, + *alignment, mappingBuffers, + Local, + params.detailedSDPAlignment, + params.extendFrontAlignment, + params.recurseOver, + params.fastSDP); + ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, + distScoreFn); + } + } + else { + // + // The anchors used to anchor the sequence are sufficient to extend the alignment. + // + size_t m; + for (m = 0; m < (*intvIt).matches.size(); m++ ){ + Block block; + block.qPos = (*intvIt).matches[m].q - alignment->qAlignedSeqPos; + block.tPos = (*intvIt).matches[m].t - alignment->tAlignedSeqPos; + block.length = (*intvIt).matches[m].l; + alignment->blocks.push_back(block); + } + } + + // + // The anchors/sdp alignments may leave portions of the read + // unaligned at the beginning and end. If the parameters + // specify extending alignments, try and align extra bases at + // the beginning and end of alignments. 
+ if (params.extendAlignments) { + + // + // Modify the alignment so that the start and end of the + // alignment strings are at the alignment boundaries. + // + // Since the query sequence is pointing at a subsequence of the + // read (and is always in the forward direction), just reference + // a new portion of the read. + alignment->qAlignedSeqPos = alignment->qAlignedSeqPos + alignment->qPos; + alignment->qAlignedSeqLength = alignment->QEnd(); + alignment->qAlignedSeq.ReferenceSubstring(read, alignment->qAlignedSeqPos, alignment->qAlignedSeqLength ); + alignment->qPos = 0; + + // + // Since the target sequence may be on the forward or reverse + // strand, a copy of the subsequence is made, and the original + // sequence free'd. + // + DNASequence tSubseq; + alignment->tAlignedSeqPos = alignment->tAlignedSeqPos + alignment->tPos; + alignment->tAlignedSeqLength = alignment->TEnd(); + tSubseq.Copy(alignment->tAlignedSeq, alignment->tPos, alignment->tAlignedSeqLength); + alignment->tPos = 0; + + alignment->tAlignedSeq.Free(); + alignment->tAlignedSeq.TakeOwnership(tSubseq); + + DNALength maximumExtendLength = 500; + + if (alignment->blocks.size() > 0 ) { + int lastAlignedBlock = alignment->blocks.size() - 1; + DNALength lastAlignedQPos = alignment->blocks[lastAlignedBlock].QEnd() + alignment->qPos + alignment->qAlignedSeqPos; + DNALength lastAlignedTPos = alignment->blocks[lastAlignedBlock].TEnd() + alignment->tPos + alignment->tAlignedSeqPos; + T_AlignmentCandidate extendedAlignmentForward, extendedAlignmentReverse; + int forwardScore, reverseScore; + + SMRTSequence readSuffix; + DNALength readSuffixLength; + DNASequence genomeSuffix; + DNALength genomeSuffixLength; + + SMRTSequence readPrefix; + DNALength readPrefixLength; + DNASequence genomePrefix; + DNALength genomePrefixLength; + + // + // Align the entire end of the read if it is short enough. 
+ // + readSuffixLength = min(read.length - lastAlignedQPos, maximumExtendLength); + if (readSuffixLength > 0) { + readSuffix.ReferenceSubstring(read, lastAlignedQPos, readSuffixLength); + } + else { + readSuffix.length = 0; + } + + // + // Align The entire end of the genome up to the maximum extend length; + // + genomeSuffixLength = min(intervalContigEndPos - lastAlignedTPos, maximumExtendLength); + if (genomeSuffixLength > 0) { + if (alignment->tStrand == Forward) { + genomeSuffix.Copy(genome, lastAlignedTPos, genomeSuffixLength); + } + else { + static_cast(&genome)->CopyAsRC(genomeSuffix, lastAlignedTPos, genomeSuffixLength); + } + } + else { + genomeSuffix.length = 0; + } + forwardScore = 0; + if (readSuffix.length > 0 and genomeSuffix.length > 0) { + forwardScore = ExtendAlignmentForward(readSuffix, 0, + genomeSuffix, 0, + params.extendBandSize, + // Reuse buffers to speed up alignment + mappingBuffers.scoreMat, + mappingBuffers.pathMat, + // Do the alignment in the forward direction. + extendedAlignmentForward, + distScoreFn, + 1, // don't bother attempting + // to extend the alignment + // if one of the sequences + // is less than 1 base long + params.maxExtendDropoff); + } + + if ( forwardScore < 0 ) { + // + // The extended alignment considers the whole genome, but + // should be modified to be starting at the end of where + // the original alignment left off. 
+ // + if (params.verbosity > 0) { + cout << "forward extended an alignment of score " << alignment->score << " with score " << forwardScore << " by " << extendedAlignmentForward.blocks.size() << " blocks and length " << extendedAlignmentForward.blocks[extendedAlignmentForward.blocks.size()-1].qPos << endl; + } + extendedAlignmentForward.tAlignedSeqPos = lastAlignedTPos; + + extendedAlignmentForward.qAlignedSeqPos = lastAlignedQPos; + + genomeSuffix.length = extendedAlignmentForward.tPos + extendedAlignmentForward.TEnd(); + alignment->tAlignedSeq.Append(genomeSuffix); + alignment->qAlignedSeq.length += extendedAlignmentForward.qPos + extendedAlignmentForward.QEnd(); + assert(alignment->qAlignedSeq.length <= read.length); + alignment->AppendAlignment(extendedAlignmentForward); + } + + DNALength firstAlignedQPos = alignment->qPos + alignment->qAlignedSeqPos; + DNALength firstAlignedTPos = alignment->tPos + alignment->tAlignedSeqPos; + + readPrefixLength = min(firstAlignedQPos, maximumExtendLength); + if (readPrefixLength > 0) { + readPrefix.ReferenceSubstring(read, firstAlignedQPos-readPrefixLength, readPrefixLength); + } + else { + readPrefix.length = 0; + } + + genomePrefixLength = min(firstAlignedTPos - intervalContigStartPos, maximumExtendLength); + if (genomePrefixLength > 0) { + if (alignment->tStrand == 0) { + genomePrefix.Copy(genome, firstAlignedTPos - genomePrefixLength, genomePrefixLength); + } + else { + static_cast(&genome)->MakeRC(genomePrefix, firstAlignedTPos - genomePrefixLength, genomePrefixLength); + } + } + reverseScore = 0; + if (readPrefix.length > 0 and genomePrefix.length > 0) { + reverseScore = ExtendAlignmentReverse(readPrefix, readPrefix.length-1, + genomePrefix, genomePrefixLength - 1, + params.extendBandSize, //k + mappingBuffers.scoreMat, + mappingBuffers.pathMat, + extendedAlignmentReverse, + distScoreFn, + 1, // don't bother attempting + // to extend the alignment + // if one of the sequences + // is less than 1 base long + 
params.maxExtendDropoff); + } + + if (reverseScore < 0 ) { + // + // Make alignment->tPos relative to the beginning of the + // extended alignment so that when it is appended, the + // coordinates match correctly. + if (params.verbosity > 0) { + cout << "reverse extended an alignment of score " << alignment->score << " with score " << reverseScore << " by " << extendedAlignmentReverse.blocks.size() << " blocks and length " << extendedAlignmentReverse.blocks[extendedAlignmentReverse.blocks.size()-1].qPos << endl; + } + extendedAlignmentReverse.tAlignedSeqPos = firstAlignedTPos - genomePrefixLength; + extendedAlignmentReverse.qAlignedSeqPos = firstAlignedQPos - readPrefixLength; + extendedAlignmentReverse.AppendAlignment(*alignment); + + genomePrefix.Append(alignment->tAlignedSeq, genomePrefix.length - alignment->tPos); + alignment->tAlignedSeq.Free(); + alignment->tAlignedSeq.TakeOwnership(genomePrefix); + + alignment->blocks = extendedAlignmentReverse.blocks; + + alignment->tAlignedSeqPos = extendedAlignmentReverse.tAlignedSeqPos; + alignment->tPos = extendedAlignmentReverse.tPos; + + + alignment->qAlignedSeqPos = extendedAlignmentReverse.qAlignedSeqPos; + alignment->qAlignedSeq.length = readPrefix.length + alignment->qAlignedSeq.length; + alignment->qPos = extendedAlignmentReverse.qPos; + alignment->qAlignedSeq.seq = readPrefix.seq; + // + // Make sure the two ways of accounting for aligned sequence + // length are in sync. This needs to go. 
+ // + if (alignment->blocks.size() > 0) { + alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; + alignment->tAlignedSeqLength = alignment->tAlignedSeq.length; + } + else { + alignment->qAlignedSeqLength = alignment->qAlignedSeq.length = 0; + alignment->tAlignedSeqLength = alignment->tAlignedSeq.length = 0; + } + } // end of if (reverseScore < 0 ) + readSuffix.Free(); + readPrefix.Free(); + genomePrefix.Free(); + genomeSuffix.Free(); + } + tSubseq.Free(); + } + + if (params.verbosity > 0) { + cout << "interval align score: " << alignScore << endl; + StickPrintAlignment(*alignment, + (DNASequence&) alignment->qAlignedSeq, + (DNASequence&) alignment->tAlignedSeq, + cout, + 0, alignment->tAlignedSeqPos); + + } + ComputeAlignmentStats(*alignment, + alignment->qAlignedSeq.seq, + alignment->tAlignedSeq.seq, + distScoreFn2); + //SMRTDistanceMatrix, ins, del ); + + + intvIt++; + } while (intvIt != weightedIntervals.end()); +} + + +template +void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, + int k, + MappingParameters ¶ms, T_AlignmentCandidate &alignment, + MappingBuffers &mappingBuffers, + AlignmentType alignType) { + // + // Perform a pairwise alignment between qSeq and tSeq, but choose + // the pairwise alignment method based on the parameters. The + // options for pairwise alignment are: + // - Affine KBanded alignment: usually used for sequences with no + // quality information. + // - KBanded alignment: For sequences with quality information. + // Gaps are scored with quality values. 
+ // + QualityValueScoreFunction scoreFn; + scoreFn.del = params.indel; + scoreFn.ins = params.indel; + + DistanceMatrixScoreFunction distScoreFn2( + SMRTDistanceMatrix, params.indel, params.indel); + + IDSScoreFunction idsScoreFn; + idsScoreFn.ins = params.insertion; + idsScoreFn.del = params.deletion; + idsScoreFn.substitutionPrior = params.substitutionPrior; + idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; + idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); + + int kbandScore; + int qvAwareScore; + if (params.ignoreQualities || qSeq.qual.Empty() || !ReadHasMeaningfulQualityValues(qSeq) ) { + + kbandScore = AffineKBandAlign(qSeq, tSeq, SMRTDistanceMatrix, + params.indel+2, params.indel - 3, // homopolymer insertion open and extend + params.indel+2, params.indel - 1, // any insertion open and extend + params.indel, // deletion + k*1.2, + mappingBuffers.scoreMat, mappingBuffers.pathMat, + mappingBuffers.hpInsScoreMat, mappingBuffers.hpInsPathMat, + mappingBuffers.insScoreMat, mappingBuffers.insPathMat, + alignment, Global); + + alignment.score = kbandScore; + if (params.verbosity >= 2) { + cout << "align score: " << kbandScore << endl; + } + } + else { + + + if (qSeq.insertionQV.Empty() == false) { + qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, + params.indel+2, // ins + params.indel+2, // del + k, + mappingBuffers.scoreMat, mappingBuffers.pathMat, + alignment, idsScoreFn, alignType); + if (params.verbosity >= 2) { + cout << "ids score fn score: " << qvAwareScore << endl; + } + } + else { + qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, + params.indel+2, // ins + params.indel+2, // del + k, + mappingBuffers.scoreMat, mappingBuffers.pathMat, + alignment, scoreFn, alignType); + if (params.verbosity >= 2) { + cout << "qv score fn score: " << qvAwareScore << endl; + } + } + alignment.sumQVScore = qvAwareScore; + alignment.score = qvAwareScore; + alignment.probScore = 0; + } + // Compute stats and assign a default alignment 
score using an edit distance. + ComputeAlignmentStats(alignment, qSeq.seq, tSeq.seq, distScoreFn2); + + if (params.scoreType == 1) { + alignment.score = alignment.sumQVScore; + } +} + +// Extend target aligned sequence of the input alignement to both ends +// by flankSize bases. Update alignment->tAlignedSeqPos, +// alignment->tAlignedSeqLength and alignment->tAlignedSeq. +void FlankTAlignedSeq(T_AlignmentCandidate * alignment, + SequenceIndexDatabase &seqdb, + DNASequence & genome, + int flankSize) { + assert(alignment != NULL and alignment->tIsSubstring); + + UInt forwardTPos, newTAlignedSeqPos, newTAlignedSeqLen; + // New aligned start position relative to this chromosome, with + // the same direction as alignment->tStrand. + newTAlignedSeqPos = UInt((alignment->tAlignedSeqPos > UInt(flankSize))? + (alignment->tAlignedSeqPos - flankSize): 0); + newTAlignedSeqLen = min(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength + + flankSize, alignment->tLength) - newTAlignedSeqPos; + + if (alignment->tStrand ==0) { + forwardTPos = newTAlignedSeqPos; + } else { + forwardTPos = alignment->tLength - newTAlignedSeqPos - 1; + } + + // Find where this chromosome is in the genome. + int seqIndex = seqdb.GetIndexOfSeqName(alignment->tName); + assert(seqIndex != -1); + UInt newGenomePos = seqdb.ChromosomePositionToGenome(seqIndex, forwardTPos); + + if (alignment->tIsSubstring == false) { + alignment->tAlignedSeq.Free(); + } + alignment->tAlignedSeqPos = newTAlignedSeqPos; + alignment->tAlignedSeqLength = newTAlignedSeqLen; + if (alignment->tStrand == 0) { + alignment->tAlignedSeq.ReferenceSubstring(genome, newGenomePos, newTAlignedSeqLen); + } else { + // Copy and then reverse complement. + genome.MakeRC(alignment->tAlignedSeq, + newGenomePos + 1 - alignment->tAlignedSeqLength, + alignment->tAlignedSeqLength); + alignment->tIsSubstring = false; + } +} + +// Align a subread of a SMRT sequence to target sequence of an alignment. 
+// Input: +// subread - a subread of a SMRT sequence. +// unrolledRead - the full SMRT sequence. +// alignment - an alignment. +// passDirection - whether or not the subread has the +// same direction as query of the alignment. +// 0 = true, 1 = false. +// subreadInterval - [start, end) interval of the subread in the +// SMRT read. +// subreadIndex - index of the subread in allReadAlignments. +// params - mapping paramters. +// Output: +// allReadAlignments - where the sequence and alignments of the +// subread are saved. +// threadOut - an out stream for debugging the current thread. +void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, + SMRTSequence & subread, SMRTSequence & unrolledRead, + T_AlignmentCandidate * alignment, + int passDirection, ReadInterval & subreadInterval, + int subreadIndex, + MappingParameters & params, + MappingBuffers & mappingBuffers, + ostream & threadOut) { + assert(passDirection == 0 or passDirection == 1); + // + // Determine where in the genome the subread has mapped. + // + DNASequence alignedForwardRefSequence, alignedReverseRefSequence; + + if (alignment->tStrand == 0) { + // This needs to be changed -- copy copies RHS into LHS, + // CopyAsRC copies LHS into RHS + alignedForwardRefSequence.Copy(alignment->tAlignedSeq); + alignment->tAlignedSeq.CopyAsRC(alignedReverseRefSequence); + } + else { + alignment->tAlignedSeq.CopyAsRC(alignedForwardRefSequence); + alignedReverseRefSequence.Copy(alignment->tAlignedSeq); + } + + IDSScoreFunction idsScoreFn; + idsScoreFn.ins = params.insertion; + idsScoreFn.del = params.deletion; + idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); + idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; + idsScoreFn.substitutionPrior = params.substitutionPrior; + + DistanceMatrixScoreFunction distScoreFn2( + SMRTDistanceMatrix, params.indel, params.indel); + // + // Determine the strand to align the subread to. 
+ // + T_AlignmentCandidate exploded; + bool sameAlignmentPassDirection = (alignment->tStrand == passDirection); + bool computeProbIsFalse = false; + DNASequence & alignedRefSequence = (sameAlignmentPassDirection? + alignedForwardRefSequence:alignedReverseRefSequence); + // + // In the original code, parameters: bandSize=10, alignType=Global, + // sdpTupleSize=4 (instead of 12, Local and 6) were used when + // alignment & pass have different directions. + // + int explodedScore = GuidedAlign(subread, alignedRefSequence, + idsScoreFn, 12, params.sdpIns, params.sdpDel, + params.indelRate, mappingBuffers, exploded, + Local, computeProbIsFalse, 6); + + if (params.verbosity >= 3) { + threadOut << "zmw " << unrolledRead.zmwData.holeNumber + << ", subreadIndex " << subreadIndex + << ", passDirection " << passDirection + << ", subreadInterval [" << subreadInterval.start + << ", " << subreadInterval.end << ")" << endl + << "Exploded score " << explodedScore << endl + << "StickPrintAlignment subread-reference alignment which has" + << " the " << (sameAlignmentPassDirection?"same":"different") + << " direction as the ccs-reference (or the " + << "longestSubread-reference) alignment. " << endl + << "subread: " << endl; + static_cast(&subread)->PrintSeq(threadOut); + threadOut << endl; + threadOut << "alignedRefSeq: " << endl; + static_cast(&alignedRefSequence)->PrintSeq(threadOut); + StickPrintAlignment(exploded, (DNASequence&) subread, + (DNASequence&) alignedRefSequence, + threadOut, exploded.qAlignedSeqPos, + exploded.tAlignedSeqPos); + } + + if (exploded.blocks.size() > 0) { + DistanceMatrixScoreFunction distScoreFn( + SMRTDistanceMatrix, params.indel, params.indel); + ComputeAlignmentStats(exploded, subread.seq, + alignedRefSequence.seq, + distScoreFn2); + if (exploded.score <= params.maxScore) { + // + // The coordinates of the alignment should be + // relative to the reference sequence (the specified chromosome, + // not the whole genome). 
+ // + exploded.qStrand = 0; + exploded.tStrand = sameAlignmentPassDirection?0:1; + exploded.qLength = unrolledRead.length; + exploded.tLength = alignment->tLength; + exploded.tAlignedSeq.Copy(alignedRefSequence); + exploded.tAlignedSeqPos = (passDirection == 0)? + (alignment->tAlignedSeqPos): + (exploded.tLength - alignment->tAlignedSeqPos + - alignment->tAlignedSeqLength); + exploded.tAlignedSeqLength = alignment->tAlignedSeqLength; + + exploded.qAlignedSeq.ReferenceSubstring(subread); + exploded.qAlignedSeqPos = subreadInterval.start; + exploded.qAlignedSeqLength = subreadInterval.end - subreadInterval.start; + exploded.mapQV = alignment->mapQV; + exploded.tName = alignment->tName; + exploded.tIndex = alignment->tIndex; + + stringstream namestrm; + namestrm << "/" << subreadInterval.start + << "_" << subreadInterval.end; + exploded.qName = string(unrolledRead.title) + namestrm.str(); + + // + // Don't call AssignRefContigLocation as the coordinates + // of the alignment is already relative to the chromosome coordiantes. + // + // Save this alignment for printing later. + // + T_AlignmentCandidate *alignmentPtr = new T_AlignmentCandidate; + // Refine concordant alignments + if (params.refineConcordantAlignments) { + vector vquery; + vquery.push_back(&unrolledRead); + RefineAlignment(vquery, alignedRefSequence, exploded, params, mappingBuffers); + } + + *alignmentPtr = exploded; + // + // Check if need to be filtered + // For now filtering only in concordant mode + // Later add filtration in other modes + // + if (allReadAlignments.alignMode == ZmwSubreads) { + if (params.filterCriteria.Satisfy(alignmentPtr)) { + if (params.verbosity > 3) { + std::cerr << " Filters passed. Adding slave alignment in concordant mode" << std::endl; + } + allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); + } + else { + // delete alignment immediately + if (params.verbosity > 3) { + std::cerr << " Filters failed. 
Delete alignment immediately" << std::endl; + } + delete alignmentPtr; + } + } + // for all modes except ZmwSubreads no filtering for now + else { + allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); + } + } // End of exploded score <= maxScore. + if (params.verbosity >= 3) { + threadOut << "exploded score: " << exploded.score << endl + << "exploded alignment: "<< endl; + exploded.Print(threadOut); + threadOut << endl; + } + } // End of exploded.blocks.size() > 0. +} diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrHeaders.h blasr-5.3/iblasr/BlasrHeaders.h --- blasr-0~20151014+git8e668be/iblasr/BlasrHeaders.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrHeaders.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,114 @@ +#pragma once + +#ifdef __GLIBC__ +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__GLIBC__) || defined(__APPLE__) +# include +#endif + +#define MAX_PHRED_SCORE 254 +#define MAPQV_END_ALIGN_WIGGLE 5 + +using namespace std; + +#include +#ifdef USE_PBBAM +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MappingIPC.h" +#include "MappingSemaphores.h" +#include "MappingBuffers.hpp" +#include "ReadAlignments.hpp" + + +typedef SMRTSequence T_Sequence; +typedef FASTASequence 
T_GenomeSequence; +typedef DNASuffixArray T_SuffixArray; +typedef DNATuple T_Tuple; +typedef LISPValueWeightor > PValueWeightor; +typedef LISSMatchFrequencyPValueWeightor > MultiplicityPValueWeightor; +typedef MappingData MappingIPC; diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrMiscs.hpp blasr-5.3/iblasr/BlasrMiscs.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrMiscs.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrMiscs.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,62 @@ +// Author: Mark Chaisson +#pragma once + +#include "BlasrHeaders.h" + +//-------------------------Fetch Reads----------------------------// +template +bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, + MappingParameters ¶ms, + T_Sequence &read, + string & readGroupId, + int & associatedRandInt, + MappingSemaphores & semaphores); + +//---------------------MAKE & CHECK READS-------------------------// +//FIXME: move to SMRTSequence +bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence); + +//FIXME: Move to SMRTSequence +// Given a SMRT sequence and a subread interval, make the subread. +// Input: +// smrtRead - a SMRT sequence +// subreadInterval - a subread interval +// params - mapping parameters +// Output: +// subreadSequence - the constructed subread +void MakeSubreadOfInterval(SMRTSequence & subreadSequence, + SMRTSequence & smrtRead, + ReadInterval & subreadInterval, + MappingParameters & params); + +//FIXME: Move to SMRTSequence +// Given a SMRT sequence and one of its subreads, make the +// reverse complement of the subread in the coordinate of the +// reverse complement sequence of the SMRT sequence. +// Input: +// smrtRead - a SMRT read +// subreadSequence - a subread of smrtRead +// Output: +// subreadSequenceRC - the reverse complement of the subread +// in the coordinate of the reverse +// complement of the SMRT read. 
+void MakeSubreadRC(SMRTSequence & subreadSequenceRC, + SMRTSequence & subreadSequence, + SMRTSequence & smrtRead); + +// Construct subreads invervals from subreads +void MakeSubreadIntervals(vector & subreads, + vector & subreadIntervals); + +// Return index of subread which will be used as concordant template. +// If Zmw has exactly one subread, return index of the subread (i.e., 0). +// If Zmw has exactly two subreads, return index of the longer subread. +// If Zmw has three or more subreads, return index of the median-length +// subread in range subreadIntervals[1:-1]. Avoid using the first and last +// subreads (which are less likely to be full-pass) if possible. +int GetIndexOfConcordantTemplate(const vector & subreadIntervals); + +//-------------------------MISC-----------------------------------// +int CountZero(unsigned char *ptr, int length); + +#include "BlasrMiscsImpl.hpp" diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrMiscsImpl.hpp blasr-5.3/iblasr/BlasrMiscsImpl.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrMiscsImpl.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrMiscsImpl.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,179 @@ +// Author: Mark Chaisson +#pragma once + +#include + +template +bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, + MappingParameters ¶ms, + T_Sequence &read, + string & readGroupId, + int & associatedRandInt, + MappingSemaphores & semaphores) +{ + // Wait on a semaphore + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_wait(semaphores.reader); +#else + sem_wait(&semaphores.reader); +#endif + } + + bool returnValue = true; + // + // CCS Reads are read differently from other reads. Do static casting here + // of this. + // + if (reader.GetNext(read, associatedRandInt) == 0) { + returnValue = false; + } + + // + // Set the read group id before releasing the semaphore, since other + // threads may change the reader object to a new read group before + // sending this alignment out to printing. 
+ readGroupId = reader.readGroupId; + + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_post(semaphores.reader); +#else + sem_post(&semaphores.reader); +#endif + } + return returnValue; +} + + + +bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence) +{ + if (sequence.qual.Empty() == true) { + return 0; + } + else { + int numZero=0, numNonZero=0; + if (sequence.qual.data == NULL) { + return false; + } + numZero = CountZero(sequence.qual.data, sequence.length); + numNonZero = sequence.length - numZero; + int subNumZero = 0, subNonZero = 0; + + if (sequence.substitutionQV.data == NULL) { + return false; + } + subNumZero = CountZero(sequence.substitutionQV.data, sequence.length); + subNonZero = sequence.length - subNumZero; + + if (numZero < 0.5*numNonZero and subNumZero < 0.5 * subNonZero) { + return true; + } + else { + return false; + } + } +} + +// Given a SMRT sequence and a subread interval, make the subread. +// Input: +// smrtRead - a SMRT sequence +// subreadInterval - a subread interval +// params - mapping parameters +// Output: +// subreadSequence - the constructed subread +void MakeSubreadOfInterval(SMRTSequence & subreadSequence, + SMRTSequence & smrtRead, + ReadInterval & subreadInterval, + MappingParameters & params) +{ + int start = subreadInterval.start; + int end = subreadInterval.end; + + assert(smrtRead.length >= subreadSequence.length); + smrtRead.MakeSubreadAsMasked(subreadSequence, start, end); + + if (!params.preserveReadTitle) { + smrtRead.SetSubreadTitle(subreadSequence, + subreadSequence.SubreadStart(), + subreadSequence.SubreadEnd()); + } + else { + subreadSequence.CopyTitle(smrtRead.title); + } + subreadSequence.zmwData = smrtRead.zmwData; +} + +// Given a SMRT sequence and one of its subreads, make the +// reverse complement of the subread in the coordinate of the +// reverse complement sequence of the SMRT sequence. 
+// Input: +// smrtRead - a SMRT read +// subreadSequence - a subread of smrtRead +// Output: +// subreadSequenceRC - the reverse complement of the subread +// in the coordinate of the reverse +// complement of the SMRT read. +void MakeSubreadRC(SMRTSequence & subreadSequenceRC, + SMRTSequence & subreadSequence, + SMRTSequence & smrtRead) +{ + assert(smrtRead.length >= subreadSequence.length); + // Reverse complement sequence of the subread. + subreadSequence.MakeRC(subreadSequenceRC); + // Update start and end positions of subreadSequenceRC in the + // coordinate of reverse compelement sequence of the SMRT read. + subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd()); + subreadSequenceRC.SubreadEnd (smrtRead.length - subreadSequence.SubreadStart()); + subreadSequenceRC.zmwData = smrtRead.zmwData; +} + +int CountZero(unsigned char *ptr, int length) +{ + int i; + int nZero = 0; + for (i = 0; i < length; i++) { + if (ptr[i] == 0) { ++nZero; } + } + return nZero; +} + +void MakeSubreadIntervals(vector & subreads, + vector & subreadIntervals) +{ + subreadIntervals.clear(); + for (auto subread: subreads) { + subreadIntervals.push_back(ReadInterval(subread.SubreadStart(), + subread.SubreadEnd(), subread.highQualityRegionScore)); + } +} + +int GetIndexOfConcordantTemplate(const vector & subreadIntervals) +{ + assert(subreadIntervals.size() != 0); + if (subreadIntervals.size() == 1) return 0; // Zmw has exactly one subread. + else if (subreadIntervals.size() == 2) { + // Zmw has two subreads, return index of the longer one. + const ReadInterval & first = subreadIntervals[0]; + const ReadInterval & second = subreadIntervals[1]; + if (first.Length() < second.Length()) return 1; + else return 0; + } else { + // Zmw has more than two subreads, look for the median-length subread + // in subreadIntervals[1:-1]. The first and last subreads are not + // considered because they are usually non-full-pass. 
+ vector intervals; + intervals.insert(intervals.begin(), subreadIntervals.begin() + 1, subreadIntervals.end() - 1); + std::sort(intervals.begin(), intervals.end(), + [](const ReadInterval& a, const ReadInterval& b)->bool + {return a.Length() < b.Length();}); + const ReadInterval & template_interval = intervals[int(intervals.size()/2)]; + for (int pos = 1; pos < int(subreadIntervals.size()) -1; pos ++) { + if (subreadIntervals[pos] == template_interval) { + return pos; + } + } + } + return 0; +} diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrUtils.hpp blasr-5.3/iblasr/BlasrUtils.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrUtils.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrUtils.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,170 @@ +// Author: Mark Chaisson +#pragma once + +#include "BlasrHeaders.h" + +//----------------------MODIFY ALIGNMENTS--------------------------// +//FIXME: refactor class SequenceIndexDatabase +void AssignRefContigLocation(T_AlignmentCandidate &alignment, + SequenceIndexDatabase &seqdb, + DNASequence &genome); + +//FIXME: refactor class SequenceIndexDatabase +void AssignRefContigLocations(vector &alignmentPtrs, + SequenceIndexDatabase &seqdb, + DNASequence &genome); + +template +//FIXME: refactor class SequenceIndexDatabase +void AssignGenericRefContigName(vector &alignmentPtrs, + T_RefSequence &genome); + +//FIXME: move to class ReadAlignments +void StoreRankingStats(vector &alignments, + VarianceAccumulator &accumPValue, + VarianceAccumulator &accumWeight); + +//FIXME: mapQV should be assigned when alignments are created. 
+void AssignMapQV(vector &alignmentPtrs); + +//FIXME: move to class ReadAlignments +void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, + MappingParameters ¶ms); + +void StoreMapQVs(SMRTSequence &read, + vector &alignmentPtrs, + MappingParameters ¶ms); + + +//--------------------SEARCH & CHECK ALIGNMENTS-------------------// +//FIXME: move to class ReadAlignments +template +bool CheckForSufficientMatch(T_Sequence &read, + vector &alignmentPtrs, + MappingParameters ¶ms); + +//FIXME: move to class ReadAlignments +int FindMaxLengthAlignment(vector alignmentPtrs, + int &maxLengthIndex); + +//FIXME: move to class T_AlignmentCandidate +void SumMismatches(SMRTSequence &read, + T_AlignmentCandidate &alignment, + int mismatchScore, + int fullIntvStart, int fullIntvEnd, + MappingParameters ¶ms, + int &sum); + +//FIXME: move to class T_AlignmentCandidate +/// \returns whether two alignments overlap by more than minPcercentOverlap% +bool AlignmentsOverlap(T_AlignmentCandidate &alnA, + T_AlignmentCandidate &alnB, + float minPercentOverlap); + +/// \Partition overlapping alignments. +void PartitionOverlappingAlignments(vector &alignmentPtrs, + vector > &partitions, + float minOverlap); + + +//--------------------FILTER ALIGNMENTS---------------------------// +//FIXME: move to class T_AlignmentCandidate and ReadAlignments +int RemoveLowQualitySDPAlignments(int readLength, + vector &alignmentPtrs, + MappingParameters ¶ms); + +//FIXME: move to class ReadAlignments +template +int RemoveLowQualityAlignments(T_Sequence &read, + vector &alignmentPtrs, + MappingParameters ¶ms); + +//FIXME: move to class ReadAlignments +int RemoveOverlappingAlignments(vector &alignmentPtrs, + MappingParameters ¶ms); + +// FIXME: move to class ReadAlignments +// Delete all alignments from index startIndex in vector, inclusive. 
+void DeleteAlignments(vector &alignmentPtrs, + int startIndex=0); + +//--------------------REFINE ALIGNMENTS---------------------------// +template +void RefineAlignment(vector &bothQueryStrands, + T_RefSequence &genome, + T_AlignmentCandidate &alignmentCandidate, + MappingParameters ¶ms, + MappingBuffers &mappingBuffers); + + +template +void RefineAlignments(vector &bothQueryStrands, + T_RefSequence &genome, + vector &alignmentPtrs, + MappingParameters ¶ms, + MappingBuffers &mappingBuffers); + + +//--------------------PRINT ALIGNMENTS---------------------------// +vector +SelectAlignmentsToPrint(vector alignmentPtrs, + MappingParameters & params, + const int & associatedRandInt); + +// +// The full read is not the subread, and does not have masked off characters. +// +void PrintAlignment(T_AlignmentCandidate &alignment, + SMRTSequence &fullRead, + MappingParameters ¶ms, + AlignmentContext &alignmentContext, + ostream &outFile +#ifdef USE_PBBAM + , SMRTSequence &subread + , PacBio::BAM::IRecordWriter * bamWriterPtr +#endif + ); + +// Print all alignments in vector alignmentPtrs +void PrintAlignments(vector alignmentPtrs, + SMRTSequence &read, + MappingParameters ¶ms, ostream &outFile, + AlignmentContext alignmentContext, +#ifdef USE_PBBAM + SMRTSequence &subread, + PacBio::BAM::IRecordWriter * bamWriterPtr, +#endif + MappingSemaphores & semaphores); + +void PrintAlignmentPtrs(vector & alignmentPtrs, + ostream & out = cout); + + +// Print an unaligned read, if noPrintUnalignedSeqs is True, print title only; +// otherwise, print title and sequence of the read. +void PrintUnaligned(const SMRTSequence & unalignedRead, + ostream & unalignedFilePtr, + const bool noPrintUnalignedSeqs); + +// Print all alignments for subreads in allReadAlignments. +// Input: +// allReadAlignments - contains a set of subreads, each of which +// is associated with a group of alignments. +// alignmentContext - an alignment context of each subread used +// for printing in SAM format. 
+// params - mapping parameters. +// Output: +// outFilePtr - where to print alignments for subreads. +// unalignedFilePtr - where to print sequences for unaligned subreads. +void PrintAllReadAlignments(ReadAlignments & allReadAlignments, + AlignmentContext & alignmentContext, + ostream & outFilePtr, + ostream & unalignedFilePtr, + MappingParameters & params, + vector & subreads, +#ifdef USE_PBBAM + PacBio::BAM::IRecordWriter * bamWriterPtr, +#endif + MappingSemaphores & semaphores); + +#include "BlasrUtilsImpl.hpp" diff -Nru blasr-0~20151014+git8e668be/iblasr/BlasrUtilsImpl.hpp blasr-5.3/iblasr/BlasrUtilsImpl.hpp --- blasr-0~20151014+git8e668be/iblasr/BlasrUtilsImpl.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/BlasrUtilsImpl.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,1212 @@ +// Author: Mark Chaisson +#pragma once + +#include "BlasrAlign.hpp" + + +//----------------------MODIFY ALIGNMENTS--------------------------// +void AssignRefContigLocation(T_AlignmentCandidate &alignment, + SequenceIndexDatabase &seqdb, + DNASequence &genome) +{ + // + // If the sequence database is used, the start position of + // the alignment is relative to the start of the chromosome, + // not the entire index. Subtract off the start position of + // the chromosome to get the true position. + // + DNALength forwardTPos; + int seqDBIndex; + if (alignment.tStrand == 0) { + forwardTPos = alignment.tAlignedSeqPos; + seqDBIndex = seqdb.SearchForIndex(forwardTPos); + alignment.tAlignedSeqPos -= seqdb.seqStartPos[seqDBIndex]; + } + else { + // + // Flip coordinates into forward strand in order to find the boundaries + // of the contig, then reverse them in order to find offset. + // + + // Find the reverse complement coordinate of the index of the last aligned base. 
+ assert(alignment.tAlignedSeqLength > 0); + forwardTPos = genome.MakeRCCoordinate(alignment.tAlignedSeqPos + alignment.tAlignedSeqLength - 1); + seqDBIndex = seqdb.SearchForIndex(forwardTPos); + + + // + // Find the reverse comlement coordinate of the last base of this + // sequence. This would normally be the start of the next contig + // -1 to get the length, but since an 'N' is added between every + // pair of sequences, this is -2. + // + DNALength reverseTOffset; + reverseTOffset = genome.MakeRCCoordinate(seqdb.seqStartPos[seqDBIndex+1]-2); + alignment.tAlignedSeqPos -= reverseTOffset; + } +} + +void AssignRefContigLocations(vector &alignmentPtrs, + SequenceIndexDatabase &seqdb, + DNASequence &genome) +{ + + UInt i; + for (i = 0; i < alignmentPtrs.size(); i++) { + T_AlignmentCandidate *aref = alignmentPtrs[i]; + AssignRefContigLocation(*aref, seqdb, genome); + } +} + +template +void AssignGenericRefContigName(vector &alignmentPtrs, + T_RefSequence &genome) { + UInt i; + for (i = 0; i < alignmentPtrs.size(); i++) { + T_AlignmentCandidate *aref = alignmentPtrs[i]; + aref->tName = genome.title; + } +} + + +void StoreRankingStats(vector &alignments, + VarianceAccumulator &accumPValue, + VarianceAccumulator &accumWeight) { + int i; + for (i = 0; i < int(alignments.size()); i++) { + alignments[i]->pvalVariance = accumPValue.GetVariance(); + alignments[i]->pvalNStdDev = accumPValue.GetNStdDev(alignments[i]->clusterScore); + alignments[i]->weightVariance = accumWeight.GetVariance(); + alignments[i]->weightNStdDev = accumWeight.GetNStdDev(alignments[i]->clusterWeight); + } +} + +void AssignMapQV(vector &alignmentPtrs) { + int i; + int mapQV = 1; + if (alignmentPtrs.size() > 1 and alignmentPtrs[0]->score == alignmentPtrs[1]->score) { + // the top two alignments have the same score, don't consider them as mapped. 
+ mapQV = 0; + } + + for (i = 0; i < int(alignmentPtrs.size()); i++) { + alignmentPtrs[i]->mapQV = mapQV; + } +} + +void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, + MappingParameters ¶ms) +{ + if (alignment.numSignificantClusters > int(params.nCandidates)) { + alignment.mapQV = Phred((1-InversePhred(alignment.mapQV))* ((float)params.nCandidates / alignment.numSignificantClusters)); + } + else if (alignment.numSignificantClusters == 0) { + alignment.mapQV = 0; + } +} + +void StoreMapQVs(SMRTSequence &read, + vector &alignmentPtrs, + MappingParameters ¶ms) +{ + // + // Only weight alignments for mapqv against eachother if they are overlapping. + // + int a; + vector > partitions; // Each set contains alignments that overlap on the read. + DistanceMatrixScoreFunction distScoreFn; + distScoreFn.del = params.deletion; + distScoreFn.ins = params.insertion; + // bug 24363, set affineOpen and affineExtend for distScoreFn + distScoreFn.affineOpen = params.affineOpen; + distScoreFn.affineExtend = params.affineExtend; + distScoreFn.InitializeScoreMatrix(SMRTLogProbMatrix); + IDSScoreFunction idsScoreFn; + idsScoreFn.ins = params.insertion; + idsScoreFn.del = params.deletion; + idsScoreFn.affineExtend = params.affineExtend; + idsScoreFn.affineOpen = params.affineOpen; + idsScoreFn.substitutionPrior = params.substitutionPrior; + idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; + + // + // Rescore the alignment so that it uses probabilities. + // + for (a = 0; a < int(alignmentPtrs.size()); a++) { + if (params.ignoreQualities == false) { + // bug 24363, pass -affineAlign to compute correct alignment score. 
+ alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], + alignmentPtrs[a]->qAlignedSeq, + alignmentPtrs[a]->tAlignedSeq, + idsScoreFn, + params.affineAlign) / 10.0; + } + else { + alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], + alignmentPtrs[a]->qAlignedSeq, + alignmentPtrs[a]->tAlignedSeq, + distScoreFn, + params.affineAlign) / 10.0; + } + } + PartitionOverlappingAlignments(alignmentPtrs, partitions, params.minFractionToBeConsideredOverlapping); + + int p; + set::iterator partIt, partEnd; + + // + // For each partition, store where on the read it begins, and where + // it ends. + // + vector partitionBeginPos, partitionEndPos; + partitionBeginPos.resize(partitions.size()); + partitionEndPos.resize(partitions.size()); + fill(partitionBeginPos.begin(), partitionBeginPos.end(), -1); + fill(partitionEndPos.begin(), partitionEndPos.end(), -1); + vector assigned; + assigned.resize( alignmentPtrs.size()); + fill(assigned.begin(), assigned.end(), false); + + for (p = 0; p < int(partitions.size()); p++) { + partEnd = partitions[p].end(); + int alnStart, alnEnd; + + if (partitions[p].size() > 0) { + partIt = partitions[p].begin(); + alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); + partitionBeginPos[p] = alnStart; + partitionEndPos[p] = alnEnd; + ++partIt; + partEnd = partitions[p].end(); + for (; partIt != partEnd; ++partIt) { + // Comment out because all reads are now in the forward strand. + // alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); + alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); + if (alnEnd - alnStart > partitionEndPos[p] - partitionBeginPos[p]) { + partitionBeginPos[p] = alnStart; + partitionEndPos[p] = alnEnd; + } + } + } + } + + // + // For each partition, determine the widest parts of the read that + // are aligned in the partition. All alignments will be extended to + // the end of the widest parts of the partition. 
+ // + const static bool convertToForwardStrand = true; + + UInt i; + + // + // For now, just use the alignment score as the probability score. + // Although it is possible to use the full forward probability, for + // the most part it is pretty much the same as the Vitterbi + // probability, but it takes a lot longer to compute. + // + + // + // Now estimate what the alignment scores would be if they were + // extended past the ends of their current alignment. + // + + for (p = 0; p < int(partitions.size()); p++) { + partEnd = partitions[p].end(); + int alnStart, alnEnd; + for (partIt = partitions[p].begin(); partitions[p].size() > 0 and partIt != partEnd; ++partIt) { + int mismatchSum = 0; + alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); + if (alnStart - partitionBeginPos[p] > MAPQV_END_ALIGN_WIGGLE or + partitionEndPos[p] - alnEnd > MAPQV_END_ALIGN_WIGGLE) { + // bug 24363, use updated SumMismatches to compute mismatch score when + // no QV is available. 
+ SumMismatches(read, *alignmentPtrs[*partIt], 15, + partitionBeginPos[p], partitionEndPos[p], params, mismatchSum); + } + // + // Random sequence can be aligned with about 50% similarity due + // to optimization, so weight the qv sum + // + alignmentPtrs[*partIt]->probScore += -(mismatchSum) * 0.5; + } + } + + // + // Determine mapqv by summing qvscores in partitions + + float mapQVDenominator = 0; + for (p = 0; p < int(partitions.size()); p++) { + set::iterator nextIt; + if (partitions[p].size() == 0) { + continue; + } + int index = *partitions[p].begin(); + + mapQVDenominator = alignmentPtrs[index]->probScore; + + if (partitions[p].size() > 1) { + partIt = partitions[p].begin(); + partEnd = partitions[p].end(); + ++partIt; + + for (; partIt != partEnd; ++partIt) { + index = *partIt; + mapQVDenominator = LogSumOfTwo(mapQVDenominator, alignmentPtrs[index]->probScore); + } + } + + + for (partIt = partitions[p].begin(); + partIt != partitions[p].end(); ++partIt) { + // + // If only one alignment is found, assume maximum mapqv. + // + assigned[*partIt] = true; + if (partitions[p].size() == 1) { + alignmentPtrs[*partIt]->mapQV = MAX_PHRED_SCORE; + } + + // + // Look for overflow. 
+            //
+            else if (alignmentPtrs[*partIt]->probScore - mapQVDenominator < -20) {
+                alignmentPtrs[*partIt]->mapQV = 0;
+            }
+            else {
+                double log10 = log(10);
+                double sub = alignmentPtrs[*partIt]->probScore - mapQVDenominator;
+                double expo = exp(log10*sub);
+                double diff = 1.0 - expo;
+                int phredValue;
+
+                if (expo == 0) {
+                    phredValue = 0;
+                }
+                else if (diff == 0) {
+                    phredValue = MAX_PHRED_SCORE;
+                }
+                else {
+                    phredValue = Phred(diff);
+                }
+                if (phredValue > MAX_PHRED_SCORE) {
+                    phredValue = MAX_PHRED_SCORE;
+                }
+
+                alignmentPtrs[*partIt]->mapQV = phredValue;
+                assigned[*partIt]=true;
+            }
+
+            if (params.scaleMapQVByNumSignificantClusters) {
+                ScaleMapQVByClusterSize(*alignmentPtrs[*partIt], params);
+            }
+        }
+    }
+
+    for (i = 0; i < assigned.size(); i++) {
+        assert(assigned[i]);
+    }
+}
+
+//--------------------SEARCH & CHECK ALIGNMENTS-------------------//
+// Returns true when there is at least one alignment and the first one
+// scores below params.maxScore. The read argument is unused.
+template<typename T_Sequence>
+bool CheckForSufficientMatch(T_Sequence &read,
+                             vector<T_AlignmentCandidate*> &alignmentPtrs,
+                             MappingParameters &params)
+{
+    (void)(read);
+    if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore) {
+        return true;
+    }
+    else {
+        return false;
+    }
+}
+
+// Find the alignment that covers the longest query interval
+// (qEnd - qStart as reported by GetQInterval).
+// Output:
+//   maxLengthIndex - index of that alignment in alignmentPtrs, or -1
+//                    when no alignment covers a positive-length interval.
+// Returns non-zero if such an alignment was found, 0 otherwise.
+int FindMaxLengthAlignment(vector<T_AlignmentCandidate*> alignmentPtrs,
+                           int &maxLengthIndex)
+{
+    int i;
+    int maxLength = 0;
+    maxLengthIndex = -1;
+
+    for (i = 0; i < int(alignmentPtrs.size()); i++) {
+        int qStart, qEnd;
+        alignmentPtrs[i]->GetQInterval(qStart, qEnd);
+        if (qEnd - qStart > maxLength) {
+            maxLengthIndex = i;
+            maxLength = qEnd - qStart;
+        }
+    }
+    // maxLength starts at 0 and only grows, so comparing it with -1 was
+    // always true. The index is what records whether anything was found.
+    return (maxLengthIndex != -1);
+}
+
+void SumMismatches(SMRTSequence &read,
+                   T_AlignmentCandidate &alignment,
+                   int mismatchScore,
+                   int fullIntvStart, int fullIntvEnd,
+                   MappingParameters &params,
+                   int &sum)
+{
+    int alnStart, alnEnd;
+    alignment.GetQIntervalOnForwardStrand(alnStart, alnEnd);
+    int p;
+    sum = 0;
+    if (not params.ignoreQualities and read.substitutionQV.Empty() == false) {
+        for (p = fullIntvStart; p < alnStart; p++) {
+            sum += read.substitutionQV[p];
+        }
+        for (p = alnEnd; p < fullIntvEnd; p++) {
+            sum +=
read.substitutionQV[p]; + } + } else { + // bug 24363, compute mismatch score when QV is not available. + sum += mismatchScore * ((alnStart - fullIntvStart) + (fullIntvEnd - alnEnd)); + } +} + + +bool AlignmentsOverlap(T_AlignmentCandidate &alnA, + T_AlignmentCandidate &alnB, + float minPercentOverlap) +{ + int alnAStart, alnAEnd, alnBStart, alnBEnd; + bool useForwardStrand=true; + alnA.GetQInterval(alnAStart, alnAEnd, useForwardStrand); + alnB.GetQInterval(alnBStart, alnBEnd, useForwardStrand); + // Look if one alignment encompasses the other + int ovp = 0; + if (alnAStart <= alnBStart and alnAEnd >= alnBEnd) { + return true; + } + else if (alnBStart <= alnAStart and alnBEnd >= alnAEnd) { + return true; + //ovp = alnAEnd - alnAStart; + } + else { + // + // Look to see if the alignments overlap + // + + if (alnAEnd >= alnBStart and alnAEnd <= alnBEnd) { + ovp = alnAEnd - alnBStart; + } + else if (alnAStart >= alnBStart and alnAStart <= alnBEnd) { + ovp = alnBEnd - alnAStart; + } + } + + // float ovpPercent = (2.0*ovp) / ((alnAEnd - alnAStart) + (alnBEnd - alnBStart)); + float ovpPercent = 0; + if (alnAEnd - alnAStart > 0 and alnBEnd - alnBStart > 0) { + // overlap percentage: maximum overlap percent in A and B. + ovpPercent = max(float(ovp)/float(alnAEnd - alnAStart), + float(ovp)/float(alnBEnd - alnBStart)); + } + + // returns true when an overlap is found. 
+ return (ovpPercent > minPercentOverlap); +} + +void PartitionOverlappingAlignments(vector &alignmentPtrs, + vector > &partitions, + float minOverlap) { + if (alignmentPtrs.size() == 0) { + partitions.clear(); + return; + } + + set::iterator setIt, setEnd; + int i, p; + bool overlapFound = false; + for (i = 0; i < int(alignmentPtrs.size()); i++) { + overlapFound = false; + for (p = 0; p < int(partitions.size()) and overlapFound == false; p++) { + setEnd = partitions[p].end(); + for (setIt = partitions[p].begin(); setIt != partitions[p].end() and overlapFound == false; ++setIt) { + if (AlignmentsOverlap(*alignmentPtrs[i], *alignmentPtrs[*setIt], minOverlap) or + ((alignmentPtrs[i]->QAlignStart() <= alignmentPtrs[*setIt]->QAlignStart()) and + (alignmentPtrs[i]->QAlignEnd() > alignmentPtrs[*setIt]->QAlignEnd()))) { + partitions[p].insert(i); + overlapFound = true; + } + } + } + // + // If this alignment does not overlap any other, create a + // partition with it as the first element. + // + if (overlapFound == false) { + partitions.push_back(set()); + partitions[partitions.size()-1].insert(i); + } + } +} + +//--------------------FILTER ALIGNMENTS---------------------------// +int RemoveLowQualitySDPAlignments(int readLength, + vector &alignmentPtrs, + MappingParameters ¶ms) +{ + // Just a hack. For now, assume there is at least 1 match per 50 bases. 
+    // Minimum number of matched bases required of each alignment, scaled
+    // from the SDP tuple size and the read length. It does not depend on
+    // the alignment, so compute it once.
+    const int expectedMatches = params.sdpTupleSize/50.0 * readLength;
+    int a;
+    for (a = 0; a < int(alignmentPtrs.size()); a++) {
+        // Count matched bases for this alignment only. A running total
+        // shared by all alignments would let later candidates pass the
+        // threshold on the strength of earlier ones.
+        int totalBasesMatched = 0;
+        int b;
+        for (b = 0; b < int(alignmentPtrs[a]->blocks.size()); b++) {
+            totalBasesMatched += alignmentPtrs[a]->blocks[b].length;
+        }
+        if (totalBasesMatched < expectedMatches) {
+            delete alignmentPtrs[a];
+            alignmentPtrs[a] = NULL;
+        }
+    }
+    // Pack the surviving (non-NULL) pointers to the front, then drop the tail.
+    int packedAlignmentIndex = 0;
+    for (a = 0; a < int(alignmentPtrs.size()); a++) {
+        if (alignmentPtrs[a] != NULL) {
+            alignmentPtrs[packedAlignmentIndex] = alignmentPtrs[a];
+            packedAlignmentIndex++;
+        }
+    }
+    alignmentPtrs.resize(packedAlignmentIndex);
+    return packedAlignmentIndex;
+}
+
+template<typename T_Sequence>
+int RemoveLowQualityAlignments(T_Sequence &read,
+                               vector<T_AlignmentCandidate*> &alignmentPtrs,
+                               MappingParameters &params)
+{
+    PB_UNUSED(read);
+    if (params.verbosity > 0) {
+        cout << "checking at least " << alignmentPtrs.size() << " alignments to see if they are accurate." << endl;
+    }
+    for (size_t i = 0; i < MIN(static_cast<size_t>(params.nCandidates), alignmentPtrs.size()); i++) {
+        if (params.verbosity > 0) {
+            cout << "Quality check " << i << " " << alignmentPtrs[i]->score << endl;
+        }
+        if (alignmentPtrs[i]->blocks.size() == 0 or
+            alignmentPtrs[i]->score > params.maxScore) {
+            //
+            // Since the alignments are sorted according to alignment
+            // score, once one of the alignments is too low of a score,
+            // all remaining alignments are also too low, and should be
+            // removed as well. Do that all at once.
+            //
+            if (alignmentPtrs[i]->blocks.size() == 0 and params.verbosity > 0) {
+                cout << "Removing empty alignment " << alignmentPtrs[i]->qName << endl;
+            }
+            if (params.verbosity > 0) {
+                cout << alignmentPtrs[i]->qName << " alignment " << i << " is too low of a score."
<< alignmentPtrs[i]->score << endl; + } + for (size_t deletedIndex = i; deletedIndex < alignmentPtrs.size(); deletedIndex++) { + delete alignmentPtrs[deletedIndex]; + alignmentPtrs[deletedIndex] = NULL; + } + alignmentPtrs.erase(i + alignmentPtrs.begin(), alignmentPtrs.end()); + break; + } + else { + if (params.verbosity > 0) { + cout << "Keeping alignment " << i << " " << alignmentPtrs[i]->qPos << " " << alignmentPtrs[i]->qLength + << " " << alignmentPtrs[i]->tName << " " << alignmentPtrs[i]->tPos << " " << alignmentPtrs[i]->tLength + << " from score: " << alignmentPtrs[i]->score << endl; + } + } + } + return alignmentPtrs.size(); +} + + +//FIXME: move to class ReadAlignments +int RemoveOverlappingAlignments(vector &alignmentPtrs, + MappingParameters ¶ms) +{ + vector alignmentIsContained; + alignmentIsContained.resize(alignmentPtrs.size()); + std::fill(alignmentIsContained.begin(), alignmentIsContained.end(), false); + + int j; + int numContained = 0; + int curNotContained = 0; + + if (alignmentPtrs.size() > 0) { + UInt i; + for (i = 0; i < alignmentPtrs.size()-1; i++ ){ + T_AlignmentCandidate *aref = alignmentPtrs[i]; + if (aref->pctSimilarity < params.minPctSimilarity) { + continue; + } + for (j = i + 1; j < int(alignmentPtrs.size()); j++ ){ + // + // Make sure this alignment isn't already removed. + // + if (alignmentIsContained[j]) { + continue; + } + + // + // Only check for containment if the two sequences are from the same contig. + // + if (alignmentPtrs[i]->tIndex != alignmentPtrs[j]->tIndex) { + continue; + } + + // + // Check for an alignment that is fully overlapping another + // alignment. + if (aref->GenomicTBegin() <= alignmentPtrs[j]->GenomicTBegin() and + aref->GenomicTEnd() >= alignmentPtrs[j]->GenomicTEnd() and + alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { + // + // Alignment i is contained in j is only true if it has a worse score. 
+ // + if (aref->score <= alignmentPtrs[j]->score) { + alignmentIsContained[j] = true; + } + if (params.verbosity >= 2) { + cout << "alignment " << i << " is contained in " << j << endl; + cout << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos << " " + << aref->tAlignedSeqPos + aref->tAlignedSeqLength << " " + << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << endl; + } + } + else if (alignmentPtrs[j]->GenomicTBegin() <= aref->GenomicTBegin() and + alignmentPtrs[j]->GenomicTEnd() >= aref->GenomicTEnd() and + alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { + if (params.verbosity >= 2) { + cout << "ALIGNMENT " << j << " is contained in " << i << endl; + cout << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos << " " + << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << " " + << aref->tAlignedSeqPos + aref->tAlignedSeqLength << endl; + } + if (alignmentPtrs[j]->score <= aref->score) { + alignmentIsContained[i] = true; + } + } + } + } + for (i = 0; i < alignmentPtrs.size(); i++) { + T_AlignmentCandidate *aref = alignmentPtrs[i]; + if (alignmentIsContained[i]) { + delete alignmentPtrs[i]; + alignmentPtrs[i] = NULL; + numContained++; + } + else { + alignmentPtrs[curNotContained] = aref; + ++curNotContained; + } + } + alignmentPtrs.resize(alignmentPtrs.size() - numContained); + } + return alignmentPtrs.size(); +} + +// Delete all alignments from index startIndex in vector, inclusive. 
+// Entries before startIndex are kept: only the deleted tail is removed from
+// the vector. A negative startIndex is treated as 0, and a startIndex at or
+// past the end leaves the vector untouched.
+void DeleteAlignments(vector<T_AlignmentCandidate*> &alignmentPtrs,
+                      int startIndex)
+{
+    if (startIndex < 0) {
+        startIndex = 0;
+    }
+    if (startIndex >= int(alignmentPtrs.size())) {
+        return;
+    }
+    for (int i = startIndex; i < int(alignmentPtrs.size()); i++) {
+        delete alignmentPtrs[i];
+    }
+    // Truncate to startIndex rather than to 0, so pointers that were not
+    // deleted above are not dropped from the vector (and leaked).
+    alignmentPtrs.resize(startIndex);
+}
+
+
+//--------------------REFINE ALIGNMENTS---------------------------//
+template<typename T_RefSequence>
+void RefineAlignment(vector<SMRTSequence*> &bothQueryStrands,
+                     T_RefSequence &genome,
+                     T_AlignmentCandidate &alignmentCandidate,
+                     MappingParameters &params,
+                     MappingBuffers &mappingBuffers)
+{
+    (void)(genome);
+    FASTQSequence qSeq;
+    DNASequence tSeq;
+    DistanceMatrixScoreFunction<DNASequence, FASTQSequence> distScoreFn(
+        SMRTDistanceMatrix, params.deletion, params.insertion);
+
+    DistanceMatrixScoreFunction<DNASequence, FASTQSequence> distScoreFn2(
+        SMRTDistanceMatrix, params.indel, params.indel);
+
+    QualityValueScoreFunction<DNASequence, FASTQSequence> scoreFn;
+    IDSScoreFunction<DNASequence, FASTQSequence> idsScoreFn;
+    idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix);
+    scoreFn.del = params.indel;
+    scoreFn.ins = params.indel;
+    idsScoreFn.ins = params.insertion;
+    idsScoreFn.del = params.deletion;
+    idsScoreFn.affineExtend = params.affineExtend;
+    idsScoreFn.affineOpen = params.affineOpen;
+    idsScoreFn.substitutionPrior = params.substitutionPrior;
+    idsScoreFn.globalDeletionPrior = params.globalDeletionPrior;
+
+    if (params.doGlobalAlignment) {
+        SMRTSequence subread;
+        subread.ReferenceSubstring(*bothQueryStrands[0],
+                                   bothQueryStrands[0]->SubreadStart(),
+                                   (bothQueryStrands[0]->SubreadLength()));
+
+        int drift = ComputeDrift(alignmentCandidate);
+        T_AlignmentCandidate refinedAlignment;
+
+        KBandAlign(subread, alignmentCandidate.tAlignedSeq, SMRTDistanceMatrix,
+                   params.insertion, params.deletion,
+                   drift,
+                   mappingBuffers.scoreMat, mappingBuffers.pathMat,
+                   refinedAlignment, idsScoreFn, Global);
+        refinedAlignment.RemoveEndGaps();
+        ComputeAlignmentStats(refinedAlignment,
+                              subread.seq,
+                              alignmentCandidate.tAlignedSeq.seq,
+                              distScoreFn2);
+        //idsScoreFn);
+
+        alignmentCandidate.blocks = refinedAlignment.blocks;
+        alignmentCandidate.gaps = refinedAlignment.gaps;
+        alignmentCandidate.tPos =
refinedAlignment.tPos; + alignmentCandidate.qPos = refinedAlignment.qPos + bothQueryStrands[0]->SubreadStart(); + alignmentCandidate.score = refinedAlignment.score; + subread.Free(); + } + else if (params.useGuidedAlign) { + T_AlignmentCandidate refinedAlignment; + int lastBlock = alignmentCandidate.blocks.size() - 1; + + + if (alignmentCandidate.blocks.size() > 0) { + + /* + * Refine the alignment without expanding past the current + * boundaries of the sequences that are already aligned. + */ + + // + // NOTE** this only makes sense when + // alignmentCandidate.blocks[0].tPos == 0. Otherwise the length + // of the sequence is not correct. + // + tSeq.Copy(alignmentCandidate.tAlignedSeq, + alignmentCandidate.tPos, + (alignmentCandidate.blocks[lastBlock].tPos + + alignmentCandidate.blocks[lastBlock].length - + alignmentCandidate.blocks[0].tPos)); + + // qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, + qSeq.ReferenceSubstring(*bothQueryStrands[0], + alignmentCandidate.qAlignedSeqPos + alignmentCandidate.qPos, + (alignmentCandidate.blocks[lastBlock].qPos + + alignmentCandidate.blocks[lastBlock].length)); + + if (!params.ignoreQualities && ReadHasMeaningfulQualityValues(alignmentCandidate.qAlignedSeq)) { + if (params.affineAlign) { + AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, + idsScoreFn, params.bandSize, + mappingBuffers, + refinedAlignment, Global, false); + } + else { + GuidedAlign(qSeq, tSeq, alignmentCandidate, + idsScoreFn, params.guidedAlignBandSize, + mappingBuffers, + refinedAlignment, Global, false); + } + } + else { + if (params.affineAlign) { + AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, + distScoreFn, params.bandSize, + mappingBuffers, + refinedAlignment, Global, false); + } + else { + GuidedAlign(qSeq, tSeq, alignmentCandidate, + distScoreFn, params.guidedAlignBandSize, + mappingBuffers, + refinedAlignment, Global, false); + } + } + ComputeAlignmentStats(refinedAlignment, + qSeq.seq, + tSeq.seq, + distScoreFn2, params.affineAlign); 
+ // + // Copy the refine alignment, which may be a subsequence of the + // alignmentCandidate into the alignment candidate. + // + + // First copy the alignment block and gap (the description of + // the base by base alignment). + + alignmentCandidate.blocks.clear(); + alignmentCandidate.blocks = refinedAlignment.blocks; + + alignmentCandidate.CopyStats(refinedAlignment); + + alignmentCandidate.gaps = refinedAlignment.gaps; + alignmentCandidate.score = refinedAlignment.score; + alignmentCandidate.nCells = refinedAlignment.nCells; + + // Next copy the information that describes what interval was + // aligned. Since the reference sequences of the alignment + // candidate have been modified, they are reassigned. + alignmentCandidate.tAlignedSeq.Free(); + alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); + alignmentCandidate.ReassignQSequence(qSeq); + alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; + alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; + + // + // tPos and qPos are the positions within the interval where the + // alignment begins. The refined alignment has adifferent tPos + // and qPos from the alignment candidate. + alignmentCandidate.tPos = refinedAlignment.tPos; + alignmentCandidate.qPos = refinedAlignment.qPos; + + // The lengths of the newly aligned sequences may differ, update those. + alignmentCandidate.tAlignedSeqLength = tSeq.length; + alignmentCandidate.qAlignedSeqLength = qSeq.length; + } + } + else { + + + // + // This assumes an SDP alignment has been performed to create 'alignmentCandidate'. + + // + // Recompute the alignment using a banded smith waterman to + // get rid of any spurious effects of usign the seeded gaps. + // + + // + // The k-banded alignment is over a subsequence of the first + // (sparse dynamic programming, SDP) alignment. The SDP + // alignment is over a large window that may contain the + // candidate sequence. The k-band alignment is over a tighter + // region. 
+ + int drift = ComputeDrift(alignmentCandidate); + + // + // Rescore the alignment with a banded alignment that has a + // better model of sequencing error. + // + + if (alignmentCandidate.blocks.size() == 0 ){ + alignmentCandidate.score = 0; + return; + } + int lastBlock = alignmentCandidate.blocks.size() - 1; + + // + // Assign the sequences that are going to be realigned using + // banded alignment. The SDP alignment does not give that great + // of a score, but it does do a good job at finding a backbone + // alignment that closely defines the sequence that is aligned. + // Reassign the subsequences for alignment with a tight bound + // around the beginning and ending of each sequence, so that + // global banded alignment may be performed. + // + + // + // This section needs to be cleaned up substantially. Right now it + // copies a substring from the ref to a temp, then from the temp + // back to the ref. It may be possible to just keep one pointer per + // read to the memory that was allocated, then allow the seq + // parameter to float around. The reason for all the copying is + // that in case there is a compressed version of the genome the + // seqences must be transformed before alignment. 
+ // + + if (alignmentCandidate.qIsSubstring) { + qSeq.ReferenceSubstring(*bothQueryStrands[0], // the original sequence + alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, + alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length); + } + else { + qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, // the subsequence that the alignment points to + alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, + alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].qPos); + } + + tSeq.Copy(alignmentCandidate.tAlignedSeq, // the subsequence the alignment points to + alignmentCandidate.tPos, // ofset into the subsequence + alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos); + + T_AlignmentCandidate refinedAlignment; + + // + // When the parameter bandSize is 0, set the alignment band size + // to the drift off the diagonal, plus a little more for wiggle + // room. When the parameteris nonzero, use that as a fixed band. 
+ // + int k; + if (params.bandSize == 0) { + k = abs(drift) * 1.5; + } + else { + k = params.bandSize; + } + if (params.verbosity > 0) { + cout << "drift: " << drift << " qlen: " << alignmentCandidate.qAlignedSeq.length << " tlen: " << alignmentCandidate.tAlignedSeq.length << " k: " << k << endl; + cout << "aligning in " << k << " * " << alignmentCandidate.tAlignedSeq.length << " " << k * alignmentCandidate.tAlignedSeq.length << endl; + } + if (k < 10) { + k = 10; + } + + alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; + + VectorIndex lastSDPBlock = alignmentCandidate.blocks.size() - 1; + + if (alignmentCandidate.blocks.size() > 0) { + alignmentCandidate.tAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].tPos + + alignmentCandidate.blocks[lastSDPBlock].length + - alignmentCandidate.blocks[0].tPos); + } + else { + alignmentCandidate.tAlignedSeqLength = 0; + } + + alignmentCandidate.tPos = 0; + alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; + + if (alignmentCandidate.blocks.size() > 0) { + alignmentCandidate.qAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].qPos + + alignmentCandidate.blocks[lastSDPBlock].length + - alignmentCandidate.blocks[0].qPos); + } + else { + alignmentCandidate.qAlignedSeqLength = 0; + } + alignmentCandidate.qPos = 0; + + alignmentCandidate.blocks.clear(); + alignmentCandidate.tAlignedSeq.Free(); + alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); + alignmentCandidate.ReassignQSequence(qSeq); + + if (params.verbosity >= 2) { + cout << "refining target: " << endl; + alignmentCandidate.tAlignedSeq.PrintSeq(cout); + cout << "refining query: " << endl; + static_cast(&alignmentCandidate.qAlignedSeq)->PrintSeq(cout); + cout << endl; + } + PairwiseLocalAlign(qSeq, tSeq, k, params, alignmentCandidate, mappingBuffers, Fit); + } +} + +template +void RefineAlignments(vector &bothQueryStrands, + T_RefSequence &genome, + vector &alignmentPtrs, MappingParameters ¶ms, MappingBuffers &mappingBuffers) { + + 
+ UInt i; + for (i = 0; i < alignmentPtrs.size(); i++ ) { + RefineAlignment(bothQueryStrands, genome, *alignmentPtrs[i], params, mappingBuffers); + } + // + // It's possible the alignment references change their order after running + // the local alignments. This is made into a parameter rather than resorting + // every time so that the performance gain by resorting may be measured. + // + if (params.sortRefinedAlignments) { + std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); + } +} + +vector +SelectAlignmentsToPrint(vector alignmentPtrs, + MappingParameters & params, + const int & associatedRandInt) { + if (params.placeRandomly) {assert(params.hitPolicy.IsRandombest());} + + if (alignmentPtrs.size() == 0) {return vector({});} + + std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), + SortAlignmentPointersByScore()); + + // Apply filter criteria and hit policy. + // Shallow copy AlignmentCandidate pointers. + vector filtered; + for (auto ptr: alignmentPtrs) { + if (params.filterCriteria.Satisfy(ptr)) { + filtered.push_back(ptr); + if (int(filtered.size()) == params.nBest) break; + } + } + + return params.hitPolicy.Apply(filtered, false, associatedRandInt); +} + +// The full read is not the subread, and does not have masked off characters. 
+void PrintAlignment(T_AlignmentCandidate &alignment, + SMRTSequence &fullRead, + MappingParameters ¶ms, + AlignmentContext &alignmentContext, + ostream &outFile +#ifdef USE_PBBAM + , SMRTSequence & subread + , PacBio::BAM::IRecordWriter * bamWriterPtr +#endif + ) { + try { + if (params.printFormat == StickPrint) { + PrintAlignmentStats(alignment, outFile); + StickPrintAlignment(alignment, + (DNASequence&) alignment.qAlignedSeq, + (DNASequence&) alignment.tAlignedSeq, + outFile, + alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); + } + else if (params.printFormat == SAM) { + SAMOutput::PrintAlignment(alignment, fullRead, outFile, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch, params.allowAdjacentIndels); + } + else if (params.printFormat == BAM) { +#ifdef USE_PBBAM + BAMOutput::PrintAlignment(alignment, fullRead, subread, *bamWriterPtr, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch, params.allowAdjacentIndels); +#else + REQUIRE_PBBAM_ERROR(); +#endif + } + else if (params.printFormat == CompareXML) { + XMLOutput::Print(alignment, + (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, + outFile, + alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); + } + else if (params.printFormat == Vulgar) { + PrintAlignmentStats(alignment, outFile); + VulgarOutput::Print(alignment, outFile); + } + else if (params.printFormat == CompareSequencesParsable) { + CompareSequencesOutput::Print(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, outFile); + } + else if (params.printFormat == Interval) { + if (alignment.blocks.size() > 0) { + IntervalOutput::Print(alignment, outFile); + } + } + else if (params.printFormat == SummaryPrint) { + if (alignment.blocks.size() > 0) { + SummaryOutput::Print(alignment, outFile); + } + } + } + catch (ostream::failure f) { + cout << "ERROR writing to output file. 
The output drive may be full, or you " << endl; + cout << "may not have proper write permissions." << endl; + exit(1); + } +} + +// Print all alignments in vector alignmentPtrs +void PrintAlignments(vector alignmentPtrs, + SMRTSequence &read, + MappingParameters ¶ms, ostream &outFile, + AlignmentContext alignmentContext, +#ifdef USE_PBBAM + SMRTSequence &subread, + PacBio::BAM::IRecordWriter * bamWriterPtr, +#endif + MappingSemaphores & semaphores) { + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_wait(semaphores.writer); +#else + sem_wait(&semaphores.writer); +#endif + } + for (int i = 0; i < int(alignmentPtrs.size()); i++) { + T_AlignmentCandidate *aref = alignmentPtrs[i]; + + if (aref->blocks.size() == 0) { + + // + // If the SDP alignment finds nothing, there will be no + // blocks. This may happen if the sdp block size is larger + // than the anchor size found with the suffix array. When no + // blocks are found there is no alignment, so zero-out the + // score and continue. + // + aref->score = 0; + if (params.verbosity > 0) { + cout << "Zero blocks found for " << aref->qName << " " << aref->qAlignedSeqPos << " " << aref->tAlignedSeqPos << endl; + } + continue; + } + + // + // Configure some of the alignment context before printing. 
+ // + if (i > 0 and params.placeRandomly == false) { + alignmentContext.isPrimary = false; + } + else { + alignmentContext.isPrimary = true; + } + + if (params.printSAM or params.printBAM) { + DistanceMatrixScoreFunction editdistScoreFn(EditDistanceMatrix, 1, 1); + T_AlignmentCandidate & alignment = *alignmentPtrs[i]; + alignmentContext.editDist = ComputeAlignmentScore(alignment, + alignment.qAlignedSeq, + alignment.tAlignedSeq, + editdistScoreFn); + } + + PrintAlignment(*alignmentPtrs[i], read, + params, alignmentContext, outFile +#ifdef USE_PBBAM + , subread + , bamWriterPtr +#endif + ); + } + + if (params.nProc > 1) { +#ifdef __APPLE__ + sem_post(semaphores.writer); +#else + sem_post(&semaphores.writer); +#endif + } + +} + +void PrintAlignmentPtrs(vector & alignmentPtrs, + ostream & out) { + for(int alignmentIndex = 0; + alignmentIndex < int(alignmentPtrs.size()); + alignmentIndex++) { + out << "["<< alignmentIndex << "/" + << alignmentPtrs.size() << "]" << endl; + T_AlignmentCandidate *alignment = alignmentPtrs[alignmentIndex]; + alignment->Print(out); + } + out << endl; +} + + +void PrintUnaligned(const SMRTSequence & unalignedRead, + ostream & unalignedFilePtr, + const bool noPrintUnalignedSeqs) { + if (noPrintUnalignedSeqs) { + string s = unalignedRead.GetTitle(); + SMRTTitle st(s); + if (st.isSMRTTitle) + unalignedFilePtr << st.ToString() << endl; + else + //size_t pos = s.rfind("/"); + //if (pos != string::npos) + // unalignedFilePtr << s.substr(0, pos) << std::endl; + //else + unalignedFilePtr << s << std::endl; + } else + unalignedRead.PrintSeq(unalignedFilePtr); +} + + +// Print all alignments for subreads in allReadAlignments. +// Input: +// allReadAlignments - contains a set of subreads, each of which +// is associated with a group of alignments. +// alignmentContext - an alignment context of each subread used +// for printing in SAM format. +// params - mapping parameters. +// Output: +// outFilePtr - where to print alignments for subreads. 
+// unalignedFilePtr - where to print sequences for unaligned subreads. +void PrintAllReadAlignments(ReadAlignments & allReadAlignments, + AlignmentContext & alignmentContext, + ostream & outFilePtr, + ostream & unalignedFilePtr, + MappingParameters & params, + vector & subreads, +#ifdef USE_PBBAM + PacBio::BAM::IRecordWriter * bamWriterPtr, +#endif + MappingSemaphores & semaphores) +{ + int subreadIndex; + int nAlignedSubreads = allReadAlignments.GetNAlignedSeq(); + + // + // Initialize the alignemnt context with information applicable to SAM output. + // + alignmentContext.alignMode = allReadAlignments.alignMode; + for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { + if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { + alignmentContext.numProperlyAlignedSubreads++; + } + } + + if (alignmentContext.numProperlyAlignedSubreads == int(allReadAlignments.subreadAlignments.size())) { + alignmentContext.allSubreadsProperlyAligned = true; + } + alignmentContext.nSubreads = nAlignedSubreads; + + for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { + alignmentContext.subreadIndex = subreadIndex; + if (subreadIndex < nAlignedSubreads-1 and allReadAlignments.subreadAlignments[subreadIndex+1].size() > 0) { + alignmentContext.nextSubreadPos = allReadAlignments.subreadAlignments[subreadIndex+1][0]->QAlignStart(); + alignmentContext.nextSubreadDir = allReadAlignments.subreadAlignments[subreadIndex+1][0]->qStrand; + alignmentContext.rNext = allReadAlignments.subreadAlignments[subreadIndex+1][0]->tName; + alignmentContext.hasNextSubreadPos = true; + } else { + alignmentContext.nextSubreadPos = 0; + alignmentContext.nextSubreadDir = 0; + alignmentContext.rNext = ""; + alignmentContext.hasNextSubreadPos = false; + } + SMRTSequence * sourceSubread = &(allReadAlignments.subreads[subreadIndex]); + if (subreads.size() == allReadAlignments.subreads.size()) { + sourceSubread = &subreads[subreadIndex]; + } + if 
(allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { + PrintAlignments(allReadAlignments.subreadAlignments[subreadIndex], + allReadAlignments.subreads[subreadIndex], + // for these alignments + params, outFilePtr,//*mapData->outFilePtr, + alignmentContext, +#ifdef USE_PBBAM + *sourceSubread, + bamWriterPtr, +#endif + semaphores); + } else { + // + // Print the unaligned sequences. + // + if (params.printUnaligned == true) { + if (params.nProc == 1) { + PrintUnaligned(*sourceSubread, + unalignedFilePtr, + params.noPrintUnalignedSeqs); + } + else { +#ifdef __APPLE__ + sem_wait(semaphores.unaligned); +#else + sem_wait(&semaphores.unaligned); +#endif + PrintUnaligned(*sourceSubread,//subreads[subreadIndex], + unalignedFilePtr, + params.noPrintUnalignedSeqs); +#ifdef __APPLE__ + sem_post(semaphores.unaligned); +#else + sem_post(&semaphores.unaligned); +#endif + } // End of nproc > 1. + } // End of printing unaligned sequences. + } // End of finding no alignments for the subread with subreadIndex. + } // End of printing and processing alignmentContext for each subread. +} diff -Nru blasr-0~20151014+git8e668be/iblasr/MappingBuffers.hpp blasr-5.3/iblasr/MappingBuffers.hpp --- blasr-0~20151014+git8e668be/iblasr/MappingBuffers.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/MappingBuffers.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,78 @@ +// Author: Mark Chaisson +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + + +// +// Define a list of buffers that are meant to grow to high-water +// marks, and not shrink down past that. The memory is reused rather +// than having multiple calls to new. 
+// +class MappingBuffers { +public: + vector hpInsScoreMat, insScoreMat; + vector kbandScoreMat; + vector hpInsPathMat, insPathMat; + vector kbandPathMat; + vector scoreMat; + vector pathMat; + vector affineScoreMat; + vector affinePathMat; + vector matchPosList; + vector rcMatchPosList; + vector > globalChainEndpointBuffer; + vector sdpFragmentSet, sdpPrefixFragmentSet, sdpSuffixFragmentSet; + TupleList sdpCachedTargetTupleList; + TupleList sdpCachedTargetPrefixTupleList; + TupleList sdpCachedTargetSuffixTupleList; + std::vector sdpCachedMaxFragmentChain; + vector probMat; + vector optPathProbMat; + vector lnSubPValueMat; + vector lnInsPValueMat; + vector lnDelPValueMat; + vector lnMatchPValueMat; + vector clusterNumBases; + ClusterList clusterList; + ClusterList revStrandClusterList; + + void Reset(void); +}; + + +inline void MappingBuffers::Reset(void) { + vector().swap(hpInsScoreMat); + vector().swap(insScoreMat); + vector().swap(kbandScoreMat); + vector().swap(hpInsPathMat); + vector().swap(insPathMat); + vector().swap(kbandPathMat); + vector().swap(scoreMat); + vector().swap(pathMat); + vector().swap(matchPosList); + vector().swap(rcMatchPosList); + vector >().swap(globalChainEndpointBuffer); + vector().swap(sdpFragmentSet); + vector().swap(sdpPrefixFragmentSet); + vector().swap(sdpSuffixFragmentSet); + sdpCachedTargetTupleList.Reset(); + sdpCachedTargetPrefixTupleList.Reset(); + sdpCachedTargetSuffixTupleList.Reset(); + vector().swap(sdpCachedMaxFragmentChain); + vector().swap(probMat); + vector().swap(optPathProbMat); + vector().swap(lnSubPValueMat); + vector().swap(lnInsPValueMat); + vector().swap(lnDelPValueMat); + vector().swap(lnMatchPValueMat); + vector().swap(clusterNumBases); +} diff -Nru blasr-0~20151014+git8e668be/iblasr/MappingIPC.h blasr-5.3/iblasr/MappingIPC.h --- blasr-0~20151014+git8e668be/iblasr/MappingIPC.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/MappingIPC.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,101 @@ +#pragma 
once + +#include + +#include "MappingParameters.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* + * This structure contains pointers to all required data structures + * for mapping reads to a suffix array and evaluating the significance + * of the matches. + */ + +template +class MappingData { +public: + T_SuffixArray *suffixArrayPtr; + BWT *bwtPtr; + T_GenomeSequence *referenceSeqPtr; + SequenceIndexDatabase *seqDBPtr; + TupleCountTable *ctabPtr; + MappingParameters params; + MappingMetrics metrics; + RegionTable *regionTablePtr; + ReaderAgglomerate *reader; + ostream *outFilePtr; + ostream *unalignedFilePtr; + ostream *anchorFilePtr; + ostream *clusterFilePtr; + ostream *lcpBoundsOutPtr; + + // Declare a semaphore for blocking on reading from the same hdhf file. + + void ShallowCopySuffixArray(T_SuffixArray &dest) { + dest.index = suffixArrayPtr->index; + dest.length = suffixArrayPtr->length; + dest.target = suffixArrayPtr->target; + dest.startPosTable = suffixArrayPtr->startPosTable; + dest.endPosTable = suffixArrayPtr->endPosTable; + dest.lookupTableLength = suffixArrayPtr->lookupTableLength; + dest.lookupPrefixLength = suffixArrayPtr->lookupPrefixLength; + dest.tm = suffixArrayPtr->tm; + dest.deleteStructures = false; + // dest.useLCPTable = suffixArrayPtr->useLCPTable; + } + + void ShallowCopySequenceIndexDatabase(SequenceIndexDatabase &dest) { + dest.nSeqPos = seqDBPtr->nSeqPos; + dest.seqStartPos = seqDBPtr->seqStartPos; + dest.nameLengths = seqDBPtr->nameLengths; + dest.names = seqDBPtr->names; + dest.deleteStructures = false; + } + + void ShallowCopyTupleCountTable( TupleCountTable &dest) { + dest.countTable = ctabPtr->countTable; + dest.countTableLength = ctabPtr->countTableLength; + dest.nTuples = ctabPtr->nTuples; + dest.tm = ctabPtr->tm; + dest.deleteStructures = false; + } + + void ShallowCopyReferenceSequence(T_GenomeSequence &refSeq) { + 
refSeq.ShallowCopy(*referenceSeqPtr); + refSeq.deleteOnExit = false; + } + + void Initialize(T_SuffixArray *saP, T_GenomeSequence *refP, + SequenceIndexDatabase *seqDBP, + TupleCountTable *ctabP, + MappingParameters ¶msP, + ReaderAgglomerate *readerP, + RegionTable *regionTableP, + ostream *outFileP, + ostream *unalignedFileP, + ostream *anchorFilePtrP, + ostream *clusterFilePtrP=NULL) { + suffixArrayPtr = saP; + referenceSeqPtr = refP; + seqDBPtr = seqDBP; + ctabPtr = ctabP; + regionTablePtr = regionTableP; + params = paramsP; + reader = readerP; + outFilePtr = outFileP; + unalignedFilePtr = unalignedFileP; + anchorFilePtr = anchorFilePtrP; + clusterFilePtr= clusterFilePtrP; + } +}; diff -Nru blasr-0~20151014+git8e668be/iblasr/MappingParameters.h blasr-5.3/iblasr/MappingParameters.h --- blasr-0~20151014+git8e668be/iblasr/MappingParameters.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/MappingParameters.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,733 @@ +#pragma once + +#define REQUIRE_PBBAM_ERROR() \ +assert("blasr must be compiled with lib pbbam to perform IO on bam." == 0); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class MappingParameters { +public: + // + // Parameters for global substitution, insertion, and deletion priors. 
+ // + float minFractionToBeConsideredOverlapping; + float indelRate; + float minRatio; + int indel; + int idsIndel; + int sdpIndel; + int sdpIns, sdpDel; + int insertion; + int deletion; + int mismatch; + int sdpTupleSize; + int match; + int showAlign; + bool useScoreCutoff; + int maxScore; + int argi; + int nProc; + int globalChainType; + SAMOutput::Clipping clipping; + string clippingString; + QVScale qvScaleType; + vector readsFileNames; // = queryFileNames, genomeFileName + vector queryFileNames; + vector scrapsFileNames; // needed for noSplitSubread flag in PBBAM, deriived from queryFileNames + string genomeFileName; + // Query file type: FASTA/FASTQ/HDF*/PBBAM, + // Note that mixed query file types is not allowed. + FileType queryFileType; + // Query read type, SUBREAD, CCS or UNKNOWN + // Note that mixed read types is not allowed. + ReadType::ReadTypeEnum queryReadType; + vector regionTableFileNames; + vector ccsFofnFileNames; + string tupleListName; + string posTableName; + string outFileName; + string suffixArrayFileName; + string bwtFileName; + string indexFileName; + string anchorFileName; + string clusterFileName; + int nBest; + int printWindow; + int doCondense; + int do4BitComp; + int cutoff; + int useSuffixArray; + int useBwt; + int useReverseCompressIndex; + int useTupleList; + int useSeqDB; + string seqDBName; + int useCountTable; + string countTableName; + int minMatchLength; + int listTupleSize; + int printFormat; + int maxExpand, minExpand; + int startRead; + int stride; + int pValueType; + float subsample; + int sortRefinedAlignments; + int verbosity; + bool printSAM; + bool cigarUseSeqMatch; + bool printBAM; + bool sam_via_bam; // for SAM output via pbbam using IRecordWriter + bool storeMapQV; + bool useRandomSeed; + int randomSeed; + bool placeRandomly; + bool printHeader; + bool samplePaths; + bool warp, nowarp; + //bool usePrefixLookupTable; + bool doSensitiveSearch; + bool emulateNucmer; + bool refineBetweenAnchorsOnly; + bool byAdapter; 
+ bool extendDenovoCCSSubreads; + TupleMetrics saTupleMetrics; + TupleMetrics sdpTupleMetrics; + int lookupTableLength; + //int branchQualityThreshold; + int qualityLowerCaseThreshold; + AnchorParameters anchorParameters; + int readsFileIndex; + //int numBranches; + bool storeMetrics; + bool ignoreQualities; + bool extendFrontAlignment; + bool extendAlignments; + int maxExtendDropoff; + int minReadLength; + int maxReadLength; + int minSubreadLength; + int minRawSubreadScore; + int minAvgQual; + bool overlap; + bool advanceHalf; + int advanceExactMatches; + float approximateMaxInsertionRate; + float minPctSimilarity; // [0, 100] + float minPctAccuracy; // [0, 100] + bool refineAlignments; + int nCandidates; + bool doGlobalAlignment; + string tempDirectory; + bool useTitleTable; + string titleTableName; + bool readSeparateRegionTable; + bool readSeparateCcsFofn; + string regionTableFileName; + string ccsFofnFileName; + //float averageMismatchScore; + bool mapSubreadsSeparately; + bool concordant; + bool refineConcordantAlignments; + int flankSize; + bool useRegionTable; + bool useHQRegionTable; + bool printUnaligned; + bool noPrintUnalignedSeqs; // print unaligned reads names only. 
+ string unalignedFileName; + string metricsFileName; + string lcpBoundsFileName; + string fullMetricsFileName; + bool printSubreadTitle; + bool useCcs; + bool useAllSubreadsInCcs; + bool useCcsOnly; + bool detailedSDPAlignment, nouseDetailedSDPAlignment; + int chunkSize; + int sdpFilterType; + bool useGuidedAlign; + int guidedAlignBandSize; + int bandSize; + int extendBandSize; + bool useQVScore; + int scoreType; + bool printVerboseHelp; + bool printDiscussion; + float sdpBypassThreshold; + bool computeAlignProbability; + float qvMatchWeight; + float qvMismatchWeight; + float qvInsWeight; + float qvDelWeight; + float readAccuracyPrior; + bool printVersion; + int substitutionPrior; + int globalDeletionPrior; + bool outputByThread; + int recurseOver; + bool allowAdjacentIndels; + bool separateGaps; + string scoreMatrixString; + bool printDotPlots; + bool preserveReadTitle; + bool forwardOnly; + bool printOnlyBest; + bool affineAlign; + int affineExtend; + int affineOpen; + bool scaleMapQVByNumSignificantClusters; + int limsAlign; + string holeNumberRangesStr; + Ranges holeNumberRanges; + int minAlnLength; + bool printSAMQV; + vector samQV; + SupplementalQVList samQVList; + bool fastMaxInterval; + bool aggressiveIntervalCut; + bool fastSDP; + string concordantTemplate; + bool concordantAlignBothDirections; + FilterCriteria filterCriteria; + string hitPolicyStr; + HitPolicy hitPolicy; + bool enableHiddenPaths; + + void Init() { + qvMatchWeight = 1.0; + qvMismatchWeight = 1.0; + qvInsWeight = 1.0; + qvDelWeight = 1.0; + minFractionToBeConsideredOverlapping = 0.75; + minRatio = 0.25; + indelRate = 0.3; + indel = 5; + insertion = 4; // asymmetric indel parameters + deletion = 5; + idsIndel = 15; + sdpIndel = 5; + sdpIns = 5; + sdpDel = 10; + sdpTupleSize = 11; + match = 0; + mismatch = 0; + showAlign = 1; + useScoreCutoff = false; + maxScore = -200; + argi = 1; + nProc = 1; + readsFileNames.clear(); + queryFileNames.clear(); + genomeFileName = ""; + queryReadType = 
ReadType::UNKNOWN; + queryFileType = FileType::None; + tupleListName = ""; + posTableName = ""; + suffixArrayFileName= ""; + bwtFileName = ""; + indexFileName = ""; + anchorFileName = ""; + outFileName = ""; + nBest = 10; + nCandidates = 10; + printWindow = 0; + doCondense = 0; + do4BitComp = 0; + pValueType = 0; + cutoff = 0; + useSuffixArray = 0; + useBwt = 0; + useReverseCompressIndex = 0; + useTupleList = 0; + useSeqDB = 0; + seqDBName = ""; + useCountTable = 0; + countTableName = ""; + lookupTableLength = 8; + anchorParameters.minMatchLength = minMatchLength = 12; + printFormat = SummaryPrint; + maxExpand = 0; + minExpand = 0; + startRead = 0; + stride = 1; + subsample = 1.1; + listTupleSize = 6; + sortRefinedAlignments = 1; + anchorParameters.verbosity = verbosity = 0; + saTupleMetrics.Initialize(listTupleSize); + sdpTupleMetrics.Initialize(sdpTupleSize); + qualityLowerCaseThreshold = 0; + anchorParameters.branchQualityThreshold = 0; + readsFileIndex = 0; + printSAM = false; + printBAM = false; + sam_via_bam = false; + useRandomSeed = false; + randomSeed = 0; + placeRandomly = false; + samplePaths = false; + nowarp = false; + storeMapQV = true; + warp = true; + extendDenovoCCSSubreads = false; + storeMetrics = false; + ignoreQualities = true; + extendFrontAlignment = false; + extendAlignments = false; + maxExtendDropoff = 10; + minReadLength = 50; + maxReadLength = 0; // means no max read length + minSubreadLength = 0; + minRawSubreadScore = -1; // raw subread score in region table should be in range [0, 1000]. 
+ minAvgQual = 0; + overlap = false; + advanceHalf = false; + refineAlignments = true; + anchorParameters.advanceExactMatches = advanceExactMatches = 0; + approximateMaxInsertionRate = 1.30; + minPctSimilarity = 0; + minPctAccuracy = 0; + doGlobalAlignment = false; + tempDirectory = ""; + useTitleTable = false; + titleTableName = ""; + readSeparateRegionTable = false; + readSeparateCcsFofn = false; + regionTableFileName = ""; + ccsFofnFileName = ""; + mapSubreadsSeparately=true; + concordant=false; + refineConcordantAlignments=false; + flankSize=40; + useRegionTable = true; + useHQRegionTable=true; + printUnaligned = false; + unalignedFileName = ""; + noPrintUnalignedSeqs = false; + globalChainType = 0; + metricsFileName = ""; + fullMetricsFileName = ""; + doSensitiveSearch = false; + emulateNucmer = false; + refineBetweenAnchorsOnly = false; + printSubreadTitle = true; + detailedSDPAlignment = true; + nouseDetailedSDPAlignment = false; + useCcs = false; + useCcsOnly = false; + useAllSubreadsInCcs = false; + chunkSize = 10000000; + sdpFilterType = 0; + anchorParameters.stopMappingOnceUnique = true; + useGuidedAlign = true; + bandSize = 0; + extendBandSize = 10; + guidedAlignBandSize = 10; + useQVScore = false; + printVerboseHelp = false; + printDiscussion = false; + sdpBypassThreshold = 1000000.0; + scoreType = 0; + byAdapter = false; + qvScaleType = PHRED; + printHeader = false; + computeAlignProbability = false; + readAccuracyPrior = 0.85; + printVersion = false; + clipping = SAMOutput::none; + clippingString = ""; + substitutionPrior = 20; + globalDeletionPrior = 13; + outputByThread = false; + recurseOver = 10000; + allowAdjacentIndels = false; + separateGaps = false; + scoreMatrixString = ""; + printDotPlots = false; + preserveReadTitle = false; + forwardOnly = false; + printOnlyBest = false; + affineAlign = false; + affineExtend = 0; + affineOpen = 10; + scaleMapQVByNumSignificantClusters = false; + limsAlign = 0; + holeNumberRangesStr = ""; + minAlnLength = 
0; + printSAMQV = false; + cigarUseSeqMatch = false; + samQV.clear(); + samQVList.clear(); + fastMaxInterval = false; + aggressiveIntervalCut = false; + fastSDP = false; + concordantTemplate = "mediansubread"; // typicalsubread or longestsubread + concordantAlignBothDirections = false; + + hitPolicyStr = "all"; + ResetFilterAndHit(); + enableHiddenPaths = false; //turn off hidden paths. + } + + MappingParameters() + : filterCriteria(0, 0, 0, false, Score(0, ScoreSign::NEGATIVE)) + , hitPolicy("all", ScoreSign::NEGATIVE) + { + Init(); + } + + void MakeSane() { + // Expand FOFN + FileOfFileNames::ExpandFileNameList(readsFileNames); + + // Must have at least a query and a genome + if (readsFileNames.size() <= 1) { + cout << "Error, you must provide at least one reads file and a genome file." < string.scraps.bam substitute subreads to scraps + // 2. string.bam -> string.scraps.bam insert .scraps before .bam + // TODO loop over query check for each + // not needed for xml since scraps specified explicetely + // + if (not mapSubreadsSeparately && (queryFileType == FileType::PBBAM) ) { + const string dsubdb = ".subreads.bam"; + const string dbam = ".bam"; + // loop over all subread files and fill the vector or scraps files + for (size_t i = 0; i < queryFileNames.size(); i++) { + scrapsFileNames.push_back(queryFileNames[i]); + size_t dsubdb_pos = scrapsFileNames[i].find(dsubdb); // find .subreads.bam + if (dsubdb_pos != std::string::npos) { + // TODO check that .subreads.bam is LAST occurence + // replace subreads.bam with scraps.bam + scrapsFileNames[i].replace(dsubdb_pos,dsubdb.length(),".scraps.bam"); + } + else { + // insert scraps before .bam" + // actually we can just replace last 4 characters + // fix later + size_t dbam_pos = scrapsFileNames[0].find(dbam); // find .bam + scrapsFileNames[i].replace(dbam_pos,dbam.length(),".scraps.bam"); + } + } + } + + // -useQuality can not be used in combination with a fasta input + if (!ignoreQualities) { + if (queryFileType == 
FileType::Fasta) { + cout<<"ERROR, you can not use -useQuality option when any of the input reads files are in multi-fasta format."< 1) { + cerr << "Warning: using new filter method for SDP alignments. The parameter is " << endl + << "either 0 or 1, but " << sdpFilterType << " was specified." << endl; + sdpFilterType = 1; + } + if (sdpFilterType == 0) { + detailedSDPAlignment = true; + nouseDetailedSDPAlignment = false; + } + if (detailedSDPAlignment == false) { + sdpFilterType = 1; + } + if (useGuidedAlign == true and bandSize == 0) { + bandSize = 16; + } + anchorParameters.minMatchLength = minMatchLength; + if (suffixArrayFileName != "") { + useSuffixArray = true; + } + if (bwtFileName != "") { + useBwt = true; + } + if (useBwt and useSuffixArray) { + cout << "ERROR, sa and bwt must be used independently." << endl; + exit(1); + } + if (countTableName != "") { + useCountTable = true; + } + if (metricsFileName != "" or fullMetricsFileName != "") { + storeMetrics = true; + } + if (useCcsOnly) { + useCcs = true; + } + if (useAllSubreadsInCcs == true) { + useCcs = true; + } + if (titleTableName != "") { + useTitleTable = true; + } + if (unalignedFileName != "") { + printUnaligned = true; + } + if (regionTableFileName != "") { + useRegionTable = true; + readSeparateRegionTable = true; + } + if (ccsFofnFileName != "") { + readSeparateCcsFofn = true; + } + if (nouseDetailedSDPAlignment == true) { + detailedSDPAlignment = false; + } + if (nouseDetailedSDPAlignment == false) { + detailedSDPAlignment = true; + } + if (anchorParameters.maxLCPLength != 0 and int(anchorParameters.maxLCPLength) < int(anchorParameters.minMatchLength)) { + cerr << "ERROR: maxLCPLength is less than minLCPLength, which will result in no hits." << endl; + } + if (subsample < 1 and stride > 1) { + cout << "ERROR, subsample and stride must be used independently." 
<< endl; + exit(1); + } + + + if (emulateNucmer) { + SetEmulateNucmer(); + } + + if (randomSeed != 0) { + useRandomSeed = true; + } + // + // Parse the clipping. + // + if (clippingString == "soft") { + clipping = SAMOutput::soft; + } + else if (clippingString == "hard") { + clipping = SAMOutput::hard; + } + else if (clippingString == "none") { + clipping = SAMOutput::none; + } + else if (clippingString == "subread") { + clipping = SAMOutput::subread; + } + else if (clippingString != "") { + cout << "ERROR, clipping should either be soft, hard, or none." << endl; + exit(1); + } + + if (printSAM) { // since sam is printed via bam we need to use ifndef USE_PBBAM here +#ifndef USE_PBBAM + REQUIRE_PBBAM_ERROR(); +#else + printSAM = false; + printBAM = true; + sam_via_bam = true; // set to true for constructors and to avoid entering if (printBAM + cigarUseSeqMatch = true; // ALWAYS true for BAM + printFormat = BAM; // Not sure for sam_via_bam + samQVList.SetDefaultQV(); + printSAMQV = true; + if (clipping != SAMOutput::soft) { + // Only support two clipping methods: soft or subread. + clipping = SAMOutput::subread; + } + // Turn on fa fa -> bam pipe + /* + if (queryFileType != FileType::PBBAM and queryFileType != FileType::PBDATASET and not enableHiddenPaths) { + // bax|fasta|fastq -> bam paths are turned off by default + cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM or DATASET files." << endl; + exit(1); + } + */ + if (outFileName == "") { + cout << "ERROR, SAM output file must be specified." << endl; + exit(1); + } + // VR Need to see what happens if printing SAM + // VR Check with Derek regarding sam_via_bam + if (outputByThread) { + cout << "ERROR, could not output alignments by threads in BAM format." 
<< endl; + exit(1); + } +#endif + } + + if (printBAM && !sam_via_bam) { // Need to check settings for SAM, +#ifndef USE_PBBAM + REQUIRE_PBBAM_ERROR(); +#else + cigarUseSeqMatch = true; // ALWAYS true for BAM + printFormat = BAM; + printSAM = false; + samQVList.SetDefaultQV(); + printSAMQV = true; + if (clipping != SAMOutput::soft) { + // Only support two clipping methods: soft or subread. + clipping = SAMOutput::subread; + } + // Turn on fa fa -> bam pipe + /* + if (queryFileType != FileType::PBBAM and queryFileType != FileType::PBDATASET and not enableHiddenPaths) { + // bax|fasta|fastq -> bam paths are turned off by default + cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM or DATASET files." << endl; + exit(1); + } + */ + if (outFileName == "") { + cout << "ERROR, BAM output file must be specified." << endl; + exit(1); + } + // VR Need to see what happens if printing SAM + // VR Check with Derek regarding sam_via_bam + if (outputByThread) { + cout << "ERROR, could not output alignments by threads in BAM format." << endl; + exit(1); + } +#endif + } + + if (limsAlign != 0) { + mapSubreadsSeparately = false; + forwardOnly = true; + } + + if (holeNumberRangesStr.size() > 0) { + if (not holeNumberRanges.setRanges(holeNumberRangesStr)) { + cout << "ERROR, could not parse hole number ranges: " + << holeNumberRangesStr << "." << endl; + exit(1); + } + } + + if (printSAMQV) { + if (samQV.size() == 0) { + samQVList.SetDefaultQV(); + } + else { + samQVList.UseQV(samQV); + } + } + + if (minRawSubreadScore > 1000) { + cout << "ERROR, minimum raw subread score should be less than 1000." << endl; + exit(1); + } + if (minRawSubreadScore != -1 and byAdapter) { + cout << "ERROR, minRawSubreadScore and byAdapter should not be used together." 
<< endl; + exit(1); + } + // Determine query read type + queryReadType = DetermineQueryReadType(); + // Pass verbosity + anchorParameters.verbosity = verbosity; + + // Set filter criteria and hit policy + ResetFilterAndHit(); + } + void ResetFilterAndHit(void) { + filterCriteria = FilterCriteria(minAlnLength, minPctSimilarity, + minPctAccuracy, true, + Score(static_cast(maxScore), ScoreSign::NEGATIVE)); + hitPolicy = HitPolicy(hitPolicyStr, ScoreSign::NEGATIVE); + } + + ReadType::ReadTypeEnum DetermineQueryReadType() { + if (useCcsOnly or queryFileType == FileType::HDFCCSONLY) { + return ReadType::CCS; + } + if (queryFileType == FileType::PBBAM) { + // Read type in BAM may be CCS, SUBREAD, HQREGION or POLYMERASE. + // Determine it later. + return ReadType::UNKNOWN; + } + if (mapSubreadsSeparately) { + return ReadType::SUBREAD; + } else { + if (useHQRegionTable) { + return ReadType::HQREGION; + } else { + return ReadType::POLYMERASE; + } + } + } + + void SetEmulateNucmer() { + anchorParameters.stopMappingOnceUnique = true; + anchorParameters.advanceExactMatches = 30; + anchorParameters.maxAnchorsPerPosition = 1; + sdpBypassThreshold = 0.75; + sdpTupleSize = 15; + anchorParameters.minMatchLength = 30; + useGuidedAlign = true; + refineAlignments = false; + } + + void SetForSensitivity() { + advanceExactMatches = 0; + anchorParameters.numBranches = 1; + anchorParameters.maxAnchorsPerPosition = 10000; + } +}; diff -Nru blasr-0~20151014+git8e668be/iblasr/MappingSemaphores.h blasr-5.3/iblasr/MappingSemaphores.h --- blasr-0~20151014+git8e668be/iblasr/MappingSemaphores.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/MappingSemaphores.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include + +#ifndef __APPLE__ +class MappingSemaphores { + public: + sem_t reader; + sem_t writer; + sem_t unaligned; + sem_t hitCluster; + + void InitializeAll() { + sem_init(&reader, 0, 1); + sem_init(&writer, 0, 1); + sem_init(&unaligned, 
0, 1); + sem_init(&hitCluster, 0, 1); + } +}; +#else +class MappingSemaphores { + public: + sem_t *reader; + sem_t *writer; + sem_t *unaligned; + sem_t *hitCluster; + void InitializeAll() { + reader = sem_open("/reader", O_CREAT, 0644, 1); + writer = sem_open("/writer", O_CREAT, 0644, 1); + unaligned = sem_open("/unaligned", O_CREAT, 0644, 1); + hitCluster = sem_open("/hitCluster", O_CREAT, 0644, 1); + } +}; +#endif diff -Nru blasr-0~20151014+git8e668be/iblasr/ReadAlignments.hpp blasr-5.3/iblasr/ReadAlignments.hpp --- blasr-0~20151014+git8e668be/iblasr/ReadAlignments.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/ReadAlignments.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,160 @@ +// Author: Mark Chaisson +#pragma once + +#include +#include +#include +#include +#include + +using namespace std; + +class ReadAlignments { +public: +/* + This class stores the alignments from a read. A read may be + aligned in several different modes: + 1. Fullread - Treat the read as a unit from start to end + 2. Subread - Align each subread independently + 3. CCSDeNovo - Only align the CCS sequence from a read + 4. CCSAllPass - Align the de novo ccs sequences and then the + subreads to where the denovo ccs aligned. + 5. CCSFullPass - Same as allpass, except using only complete + subreads. + 6. ZmwSubreads - Align subreads of each zmw to where the longest + subread of the zmw aligned to. + + The alignments are a raggad array of n sequences; n is 1 for cases + 1 and 3, the number of subreads for cases 2 and 4, and the number + of full length passes for case 5. + + A ReadAligments class must only have alignments for a single type + of read in it. 
+ +*/ + vector > subreadAlignments; + vector subreads; + AlignMode alignMode; + SMRTSequence read; + + inline int GetNAlignedSeq(); + + inline bool AllSubreadsHaveAlignments(); + + inline void Clear(); + + inline void Resize(int nSeq); + + inline void CheckSeqIndex(int seqIndex); + + inline void SetSequence(int seqIndex, SMRTSequence &seq); + + inline void AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr); + + inline void AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs); + + // Copy all T_AlignmentCandidate objects (to which subreadAlignment[seqIndex] + // is pointing) to newly created objects, and then return pointers to the new + // objects. + inline vector CopySubreadAlignments(int seqIndex); + + inline void Print(ostream &out=cout); + + inline ~ReadAlignments(); +}; + + +inline int ReadAlignments::GetNAlignedSeq() { + return subreadAlignments.size(); +} + +inline bool ReadAlignments::AllSubreadsHaveAlignments() { + int i, nAlignedSeq; + nAlignedSeq = subreadAlignments.size(); + for (i = 0; i < nAlignedSeq; i++) { + if (subreadAlignments[i].size() == 0) { + return false; + } + } + return true; +} + +inline void ReadAlignments::Clear() { + int i; + int nAlignedSeq; + for (i = 0, nAlignedSeq = subreadAlignments.size(); i < nAlignedSeq; i++) { + int nAlignments; + int a; + for (a = 0, nAlignments = subreadAlignments[i].size(); a < nAlignments; a++) { + delete subreadAlignments[i][a]; + } + subreadAlignments[i].clear(); + } + + for (i = 0, nAlignedSeq = subreads.size(); i< nAlignedSeq; i++) { + subreads[i].Free(); + } + subreadAlignments.clear(); + read.Free(); +} + +inline void ReadAlignments::Resize(int nSeq) { + subreadAlignments.resize(nSeq); + subreads.resize(nSeq); +} + +inline void ReadAlignments::CheckSeqIndex(int seqIndex) { + if ( seqIndex < 0 or seqIndex >= int(subreads.size()) ) { + cout << "ERROR, adding a sequence to an unallocated position." 
+ << endl; + assert(0); + } +} + +inline void ReadAlignments::SetSequence(int seqIndex, SMRTSequence &seq) { + CheckSeqIndex(seqIndex); + subreads[seqIndex] = seq; +} + +inline void ReadAlignments::AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr) { + CheckSeqIndex(seqIndex); + subreadAlignments[seqIndex].push_back(alignmentPtr); +} + +inline void ReadAlignments::AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs) { + CheckSeqIndex(seqIndex); + subreadAlignments[seqIndex].insert(subreadAlignments[seqIndex].end(), seqAlignmentPtrs.begin(), seqAlignmentPtrs.end()); +} + +inline vector ReadAlignments::CopySubreadAlignments(int seqIndex) { + vector ret; + for (int i=0; iPrint(out); + } + } + out << " read: "; + read.Print(out); + out << endl << endl; +} + +inline ReadAlignments::~ReadAlignments() { + read.Free(); +} diff -Nru blasr-0~20151014+git8e668be/iblasr/RegisterBlasrOptions.h blasr-5.3/iblasr/RegisterBlasrOptions.h --- blasr-0~20151014+git8e668be/iblasr/RegisterBlasrOptions.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/iblasr/RegisterBlasrOptions.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,537 @@ +#pragma once +/* + * ============================================================================ + * + * Filename: RegisterOptions.hpp + * + * Description: + * + * Version: 1.0 + * Created: 04/29/2015 04:48:26 PM + * Revision: none + * Compiler: gcc + * + * Author: Yuan Li (yli), yli@pacificbiosciences.com + * Company: Pacific Biosciences + * + * ============================================================================ + */ + +#include +#include +#include + +#include "MappingParameters.h" +#include "RegisterFilterOptions.h" +using namespace std; + +void RegisterBlasrOptions(CommandLineParser & clp, MappingParameters & params) { + int trashbinInt; + float trashbinFloat; + bool trashbinBool; + clp.RegisterStringOption("-sa", ¶ms.suffixArrayFileName, ""); + clp.RegisterStringOption("-ctab", ¶ms.countTableName, "" ); + 
clp.RegisterStringOption("-regionTable", ¶ms.regionTableFileName, ""); + clp.RegisterStringOption("-ccsFofn", ¶ms.ccsFofnFileName, ""); + clp.RegisterIntOption("-bestn", (int*) ¶ms.nBest, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-limsAlign", ¶ms.limsAlign, "", CommandLineParser::PositiveInteger); + clp.RegisterFlagOption("-printOnlyBest", ¶ms.printOnlyBest, ""); + clp.RegisterFlagOption("-outputByThread", ¶ms.outputByThread, ""); + clp.RegisterFlagOption("-rbao", ¶ms.refineBetweenAnchorsOnly, ""); + clp.RegisterFlagOption("-onegap", ¶ms.separateGaps, ""); + clp.RegisterFlagOption("-allowAdjacentIndels", ¶ms.allowAdjacentIndels, "", false); + clp.RegisterFlagOption("-placeRepeatsRandomly", ¶ms.placeRandomly, ""); + clp.RegisterIntOption("-randomSeed", ¶ms.randomSeed, "", CommandLineParser::Integer); + clp.RegisterFlagOption("-extend", ¶ms.extendAlignments, ""); + clp.RegisterIntOption("-branchExpand", ¶ms.anchorParameters.branchExpand, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-maxExtendDropoff", ¶ms.maxExtendDropoff, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-nucmer", ¶ms.emulateNucmer, ""); + clp.RegisterIntOption("-maxExpand", ¶ms.maxExpand, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-minExpand", ¶ms.minExpand, "", CommandLineParser::NonNegativeInteger); + clp.RegisterStringOption("-seqdb", ¶ms.seqDBName, ""); + clp.RegisterStringOption("-anchors", ¶ms.anchorFileName, ""); + clp.RegisterStringOption("-clusters", ¶ms.clusterFileName, ""); + clp.RegisterFlagOption("-samplePaths", (bool*) ¶ms.samplePaths, ""); + clp.RegisterFlagOption("-noStoreMapQV", ¶ms.storeMapQV, ""); + clp.RegisterFlagOption("-nowarp", (bool*) ¶ms.nowarp, ""); + clp.RegisterFlagOption("-guidedAlign", (bool*)¶ms.useGuidedAlign, ""); + clp.RegisterFlagOption("-useGuidedAlign", (bool*)&trashbinBool, ""); + clp.RegisterFlagOption("-noUseGuidedAlign", (bool*)¶ms.useGuidedAlign, ""); + 
clp.RegisterFlagOption("-header", (bool*)¶ms.printHeader, ""); + clp.RegisterIntOption("-bandSize", ¶ms.bandSize, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-extendBandSize", ¶ms.extendBandSize, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-guidedAlignBandSize", ¶ms.guidedAlignBandSize, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-maxAnchorsPerPosition", (int*) ¶ms.anchorParameters.maxAnchorsPerPosition, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-stopMappingOnceUnique", (int*) ¶ms.anchorParameters.stopMappingOnceUnique, "", CommandLineParser::NonNegativeInteger); + clp.RegisterStringOption("-out", ¶ms.outFileName, ""); + clp.RegisterIntOption("-match", ¶ms.match, "", CommandLineParser::Integer); + clp.RegisterIntOption("-mismatch", ¶ms.mismatch, "", CommandLineParser::Integer); + clp.RegisterIntOption("-minMatch", ¶ms.minMatchLength, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-maxMatch", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-maxLCPLength", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-indel", ¶ms.indel, "", CommandLineParser::Integer); + clp.RegisterIntOption("-insertion", ¶ms.insertion, "", CommandLineParser::Integer); + clp.RegisterIntOption("-deletion", ¶ms.deletion, "", CommandLineParser::Integer); + clp.RegisterIntOption("-idsIndel", ¶ms.idsIndel, "", CommandLineParser::Integer); + clp.RegisterIntOption("-sdpindel", ¶ms.sdpIndel, "", CommandLineParser::Integer); + clp.RegisterIntOption("-sdpIns", ¶ms.sdpIns, "", CommandLineParser::Integer); + clp.RegisterIntOption("-sdpDel", ¶ms.sdpDel, "", CommandLineParser::Integer); + clp.RegisterFloatOption("-indelRate", ¶ms.indelRate, "", CommandLineParser::NonNegativeFloat); + clp.RegisterFloatOption("-minRatio", ¶ms.minRatio, "", CommandLineParser::NonNegativeFloat); + 
clp.RegisterFloatOption("-sdpbypass", ¶ms.sdpBypassThreshold, "", CommandLineParser::NonNegativeFloat); + clp.RegisterFloatOption("-minFrac", &trashbinFloat, "", CommandLineParser::NonNegativeFloat); + clp.RegisterIntOption("-maxScore", ¶ms.maxScore, "", CommandLineParser::Integer); + clp.RegisterStringOption("-bwt", ¶ms.bwtFileName, ""); + clp.RegisterIntOption("m", ¶ms.printFormat, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-sam", ¶ms.printSAM, ""); +#ifdef USE_PBBAM + clp.RegisterFlagOption("-bam", ¶ms.printBAM, ""); +#endif + clp.RegisterStringOption("-clipping", ¶ms.clippingString, ""); + clp.RegisterIntOption("-sdpTupleSize", ¶ms.sdpTupleSize, "", CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-pvaltype", ¶ms.pValueType, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-start", ¶ms.startRead, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-stride", ¶ms.stride, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFloatOption("-subsample", ¶ms.subsample, "", CommandLineParser::PositiveFloat); + clp.RegisterIntOption("-nproc", ¶ms.nProc, "", CommandLineParser::PositiveInteger); + clp.RegisterFlagOption("-sortRefinedAlignments",(bool*) ¶ms.sortRefinedAlignments, ""); + clp.RegisterIntOption("-quallc", ¶ms.qualityLowerCaseThreshold, "", CommandLineParser::Integer); + clp.RegisterFlagOption("v", (bool*) ¶ms.verbosity, ""); + clp.RegisterIntOption("V", ¶ms.verbosity, "Specify a level of verbosity.", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-contextAlignLength", ¶ms.anchorParameters.contextAlignLength, "", CommandLineParser::PositiveInteger); + clp.RegisterFlagOption("-skipLookupTable", ¶ms.anchorParameters.useLookupTable, ""); + clp.RegisterStringOption("-metrics", ¶ms.metricsFileName, ""); + clp.RegisterStringOption("-lcpBounds", ¶ms.lcpBoundsFileName, ""); + clp.RegisterStringOption("-fullMetrics", ¶ms.fullMetricsFileName, ""); + 
clp.RegisterIntOption("-nbranch", ¶ms.anchorParameters.numBranches, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-divideByAdapter", ¶ms.byAdapter, ""); + clp.RegisterFlagOption("-useQuality", ¶ms.ignoreQualities, ""); + clp.RegisterFlagOption("-noFrontAlign", ¶ms.extendFrontAlignment, ""); + clp.RegisterIntOption("-minReadLength", ¶ms.minReadLength, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-maxReadLength", ¶ms.maxReadLength, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-minSubreadLength", ¶ms.minSubreadLength, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-minRawSubreadScore", ¶ms.minRawSubreadScore, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-minAvgQual", ¶ms.minAvgQual, "", CommandLineParser::Integer); + clp.RegisterFlagOption("-advanceHalf", ¶ms.advanceHalf, ""); + clp.RegisterIntOption("-advanceExactMatches", ¶ms.anchorParameters.advanceExactMatches, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-useccs", ¶ms.useCcs, ""); + clp.RegisterFlagOption("-useccsdenovo", ¶ms.useCcsOnly, ""); + clp.RegisterFlagOption("-useccsall", ¶ms.useAllSubreadsInCcs, ""); + clp.RegisterFlagOption("-extendDenovoCCSSubreads", ¶ms.extendDenovoCCSSubreads, ""); + clp.RegisterFlagOption("-noRefineAlignments", ¶ms.refineAlignments, ""); + clp.RegisterFlagOption("-refineConcordantAlignments", ¶ms.refineConcordantAlignments, ""); + clp.RegisterIntOption("-nCandidates", ¶ms.nCandidates, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-useTemp", (bool*) ¶ms.tempDirectory, ""); + clp.RegisterFlagOption("-noSplitSubreads", ¶ms.mapSubreadsSeparately, ""); + clp.RegisterFlagOption("-concordant", ¶ms.concordant, ""); + // When -concordant is turned on, blasr first selects a subread (e.g., the median length full-pass subread) + // of a zmw as template, maps the template subread to a reference, then infers directions of all other 
subreads + // of the same zmw based on direction of the template, and finally maps all other subreads to the same + // genomic coordinates as the template. When -concordantAlignBothDirections is turned on, blasr will align + // all other subreads both forwardly and backwardly, without infering their directions. This is a hidden + // diagnostic option only useful for analyzing movies which have lots of un-identified or missed adapters such + // that directions of subreads can not be inferred accurately. + clp.RegisterFlagOption("-concordantAlignBothDirections", ¶ms.concordantAlignBothDirections, ""); + clp.RegisterIntOption("-flankSize", ¶ms.flankSize, "", CommandLineParser::NonNegativeInteger); + clp.RegisterStringOption("-titleTable", ¶ms.titleTableName, ""); + clp.RegisterFlagOption("-useSensitiveSearch", ¶ms.doSensitiveSearch, ""); + clp.RegisterFlagOption("-ignoreRegions", ¶ms.useRegionTable, ""); + clp.RegisterFlagOption("-ignoreHQRegions", ¶ms.useHQRegionTable, ""); + clp.RegisterFlagOption("-computeAlignProbability", ¶ms.computeAlignProbability, ""); + clp.RegisterStringOption("-unaligned", ¶ms.unalignedFileName, ""); + // Print unaligned reads names only + clp.RegisterFlagOption("-noPrintUnalignedSeqs", ¶ms.noPrintUnalignedSeqs, ""); + clp.RegisterFlagOption("-global", ¶ms.doGlobalAlignment, ""); + clp.RegisterIntOption("-globalChainType", ¶ms.globalChainType, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-noPrintSubreadTitle", (bool*) ¶ms.printSubreadTitle, ""); + clp.RegisterIntOption("-saLookupTableLength", ¶ms.lookupTableLength, "", CommandLineParser::PositiveInteger); + clp.RegisterFlagOption("-useDetailedSDP", ¶ms.detailedSDPAlignment, ""); + clp.RegisterFlagOption("-nouseDetailedSDP", &trashbinBool, ""); + clp.RegisterIntOption("-sdpFilterType", ¶ms.sdpFilterType, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-scoreType", ¶ms.scoreType, "", CommandLineParser::NonNegativeInteger); + 
clp.RegisterFlagOption("h", ¶ms.printVerboseHelp, ""); + clp.RegisterFlagOption("-help", ¶ms.printDiscussion, ""); + clp.RegisterFloatOption("-accuracyPrior", ¶ms.readAccuracyPrior, "", CommandLineParser::NonNegativeFloat); + // holeNumberRangesStr is a string of comma-delimited hole number ranges, such as '1,2,3,10-15'. + // Blasr only analyzes reads whose hole numbers are in the specified hole number ranges. + clp.RegisterStringOption("-holeNumbers", ¶ms.holeNumberRangesStr, ""); + clp.RegisterIntOption("-substitutionPrior", ¶ms.substitutionPrior, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-deletionPrior", ¶ms.globalDeletionPrior, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-recurseOver", ¶ms.recurseOver, "", CommandLineParser::NonNegativeInteger); + clp.RegisterStringOption("-scoreMatrix", ¶ms.scoreMatrixString, ""); + clp.RegisterFlagOption("-printDotPlots", ¶ms.printDotPlots, ""); + clp.RegisterFlagOption("-preserveReadTitle", ¶ms.preserveReadTitle,""); + clp.RegisterFlagOption("-forwardOnly", ¶ms.forwardOnly,""); + clp.RegisterFlagOption("-affineAlign", ¶ms.affineAlign, ""); + clp.RegisterIntOption("-affineOpen", ¶ms.affineOpen, "", CommandLineParser::NonNegativeInteger); + clp.RegisterIntOption("-affineExtend", ¶ms.affineExtend, "", CommandLineParser::NonNegativeInteger); + clp.RegisterFlagOption("-scaleMapQVByNClusters", ¶ms.scaleMapQVByNumSignificantClusters, "", false); + clp.RegisterFlagOption("-printSAMQV", ¶ms.printSAMQV, "", false); + clp.RegisterFlagOption("-cigarUseSeqMatch", ¶ms.cigarUseSeqMatch, ""); + clp.RegisterStringListOption("-samQV", ¶ms.samQV, ""); + clp.RegisterFlagOption("-fastMaxInterval", ¶ms.fastMaxInterval, "", false); + clp.RegisterFlagOption("-aggressiveIntervalCut", ¶ms.aggressiveIntervalCut, "", false); + clp.RegisterFlagOption("-fastSDP", ¶ms.fastSDP, "", false); + clp.RegisterStringOption("-concordantTemplate", ¶ms.concordantTemplate, "typicalsubread"); + + 
RegisterFilterOptions(clp, params.minAlnLength, params.minPctSimilarity, params.minPctAccuracy, + params.hitPolicyStr, trashbinBool=true, trashbinInt, params.maxScore); +} + +const string BlasrHelp(MappingParameters & params) { + stringstream helpStream; + helpStream << " Options for blasr " << endl + << " Basic usage: 'blasr reads.{bam|fasta|bax.h5|fofn} genome.fasta [-options] " << endl + << " option\tDescription (default_value)." << endl << endl + << " Input Files." << endl + << " reads.bam is a PacBio BAM file of reads." << endl + << " This is the preferred input to blasr because rich quality" << endl + << " value (insertion,deletion, and substitution quality values) information is " << endl + << " maintained. The extra quality information improves variant detection and mapping"< 3." << endl + << " --maxMatch l (inf)" << endl + << " Stop mapping a read to the genome when the lcp length reaches l. " << endl + << " This is useful when the query is part of the reference, for example when " < +#include +#include +#include + +/// Register options for filtering alignments. 
+void RegisterFilterOptions(CommandLineParser & clp, int & minAlnLength, + float & minPctSimilarity, float & minPctAccuracy, + std::string & hitPolicyStr, bool & useScoreCutoff, + int & scoreSignInt, int & scoreCutoff) { + ScoreSign ss = static_cast(scoreSignInt); + Score sc(static_cast(scoreCutoff), ss); + FilterCriteria fc(static_cast(minAlnLength), + minPctSimilarity, minPctAccuracy, + useScoreCutoff, sc); + + HitPolicy hp("randombest", ScoreSign::NEGATIVE); + + clp.RegisterIntOption("-minAlnLength", &minAlnLength, + fc.MinAlnLengthHelp(), + CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-minAlignLength", &minAlnLength, + "Alias of --minAlnLength", + CommandLineParser::PositiveInteger); + clp.RegisterIntOption("-minLength", &minAlnLength, + "Alias of --minAlnLength", + CommandLineParser::PositiveInteger); + + clp.RegisterFloatOption("-minPctSimilarity", &minPctSimilarity, + fc.MinPctSimilarityHelp(), + CommandLineParser::PositiveFloat); + clp.RegisterFloatOption("-minPctIdentity", &minPctSimilarity, + "Alias of --minPctSimilarity", + CommandLineParser::PositiveFloat); + + clp.RegisterFloatOption("-minPctAccuracy", &minPctAccuracy, + fc.MinPctAccuracyHelp(), + CommandLineParser::PositiveFloat); + clp.RegisterFloatOption("-minAccuracy", &minPctAccuracy, + "Alias of --minPctAccuracy", + CommandLineParser::PositiveFloat); + + clp.RegisterStringOption("-hitPolicy", &hitPolicyStr, hp.Help()); + + clp.RegisterIntOption("-scoreSign", &scoreSignInt, + fc.ScoreSignHelp(), + CommandLineParser::Integer); + + clp.RegisterIntOption("-scoreCutoff", &scoreCutoff, + fc.ScoreCutoffHelp(), + CommandLineParser::Integer); +} diff -Nru blasr-0~20151014+git8e668be/include/BlasrAlign.hpp blasr-5.3/include/BlasrAlign.hpp --- blasr-0~20151014+git8e668be/include/BlasrAlign.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrAlign.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,122 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. 
-// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. 
- -// Author: Mark Chaisson -#ifndef __BLASR_ALIGN_HPP_ -#define __BLASR_ALIGN_HPP_ - -#include "BlasrHeaders.h" -#include "BlasrMiscs.hpp" - -//------------------MAP READS---------------------------------// -template -void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, - T_SuffixArray &sarray, - BWT &bwt, - SeqBoundaryFtr &seqBoundary, - T_TupleCountTable &ct, - SequenceIndexDatabase &seqdb, - MappingParameters ¶ms, - MappingMetrics &metrics, - vector &alignmentPtrs, - MappingBuffers &mappingBuffers, - MappingIPC *mapData, - MappingSemaphores & semaphores); - -template -void MapRead(T_Sequence &read, T_Sequence &readRC, - vector &alignmentPtrs, - MappingBuffers &mappingBuffers, - MappingIPC *mapData, - MappingSemaphores & semaphores); - -/* -void MapReads(MappingData *mapData); -*/ - -//------------------MAKE ALIGNMENTS---------------------------// -template -void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, - WeightedIntervalSet &weightedIntervals, - int mutationCostMatrix[][5], - int ins, int del, int sdpTupleSize, - int useSeqDB, SequenceIndexDatabase &seqDB, - vector &alignments, - MappingParameters ¶ms, - MappingBuffers &mappingBuffers, - int procId=0); - -template -void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, - int k, - MappingParameters ¶ms, T_AlignmentCandidate &alignment, - MappingBuffers &mappingBuffers, - AlignmentType alignType=Global); - -// Extend target aligned sequence of the input alignement to both ends -// by flankSize bases. Update alignment->tAlignedSeqPos, -// alignment->tAlignedSeqLength and alignment->tAlignedSeq. -void FlankTAlignedSeq(T_AlignmentCandidate * alignment, - SequenceIndexDatabase &seqdb, - DNASequence & genome, - int flankSize); - -// Align a subread of a SMRT sequence to target sequence of an alignment. -// Input: -// subread - a subread of a SMRT sequence. -// unrolledRead - the full SMRT sequence. -// alignment - an alignment. 
-// passDirection - whether or not the subread has the -// same direction as query of the alignment. -// 0 = true, 1 = false. -// subreadInterval - [start, end) interval of the subread in the -// SMRT read. -// subreadIndex - index of the subread in allReadAlignments. -// params - mapping paramters. -// Output: -// allReadAlignments - where the sequence and alignments of the -// subread are saved. -// threadOut - an out stream for debugging the current thread. -void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, - SMRTSequence & subread, SMRTSequence & unrolledRead, - T_AlignmentCandidate * alignment, - int passDirection, ReadInterval & subreadInterval, - int subreadIndex, - MappingParameters & params, - MappingBuffers & mappingBuffers, - ostream & threadOut); - -#include "BlasrAlignImpl.hpp" -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrAlignImpl.hpp blasr-5.3/include/BlasrAlignImpl.hpp --- blasr-0~20151014+git8e668be/include/BlasrAlignImpl.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrAlignImpl.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,1564 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. 
-// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. 
- -// Author: Mark Chaisson - -#ifndef _BLASR_ALIGN_IMPL_HPP_ -#define _BLASR_ALIGN_IMPL_HPP_ - -template -void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, - T_SuffixArray &sarray, - BWT &bwt, - SeqBoundaryFtr &seqBoundary, - T_TupleCountTable &ct, - SequenceIndexDatabase &seqdb, - MappingParameters ¶ms, - MappingMetrics &metrics, - vector &alignmentPtrs, - MappingBuffers &mappingBuffers, - MappingIPC *mapData, - MappingSemaphores & semaphores) -{ - - bool matchFound; - WeightedIntervalSet topIntervals(params.nCandidates); - int numKeysMatched=0, rcNumKeysMatched=0; - int expand = params.minExpand; - metrics.clocks.total.Tick(); - int nTotalCells = 0; - int forwardNumBasesMatched = 0, reverseNumBasesMatched = 0; - do { - matchFound = false; - mappingBuffers.matchPosList.clear(); - mappingBuffers.rcMatchPosList.clear(); - alignmentPtrs.clear(); - topIntervals.clear(); - params.anchorParameters.expand = expand; - - metrics.clocks.mapToGenome.Tick(); - - if (params.useSuffixArray) { - params.anchorParameters.lcpBoundsOutPtr = mapData->lcpBoundsOutPtr; - numKeysMatched = - MapReadToGenome(genome, sarray, read, params.lookupTableLength, mappingBuffers.matchPosList, - params.anchorParameters); - - // - // Only print values for the read in forward direction (and only - // the first read). 
- // - mapData->lcpBoundsOutPtr = NULL; - if (!params.forwardOnly) { - rcNumKeysMatched = - MapReadToGenome(genome, sarray, readRC, params.lookupTableLength, mappingBuffers.rcMatchPosList, - params.anchorParameters); - } - } - else if (params.useBwt){ - numKeysMatched = MapReadToGenome(bwt, read, read.SubreadStart(), read.SubreadEnd(), - mappingBuffers.matchPosList, params.anchorParameters, forwardNumBasesMatched); - if (!params.forwardOnly) { - rcNumKeysMatched = MapReadToGenome(bwt, readRC, readRC.SubreadStart(), readRC.SubreadEnd(), - mappingBuffers.rcMatchPosList, params.anchorParameters, reverseNumBasesMatched); - } - } - - // - // Look to see if only the anchors are printed. - if (params.anchorFileName != "") { - int i; - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_wait(semaphores.writer); -#else - sem_wait(&semaphores.writer); -#endif - } - *mapData->anchorFilePtr << read.title << endl; - for (i = 0; i < mappingBuffers.matchPosList.size(); i++) { - *mapData->anchorFilePtr << mappingBuffers.matchPosList[i] << endl; - } - *mapData->anchorFilePtr << readRC.title << " (RC) " << endl; - for (i = 0; i < mappingBuffers.rcMatchPosList.size(); i++) { - *mapData->anchorFilePtr << mappingBuffers.rcMatchPosList[i] << endl; - } - - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_post(semaphores.writer); -#else - sem_post(&semaphores.writer); -#endif - } - } - - metrics.totalAnchors += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); - metrics.clocks.mapToGenome.Tock(); - - metrics.clocks.sortMatchPosList.Tick(); - SortMatchPosList(mappingBuffers.matchPosList); - SortMatchPosList(mappingBuffers.rcMatchPosList); - metrics.clocks.sortMatchPosList.Tock(); - - PValueWeightor lisPValue(read, genome, ct.tm, &ct); - MultiplicityPValueWeightor lisPValueByWeight(genome); - - LISSumOfLogPWeightor > lisPValueByLogSum(genome); - - LISSizeWeightor > lisWeightFn; - - IntervalSearchParameters intervalSearchParameters; - 
intervalSearchParameters.globalChainType = params.globalChainType; - intervalSearchParameters.advanceHalf = params.advanceHalf; - intervalSearchParameters.warp = params.warp; - intervalSearchParameters.fastMaxInterval = params.fastMaxInterval; - intervalSearchParameters.aggressiveIntervalCut = params.aggressiveIntervalCut; - intervalSearchParameters.verbosity = params.verbosity; - - // - // If specified, only align a band from the anchors. - // - DNALength squareRefLength = read.length * 1.25 + params.limsAlign; - if (params.limsAlign != 0) { - int fi; - for (fi = 0; fi < mappingBuffers.matchPosList.size(); fi++) { - if (mappingBuffers.matchPosList[fi].t >= squareRefLength) { break; } - } - if (fi < mappingBuffers.matchPosList.size()) { - mappingBuffers.matchPosList.resize(fi); - } - } - - metrics.clocks.findMaxIncreasingInterval.Tick(); - - // - // For now say that something that has a 50% chance of happening - // by chance is too high of a p value. This is probably many times - // the size. - // - intervalSearchParameters.maxPValue = log(0.5); - intervalSearchParameters.aboveCategoryPValue = -300; - VarianceAccumulator accumPValue; - VarianceAccumulator accumWeight; - VarianceAccumulator accumNBases; - - mappingBuffers.clusterList.Clear(); - mappingBuffers.revStrandClusterList.Clear(); - - // - // Remove anchors that are fully encompassed by longer ones. This - // speeds up limstemplate a lot. 
- // - - RemoveOverlappingAnchors(mappingBuffers.matchPosList); - RemoveOverlappingAnchors(mappingBuffers.rcMatchPosList); - - if (params.pValueType == 0) { - int original = mappingBuffers.matchPosList.size(); - - int numMerged = 0; - if (params.printDotPlots) { - ofstream dotPlotOut; - string dotPlotName = string(read.title) + ".anchors"; - CrucialOpen(dotPlotName, dotPlotOut, std::ios::out); - int mp; - for (mp = 0; mp < mappingBuffers.matchPosList.size(); mp++ ){ - dotPlotOut << mappingBuffers.matchPosList[mp].q << " " << mappingBuffers.matchPosList[mp].t << " " << mappingBuffers.matchPosList[mp].l << " " << endl; - } - dotPlotOut.close(); - } - /* - This is an optimization that is being tested out that places a grid over the - area where there are anchors, and then finds an increasing maximally weighted - path through the grid. The weight of a cell in the grid is the sum of the - number of anchors in it. All other anchors are to be removed. This will likely - only work for LIMSTemplate sequences, or other sequences with little structural - variation. - FindBand(mappingBuffers.matchPosList, - refCopy, read, 100); - */ - FindMaxIncreasingInterval(Forward, - mappingBuffers.matchPosList, - // allow for indels to stretch out the mapping of the read. - (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValue,//lisPValue2, - lisWeightFn, - topIntervals, genome, read, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.clusterList, - accumPValue, accumWeight, accumNBases, read.title); - // Uncomment when the version of the weight functor needs the sequence. 
- - mappingBuffers.clusterList.ResetCoordinates(); - - FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, - (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValue,//lisPValue2 - lisWeightFn, - topIntervals, genome, readRC, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.revStrandClusterList, - accumPValue, accumWeight, accumNBases, read.title); - } - else if (params.pValueType == 1) { - FindMaxIncreasingInterval(Forward, - mappingBuffers.matchPosList, - // allow for indels to stretch out the mapping of the read. - (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValueByWeight, // different from pvaltype == 2 and 0 - lisWeightFn, - topIntervals, genome, read, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.clusterList, - accumPValue, accumWeight, accumNBases, - read.title); - - - mappingBuffers.clusterList.ResetCoordinates(); - FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, - (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValueByWeight, // different from pvaltype == 2 and 0 - lisWeightFn, - topIntervals, genome, readRC, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.revStrandClusterList, - accumPValue, accumWeight, accumNBases, - read.title); - } - else if (params.pValueType == 2) { - FindMaxIncreasingInterval(Forward, - mappingBuffers.matchPosList, - // allow for indels to stretch out the mapping of the read. 
- (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValueByLogSum, // different from pvaltype == 1 and 0 - lisWeightFn, - topIntervals, genome, read, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.clusterList, - accumPValue, accumWeight, accumNBases, - read.title); - - mappingBuffers.clusterList.ResetCoordinates(); - FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, - (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, - seqBoundary, - lisPValueByLogSum, // different from pvaltype == 1 and 0 - lisWeightFn, - topIntervals, genome, readRC, intervalSearchParameters, - &mappingBuffers.globalChainEndpointBuffer, - mappingBuffers.revStrandClusterList, - accumPValue, accumWeight, accumNBases, - read.title); - } - - mappingBuffers.clusterList.numBases.insert(mappingBuffers.clusterList.numBases.end(), - mappingBuffers.revStrandClusterList.numBases.begin(), - mappingBuffers.revStrandClusterList.numBases.end()); - - mappingBuffers.clusterList.numAnchors.insert(mappingBuffers.clusterList.numAnchors.end(), - mappingBuffers.revStrandClusterList.numAnchors.begin(), - mappingBuffers.revStrandClusterList.numAnchors.end()); - - metrics.clocks.findMaxIncreasingInterval.Tock(); - - // - // Print verbose output. 
- // - WeightedIntervalSet::iterator topIntIt, topIntEnd; - topIntEnd = topIntervals.end(); - if (params.verbosity > 0) { - int topintind = 0; - cout << " intv: index start end qstart qend seq_boundary_start seq_boundary_end pvalue " << endl; - for (topIntIt = topIntervals.begin();topIntIt != topIntEnd ; ++topIntIt) { - cout << " intv: " << topintind << " " << (*topIntIt).start << " " - << (*topIntIt).end << " " - << (*topIntIt).qStart << " " << (*topIntIt).qEnd << " " - << seqBoundary((*topIntIt).start) << " " << seqBoundary((*topIntIt).end) << " " - << (*topIntIt).pValue << endl; - if (params.verbosity > 2) { - int m; - for (m = 0; m < (*topIntIt).matches.size(); m++) { - cout << " (" << (*topIntIt).matches[m].q << ", " << (*topIntIt).matches[m].t << ", " << (*topIntIt).matches[m].l << ") "; - } - cout << endl; - } - ++topintind; - } - } - - // - // Allocate candidate alignments on the stack. Each interval is aligned. - // - alignmentPtrs.resize(topIntervals.size()); - UInt i; - for (i = 0; i < alignmentPtrs.size(); i++ ) { - alignmentPtrs[i] = new T_AlignmentCandidate; - } - metrics.clocks.alignIntervals.Tick(); - AlignIntervals( genome, read, readRC, - topIntervals, - SMRTDistanceMatrix, - params.indel, params.indel, - params.sdpTupleSize, - params.useSeqDB, seqdb, - alignmentPtrs, - params, - mappingBuffers, - params.startRead ); - - /* cout << read.title << endl; - for (i = 0; i < alignmentPtrs.size(); i++) { - cout << alignmentPtrs[i]->clusterScore << " " << alignmentPtrs[i]->score << endl; - } - */ - StoreRankingStats(alignmentPtrs, accumPValue, accumWeight); - - std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); - metrics.clocks.alignIntervals.Tock(); - - // - // Evalutate the matches that are found for 'good enough'. - // - - matchFound = CheckForSufficientMatch(read, alignmentPtrs, params); - - // - // When no proper alignments are found, the loop will resume. - // Delete all alignments because they are bad. 
- // - if (expand < params.maxExpand and matchFound == false) { - DeleteAlignments(alignmentPtrs, 0); - } - - // - // Record some metrics that show how long this took to run per base. - // - - if (alignmentPtrs.size() > 0) { - metrics.RecordNumAlignedBases(read.length); - metrics.RecordNumCells(alignmentPtrs[0]->nCells); - } - - if (matchFound == true) { - metrics.totalAnchorsForMappedReads += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); - } - ++expand; - } while ( expand <= params.maxExpand and matchFound == false); - metrics.clocks.total.Tock(); - UInt i; - int totalCells = 0; - for (i = 0; i< alignmentPtrs.size(); i++) { - totalCells += alignmentPtrs[i]->nCells; - } - metrics.clocks.AddCells(totalCells); - int totalBases = 0; - for (i = 0; i < alignmentPtrs.size(); i++) { - totalBases += alignmentPtrs[i]->qLength; - } - metrics.clocks.AddBases(totalBases); - // - // Some of the alignments are to spurious regions. Delete the - // references that have too small of a score. - // - - int effectiveReadLength = 0; - for (i = 0; i< read.length; i++) { - if (read.seq[i] != 'N') effectiveReadLength++; - } - if (params.sdpFilterType == 0) { - RemoveLowQualityAlignments(read, alignmentPtrs, params); - } - else if (params.sdpFilterType == 1) { - RemoveLowQualitySDPAlignments(effectiveReadLength, alignmentPtrs, params); - } - - // - // Now remove overlapping alignments. - // - - vector bothQueryStrands; - bothQueryStrands.resize(2); - bothQueryStrands[Forward] = &read; - bothQueryStrands[Reverse] = &readRC; - - - // - // Possibly use banded dynamic programming to refine the columns - // of an alignment and the alignment score. 
- // - if (params.refineAlignments) { - RefineAlignments(bothQueryStrands, genome, alignmentPtrs, params, mappingBuffers); - RemoveLowQualityAlignments(read,alignmentPtrs,params); - RemoveOverlappingAlignments(alignmentPtrs, params); - } - - if (params.forPicard) { - int a; - for (a = 0; a < alignmentPtrs.size(); a++ ) { - alignmentPtrs[a]->OrderGapsByType(); - } - } - - // - // Look to see if the number of anchors found for this read match - // what is expected given the expected distribution of number of - // anchors. - // - - if (alignmentPtrs.size() > 0) { - int clusterIndex; - // - // Compute some stats on the read. For now this is fixed but will - // be updated on the fly soon. - // - float meanAnchorBasesPerRead, sdAnchorBasesPerRead; - float meanAnchorsPerRead, sdAnchorsPerRead; - - int lookupValue; - // - // If a very short anchor size was used, or very long min match - // size there may be no precomputed distributions for it. - // Handle this by bounding the min match by the smallest and - // largest values for which there are precomputed statistics. - - int boundedMinWordMatchLength = min(max(params.minMatchLength, anchorMinKValues[0]), anchorMinKValues[1]); - - // - // Do a similar bounding for match length and accuracy. - // - int boundedMatchLength = min(max((int) alignmentPtrs[0]->qAlignedSeq.length, anchorReadLengths[0]), anchorReadLengths[1]); - int boundedPctSimilarity = min(max((int)alignmentPtrs[0]->pctSimilarity, anchorReadAccuracies[0]), anchorReadAccuracies[1]); - - lookupValue = LookupAnchorDistribution(boundedMatchLength, boundedMinWordMatchLength, boundedPctSimilarity, - meanAnchorsPerRead, sdAnchorsPerRead, meanAnchorBasesPerRead, sdAnchorBasesPerRead); - - float minExpAnchors = meanAnchorsPerRead - sdAnchorsPerRead; - // - // The number of standard deviations is just trial and error. 
- float minExpAnchorBases = meanAnchorBasesPerRead - 2 * sdAnchorBasesPerRead; - if (lookupValue < 0 or minExpAnchorBases < 0) { - minExpAnchorBases = 0; - } - int numSignificantClusters = 0; - int totalSignificantClusterSize = 0; - int maxClusterSize = 0; - int maxClusterIndex = 0; - int numAlnAnchorBases, numAlnAnchors, scaledMaxClusterSize; - alignmentPtrs[0]->ComputeNumAnchors(boundedMinWordMatchLength, numAlnAnchors, numAlnAnchorBases); - int totalAnchorBases = 0; - if (numAlnAnchorBases > meanAnchorBasesPerRead + sdAnchorBasesPerRead) { - numSignificantClusters = 1; - } - else { - if (alignmentPtrs[0]->score < params.maxScore) { - for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { - if (mappingBuffers.clusterList.numBases[clusterIndex] > maxClusterSize) { - maxClusterSize = mappingBuffers.clusterList.numBases[clusterIndex]; - maxClusterIndex = clusterIndex; - } - } - int scaledExpectedClusterSize = maxClusterSize / ((float)numAlnAnchorBases) * minExpAnchorBases; - for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { - bool isSignificant = false; - if (mappingBuffers.clusterList.numBases[clusterIndex] >= scaledExpectedClusterSize) { - // cout << mappingBuffers.clusterList.numBases[clusterIndex] << " " << scaledExpectedClusterSize << " " << meanAnchorBasesPerRead << " " << sdAnchorBasesPerRead << endl; - ++numSignificantClusters; - totalSignificantClusterSize += meanAnchorBasesPerRead; - isSignificant = true; - } - // - // The following output block is useful in debugging mapqv - // calculation. It should be uncommented and examined when - // mapqvs do not look correct. 
- // - totalAnchorBases += mappingBuffers.clusterList.numBases[clusterIndex]; - } - } - - if (lookupValue == 0) { - int scaledMaxClusterSize; - alignmentPtrs[0]->ComputeNumAnchors(params.minMatchLength, numAlnAnchors, numAlnAnchorBases); - scaledMaxClusterSize = ( ((float)numAlnAnchorBases )/ meanAnchorBasesPerRead) * maxClusterSize; - } - } - - for (i = 0; i < alignmentPtrs.size(); i++) { - alignmentPtrs[i]->numSignificantClusters = numSignificantClusters; - } - if (mapData->clusterFilePtr != NULL and topIntervals.size() > 0 and alignmentPtrs.size() > 0) { - WeightedIntervalSet::iterator intvIt = topIntervals.begin(); - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_wait(semaphores.hitCluster); -#else - sem_wait(&semaphores.hitCluster); -#endif - } - - *mapData->clusterFilePtr << (*intvIt).size << " " << (*intvIt).pValue << " " << (*intvIt).nAnchors << " " - << read.length << " " << alignmentPtrs[0]->score << " " << alignmentPtrs[0]->pctSimilarity << " " - << " " << minExpAnchors << " " << alignmentPtrs[0]->qAlignedSeq.length << endl; - - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_post(semaphores.hitCluster); -#else - sem_post(&semaphores.hitCluster); -#endif - } - } - - } - - // - // Assign the query name and strand for each alignment. 
- // - - for (i = 0; i < alignmentPtrs.size(); i++) { - T_AlignmentCandidate *aref = alignmentPtrs[i]; - if (aref->tStrand == 0) { - aref->qName = read.GetName(); - } - else { - aref->qName = readRC.GetName(); - } - } - - AssignRefContigLocations(alignmentPtrs, seqdb, genome); -} - -template -void MapRead(T_Sequence &read, T_Sequence &readRC, - vector &alignmentPtrs, - MappingBuffers &mappingBuffers, - MappingIPC *mapData, - MappingSemaphores & semaphores) -{ - DNASuffixArray sarray; - TupleCountTable ct; - SequenceIndexDatabase seqdb; - T_GenomeSequence genome; - BWT *bwtPtr = mapData->bwtPtr; - mapData->ShallowCopySuffixArray(sarray); - mapData->ShallowCopyReferenceSequence(genome); - mapData->ShallowCopySequenceIndexDatabase(seqdb); - mapData->ShallowCopyTupleCountTable(ct); - SeqBoundaryFtr seqBoundary(&seqdb); - - return - MapRead(read, readRC, - genome, // possibly multi fasta file read into one sequence - sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures - seqBoundary, // Boundaries of contigs in the - // genome, alignments do not span - // the ends of boundaries. - ct, // Count table to use word frequencies in the genome to weight matches. - seqdb, // Information about the names of - // chromosomes in the genome, and - // where their sequences are in the genome. - mapData->params,// A huge list of parameters for - // mapping, only compile/command - // line values set. - mapData->metrics, // Keep track of time/ hit counts, - // etc.. Not fully developed, but - // should be. - alignmentPtrs, // Where the results are stored. - mappingBuffers, // A class of buffers for structurs - // like dyanmic programming - // matrices, match lists, etc., that are not - // reallocated between calls to - // MapRead. They are cleared though. - mapData, // Some values that are shared - // across threads. 
- semaphores); -} - -template -void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, - WeightedIntervalSet &weightedIntervals, - int mutationCostMatrix[][5], - int ins, int del, int sdpTupleSize, - int useSeqDB, SequenceIndexDatabase &seqDB, - vector &alignments, - MappingParameters ¶ms, - MappingBuffers &mappingBuffers, - int procId) { - - vector forrev; - forrev.resize(2); - forrev[Forward] = &read; - forrev[Reverse] = &rcRead; - - // - // Use an edit distance scoring function instead of IDS. Although - // the IDS should be more accurate, it is more slow, and it is more - // important at this stage to have faster alignments than accurate, - // since all alignments are rerun using GuidedAlignment later on. - // - DistanceMatrixScoreFunction distScoreFn(SMRTDistanceMatrix, params.insertion, params.deletion); - DistanceMatrixScoreFunction distScoreFn2(SMRTDistanceMatrix, ins, ins); - - // - // Assume there is at least one interval. - // - if (weightedIntervals.size() == 0) - return; - - WeightedIntervalSet::iterator intvIt = weightedIntervals.begin(); - int alignmentIndex = 0; - - - do { - - T_AlignmentCandidate *alignment = alignments[alignmentIndex]; - alignment->clusterWeight= (*intvIt).size; // totalAnchorSize == size - alignment->clusterScore = (*intvIt).pValue; - - // - // Advance references. Intervals are stored in reverse order, so - // go backwards in the list, and alignments are in forward order. - // That should probably be changed. - // - ++alignmentIndex; - - // - // Try aligning the read to the genome. 
- // - DNALength matchIntervalStart, matchIntervalEnd; - matchIntervalStart = (*intvIt).start; - matchIntervalEnd = (*intvIt).end; - - bool readOverlapsContigStart = false; - bool readOverlapsContigEnd = false; - int startOverlappedContigIndex = 0; - int endOverlappedContigIndex = 0; - if (params.verbosity > 0) { - cout << "aligning interval : " << read.length << " " << (*intvIt).start << " " - << (*intvIt).end << " " << (*intvIt).qStart << " " << (*intvIt).qEnd - << " " << matchIntervalStart << " to " << matchIntervalEnd << " " - << params.approximateMaxInsertionRate << " " << endl; - } - assert(matchIntervalEnd >= matchIntervalStart); - - // - // If using a sequence database, check to make sure that the - // boundaries of the sequence windows do not overlap with - // the boundaries of the reads. If the beginning is before - // the boundary, move the beginning up to the start of the read. - // If the end is past the end boundary of the read, similarly move - // the window boundary to the end of the read boundary. - - DNALength tAlignedContigStart = 0; - int seqDBIndex = 0; - - - // - // Stretch the alignment interval so that it is close to where - // the read actually starts. 
- // - DNALength subreadStart = read.SubreadStart(); - DNALength subreadEnd = read.SubreadEnd(); - if ((*intvIt).GetStrandIndex() == Reverse) { - subreadEnd = read.MakeRCCoordinate(read.SubreadStart()) + 1; - subreadStart = read.MakeRCCoordinate(read.SubreadEnd()-1); - } - - DNALength lengthBeforeFirstMatch = ((*intvIt).qStart - subreadStart) * params.approximateMaxInsertionRate ; - DNALength lengthAfterLastMatch = (subreadEnd - (*intvIt).qEnd) * params.approximateMaxInsertionRate; - if (matchIntervalStart < lengthBeforeFirstMatch or params.doGlobalAlignment) { - matchIntervalStart = 0; - } - else { - matchIntervalStart -= lengthBeforeFirstMatch; - } - - if (genome.length < matchIntervalEnd + lengthAfterLastMatch or params.doGlobalAlignment) { - matchIntervalEnd = genome.length; - } - else { - matchIntervalEnd += lengthAfterLastMatch; - } - - DNALength intervalContigStartPos, intervalContigEndPos; - if (useSeqDB) { - // - // The sequence db index is the one where the actual match is - // contained. The matchIntervalStart might be before the sequence - // index boundary due to the extrapolation of alignment start by - // insertion rate. If this is the case, bump up the - // matchIntervalStart to be at the beginning of the boundary. - // Modify bounds similarly for the matchIntervalEnd and the end - // of a boundary. - // - seqDBIndex = seqDB.SearchForIndex((*intvIt).start); - intervalContigStartPos = seqDB.seqStartPos[seqDBIndex]; - if (intervalContigStartPos > matchIntervalStart) { - matchIntervalStart = intervalContigStartPos; - } - intervalContigEndPos = seqDB.seqStartPos[seqDBIndex+1] - 1; - if (intervalContigEndPos < matchIntervalEnd) { - matchIntervalEnd = intervalContigEndPos; - } - alignment->tName = seqDB.GetSpaceDelimitedName(seqDBIndex); - alignment->tLength = intervalContigEndPos - intervalContigStartPos; - // - // When there are multiple sequences in the database, store the - // index of this sequence. 
This lets one compare the contigs - // that reads are mapped to, for instance. - // - alignment->tIndex = seqDBIndex; - } - else { - alignment->tLength = genome.length; - alignment->tName = genome.GetName(); - intervalContigStartPos = 0; - intervalContigEndPos = genome.length; - // - // When there are multiple sequences in the database, store the - // index of this sequence. This lets one compare the contigs - // that reads are mapped to, for instance. - // - } - alignment->qName = read.title; - // - // Look to see if a read overhangs the beginning of a contig. - // - if (params.verbosity > 2) { - cout << "Check for prefix/suffix overlap on interval: " << (*intvIt).qStart << " ?> " << (*intvIt).start - intervalContigStartPos < (*intvIt).start - intervalContigStartPos) { - readOverlapsContigStart = true; - startOverlappedContigIndex = seqDBIndex; - } - - // - // Look to see if the read overhangs the end of a contig. - // - if (params.verbosity > 2) { - cout << "Check for suffix/prefix overlap on interval, read overhang: " << read.length - (*intvIt).qEnd << " ?> " << matchIntervalEnd - (*intvIt).end < matchIntervalEnd - (*intvIt).end) { - if (params.verbosity > 2) { - cout << "read overlaps genome end." << endl; - } - readOverlapsContigEnd = true; - endOverlappedContigIndex = seqDBIndex; - } - int alignScore; - alignScore = 0; - - alignment->tAlignedSeqPos = matchIntervalStart; - alignment->tAlignedSeqLength = matchIntervalEnd - matchIntervalStart; - if ((*intvIt).GetStrandIndex() == Forward) { - alignment->tAlignedSeq.Copy(genome, alignment->tAlignedSeqPos, alignment->tAlignedSeqLength); - alignment->tStrand = Forward; - } - else { - DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); - genome.CopyAsRC(alignment->tAlignedSeq, rcAlignedSeqPos, alignment->tAlignedSeqLength); - // Map forward coordinates into reverse complement. 
- - intervalContigStartPos = genome.MakeRCCoordinate(intervalContigStartPos) + 1; - intervalContigEndPos = genome.MakeRCCoordinate(intervalContigEndPos - 1); - swap(intervalContigStartPos, intervalContigEndPos); - alignment->tAlignedSeqPos = rcAlignedSeqPos; - alignment->tStrand = Reverse; - } - - // Configure the part of the query that is aligned. The entire - // query should always be aligned. - alignment->qAlignedSeqPos = 0; - alignment->qAlignedSeq.ReferenceSubstring(read); - alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; - alignment->qLength = read.length; - alignment->qStrand = 0; - - if (params.verbosity > 1) { - cout << "aligning read " << endl; - static_cast(&(alignment->qAlignedSeq))->PrintSeq(cout); - cout << endl << "aligning reference" << endl; - static_cast(&(alignment->tAlignedSeq))->PrintSeq(cout); - cout << endl; - } - - // - // The type of alignment that is performed depends on the mode - // blasr is running in. If it is running in normal mode, local - // aligment is performed and guided by SDP alignment. When - // running in overlap mode, the alignments are forced to the ends - // of reads. - // - - int intervalSize = 0; - int m; - // - // Check to see if the matches to the genome are sufficiently - // dense to allow them to be used instead of having to redo - // sdp alignment. - // - - // First count how much of the read matches the genome exactly. - for (m = 0; m < intvIt->matches.size(); m++) { intervalSize += intvIt->matches[m].l;} - - int subreadLength = forrev[(*intvIt).GetStrandIndex()]->SubreadEnd() - forrev[(*intvIt).GetStrandIndex()]->SubreadStart(); - if ((1.0*intervalSize) / subreadLength < params.sdpBypassThreshold and !params.emulateNucmer) { - // - // Not enough of the read maps to the genome, need to use - // sdp alignment to define the regions of the read that map. 
- // - if (params.refineBetweenAnchorsOnly) { - - // - // Run SDP alignment only between the genomic anchors, - // including the genomic anchors as part of the alignment. - // - int m; - - vector *matches; - vector rcMatches; - Alignment anchorsOnly; - DNASequence tAlignedSeq; - FASTQSequence qAlignedSeq; - // - // The strand bookkeeping is a bit confusing, so hopefully - // this will set things straight. - // - // If the alignment is forward strand, the coordinates of the - // blocks are relative to the forward read, starting at 0, not - // the subread start. - // If the alignment is reverse strand, the coordinates of the - // blocks are relative to the reverse strand, starting at the - // position of the subread on the reverse strand. - // - // The coordinates of the blocks in the genome are always - // relative to the forward strand on the genome, starting at - // 0. - // - - // - // The first step to refining between anchors only is to make - // the anchors relative to the tAlignedSeq. - - matches = (vector*) &(*intvIt).matches; - tAlignedSeq = alignment->tAlignedSeq; - qAlignedSeq = alignment->qAlignedSeq; - - if (alignment->tStrand == 0) { - for (m = 0; m < matches->size(); m++) { - (*matches)[m].t -= alignment->tAlignedSeqPos; - (*matches)[m].q -= alignment->qAlignedSeqPos; - } - } - else { - // - // Flip the entire alignment if it is on the reverse strand. - DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); - for (m = 0; m < matches->size(); m++) { - (*matches)[m].t -= rcAlignedSeqPos; - (*matches)[m].q -= alignment->qAlignedSeqPos; - } - - alignment->tAlignedSeq.CopyAsRC(tAlignedSeq); - rcMatches.resize((*intvIt).matches.size()); - // - // Make the reverse complement of the match list. - // - - // 1. Reverse complement the coordinates. 
- for (m = 0; m < (*intvIt).matches.size(); m++) { - int revCompIndex = rcMatches.size() - m - 1; - rcMatches[revCompIndex].q = read.MakeRCCoordinate((*intvIt).matches[m].q + (*intvIt).matches[m].l - 1); - rcMatches[revCompIndex].t = tAlignedSeq.MakeRCCoordinate((*intvIt).matches[m].t + (*intvIt).matches[m].l - 1); - rcMatches[revCompIndex].l = (*intvIt).matches[m].l; - } - matches = &rcMatches; - } - - /* - Uncomment to get a dot plot - ofstream matchFile; - matchFile.open("matches.txt"); - matchFile << "q t l " << endl; - for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { - matchFile << (*matches)[m].q << " " << (*matches)[m].t << " " << (*matches)[m].l << endl; - } - */ - DNASequence tSubSeq; - FASTQSequence qSubSeq; - for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { - Block block; - block.qPos = (*matches)[m].q; - block.tPos = (*matches)[m].t; - block.length = (*matches)[m].l; - - // - // Find the lengths of the gaps between anchors. - // - int tGap, qGap; - tGap = (*matches)[m+1].t - ((*matches)[m].t + (*matches)[m].l); - qGap = (*matches)[m+1].q - ((*matches)[m].q + (*matches)[m].l); - float gapRatio = (1.0*tGap)/qGap; - - if (tGap > 0 and qGap > 0) { - DNALength tPos, qPos; - tPos = block.tPos + block.length; - qPos = block.qPos + block.length; - tSubSeq.ReferenceSubstring(tAlignedSeq, tPos, tGap); - qSubSeq.ReferenceSubstring(alignment->qAlignedSeq, qPos, qGap); - Alignment alignmentInGap; - int alignScore; - - /* - The following code is experimental code for trying to do - something like affine gap alignment in long gaps. It - would eventually be used in cDNA alignment to align - between exons, but for now is being tested here by using - it to align when there is a big gap between anchors. 
- */ - if (params.separateGaps == true and - qSubSeq.length > 0 and tSubSeq.length > 0 and - ( (1.0*qSubSeq.length)/tSubSeq.length < 0.25 )) { - alignScore = OneGapAlign(qSubSeq, tSubSeq, distScoreFn, mappingBuffers, alignmentInGap); - } - else { - /* - This is the 'normal/default' way to align between - gaps. It is more well tested than OneGapAlign. - */ - alignScore = SDPAlign(qSubSeq, tSubSeq, distScoreFn, params.sdpTupleSize, - params.sdpIns, params.sdpDel, params.indelRate*2, - alignmentInGap, mappingBuffers, Global, - params.detailedSDPAlignment, - params.extendFrontAlignment, - params.recurseOver, - params.fastSDP); - } - - // - // Now, splice the fragment alignment into the current - // alignment. - // - if (alignmentInGap.blocks.size() > 0) { - int b; - // - // Configure this block to be relative to the beginning - // of the aligned substring. - // - for (b = 0; b < alignmentInGap.size(); b++) { - alignmentInGap.blocks[b].tPos += tPos + alignmentInGap.tPos; - alignmentInGap.blocks[b].qPos += qPos + alignmentInGap.qPos; - assert(alignmentInGap.blocks[b].tPos < alignment->tAlignedSeq.length); - assert(alignmentInGap.blocks[b].qPos < alignment->qAlignedSeq.length); - } - } - // Add the original block - alignment->blocks.push_back(block); - anchorsOnly.blocks.push_back(block); - // Add the blocks for the refined alignment - alignment->blocks.insert(alignment->blocks.end(), - alignmentInGap.blocks.begin(), - alignmentInGap.blocks.end()); - } - } - - // Add the last block - m = (*matches).size() - 1; - Block block; - block.qPos = (*matches)[m].q; - block.tPos = (*matches)[m].t; - - assert(block.tPos <= alignment->tAlignedSeq.length); - assert(block.qPos <= alignment->qAlignedSeq.length); - - block.length = (*matches)[m].l; - alignment->blocks.push_back(block); - anchorsOnly.blocks.push_back(block); - - // - // By convention, blocks start at 0, and the - // alignment->tPos,qPos give the start of the alignment. 
- // Modify the block positions so that they are offset by 0. - alignment->tPos = alignment->blocks[0].tPos; - alignment->qPos = alignment->blocks[0].qPos; - int b; - int blocksSize = alignment->blocks.size(); - for (b = 0; b < blocksSize ; b++) { - assert(alignment->tPos <= alignment->blocks[b].tPos); - assert(alignment->qPos <= alignment->blocks[b].qPos); - alignment->blocks[b].tPos -= alignment->tPos; - alignment->blocks[b].qPos -= alignment->qPos; - } - for (b = 0; b < anchorsOnly.blocks.size(); b++) { - anchorsOnly.blocks[b].tPos -= alignment->tPos; - anchorsOnly.blocks[b].qPos -= alignment->qPos; - } - anchorsOnly.tPos = alignment->tPos; - anchorsOnly.qPos = alignment->qPos; - ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, - distScoreFn); - - tAlignedSeq.Free(); - qAlignedSeq.Free(); - tSubSeq.Free(); - qSubSeq.Free(); - } - else { - alignScore = SDPAlign(alignment->qAlignedSeq, alignment->tAlignedSeq, distScoreFn, - sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*3, - *alignment, mappingBuffers, - Local, - params.detailedSDPAlignment, - params.extendFrontAlignment, - params.recurseOver, - params.fastSDP); - ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, - distScoreFn); - } - } - else { - // - // The anchors used to anchor the sequence are sufficient to extend the alignment. - // - int m; - for (m = 0; m < (*intvIt).matches.size(); m++ ){ - Block block; - block.qPos = (*intvIt).matches[m].q - alignment->qAlignedSeqPos; - block.tPos = (*intvIt).matches[m].t - alignment->tAlignedSeqPos; - block.length = (*intvIt).matches[m].l; - alignment->blocks.push_back(block); - } - } - - // - // The anchors/sdp alignments may leave portions of the read - // unaligned at the beginning and end. If the parameters - // specify extending alignments, try and align extra bases at - // the beginning and end of alignments. 
- if (params.extendAlignments) { - - // - // Modify the alignment so that the start and end of the - // alignment strings are at the alignment boundaries. - // - // Since the query sequence is pointing at a subsequence of the - // read (and is always in the forward direction), just reference - // a new portion of the read. - alignment->qAlignedSeqPos = alignment->qAlignedSeqPos + alignment->qPos; - alignment->qAlignedSeqLength = alignment->QEnd(); - alignment->qAlignedSeq.ReferenceSubstring(read, alignment->qAlignedSeqPos, alignment->qAlignedSeqLength ); - alignment->qPos = 0; - - // - // Since the target sequence may be on the forward or reverse - // strand, a copy of the subsequence is made, and the original - // sequence free'd. - // - DNASequence tSubseq; - alignment->tAlignedSeqPos = alignment->tAlignedSeqPos + alignment->tPos; - alignment->tAlignedSeqLength = alignment->TEnd(); - tSubseq.Copy(alignment->tAlignedSeq, alignment->tPos, alignment->tAlignedSeqLength); - alignment->tPos = 0; - - alignment->tAlignedSeq.Free(); - alignment->tAlignedSeq.TakeOwnership(tSubseq); - - DNALength maximumExtendLength = 500; - - if (alignment->blocks.size() > 0 ) { - int lastAlignedBlock = alignment->blocks.size() - 1; - DNALength lastAlignedQPos = alignment->blocks[lastAlignedBlock].QEnd() + alignment->qPos + alignment->qAlignedSeqPos; - DNALength lastAlignedTPos = alignment->blocks[lastAlignedBlock].TEnd() + alignment->tPos + alignment->tAlignedSeqPos; - T_AlignmentCandidate extendedAlignmentForward, extendedAlignmentReverse; - int forwardScore, reverseScore; - - SMRTSequence readSuffix; - DNALength readSuffixLength; - DNASequence genomeSuffix; - DNALength genomeSuffixLength; - - SMRTSequence readPrefix; - DNALength readPrefixLength; - DNASequence genomePrefix; - DNALength genomePrefixLength; - - // - // Align the entire end of the read if it is short enough. 
- // - readSuffixLength = min(read.length - lastAlignedQPos, maximumExtendLength); - if (readSuffixLength > 0) { - readSuffix.ReferenceSubstring(read, lastAlignedQPos, readSuffixLength); - } - else { - readSuffix.length = 0; - } - - // - // Align The entire end of the genome up to the maximum extend length; - // - genomeSuffixLength = min(intervalContigEndPos - lastAlignedTPos, maximumExtendLength); - if (genomeSuffixLength > 0) { - if (alignment->tStrand == Forward) { - genomeSuffix.Copy(genome, lastAlignedTPos, genomeSuffixLength); - } - else { - static_cast(&genome)->CopyAsRC(genomeSuffix, lastAlignedTPos, genomeSuffixLength); - } - } - else { - genomeSuffix.length = 0; - } - forwardScore = 0; - if (readSuffix.length > 0 and genomeSuffix.length > 0) { - forwardScore = ExtendAlignmentForward(readSuffix, 0, - genomeSuffix, 0, - params.extendBandSize, - // Reuse buffers to speed up alignment - mappingBuffers.scoreMat, - mappingBuffers.pathMat, - // Do the alignment in the forward direction. - extendedAlignmentForward, - distScoreFn, - 1, // don't bother attempting - // to extend the alignment - // if one of the sequences - // is less than 1 base long - params.maxExtendDropoff); - } - - if ( forwardScore < 0 ) { - // - // The extended alignment considers the whole genome, but - // should be modified to be starting at the end of where - // the original alignment left off. 
- // - if (params.verbosity > 0) { - cout << "forward extended an alignment of score " << alignment->score << " with score " << forwardScore << " by " << extendedAlignmentForward.blocks.size() << " blocks and length " << extendedAlignmentForward.blocks[extendedAlignmentForward.blocks.size()-1].qPos << endl; - } - extendedAlignmentForward.tAlignedSeqPos = lastAlignedTPos; - - extendedAlignmentForward.qAlignedSeqPos = lastAlignedQPos; - - genomeSuffix.length = extendedAlignmentForward.tPos + extendedAlignmentForward.TEnd(); - alignment->tAlignedSeq.Append(genomeSuffix); - alignment->qAlignedSeq.length += extendedAlignmentForward.qPos + extendedAlignmentForward.QEnd(); - assert(alignment->qAlignedSeq.length <= read.length); - alignment->AppendAlignment(extendedAlignmentForward); - } - - DNALength firstAlignedQPos = alignment->qPos + alignment->qAlignedSeqPos; - DNALength firstAlignedTPos = alignment->tPos + alignment->tAlignedSeqPos; - - readPrefixLength = min(firstAlignedQPos, maximumExtendLength); - if (readPrefixLength > 0) { - readPrefix.ReferenceSubstring(read, firstAlignedQPos-readPrefixLength, readPrefixLength); - } - else { - readPrefix.length = 0; - } - - genomePrefixLength = min(firstAlignedTPos - intervalContigStartPos, maximumExtendLength); - if (genomePrefixLength > 0) { - if (alignment->tStrand == 0) { - genomePrefix.Copy(genome, firstAlignedTPos - genomePrefixLength, genomePrefixLength); - } - else { - static_cast(&genome)->MakeRC(genomePrefix, firstAlignedTPos - genomePrefixLength, genomePrefixLength); - } - } - reverseScore = 0; - if (readPrefix.length > 0 and genomePrefix.length > 0) { - reverseScore = ExtendAlignmentReverse(readPrefix, readPrefix.length-1, - genomePrefix, genomePrefixLength - 1, - params.extendBandSize, //k - mappingBuffers.scoreMat, - mappingBuffers.pathMat, - extendedAlignmentReverse, - distScoreFn, - 1, // don't bother attempting - // to extend the alignment - // if one of the sequences - // is less than 1 base long - 
params.maxExtendDropoff); - } - - if (reverseScore < 0 ) { - // - // Make alignment->tPos relative to the beginning of the - // extended alignment so that when it is appended, the - // coordinates match correctly. - if (params.verbosity > 0) { - cout << "reverse extended an alignment of score " << alignment->score << " with score " << reverseScore << " by " << extendedAlignmentReverse.blocks.size() << " blocks and length " << extendedAlignmentReverse.blocks[extendedAlignmentReverse.blocks.size()-1].qPos << endl; - } - extendedAlignmentReverse.tAlignedSeqPos = firstAlignedTPos - genomePrefixLength; - extendedAlignmentReverse.qAlignedSeqPos = firstAlignedQPos - readPrefixLength; - extendedAlignmentReverse.AppendAlignment(*alignment); - - genomePrefix.Append(alignment->tAlignedSeq, genomePrefix.length - alignment->tPos); - alignment->tAlignedSeq.Free(); - alignment->tAlignedSeq.TakeOwnership(genomePrefix); - - alignment->blocks = extendedAlignmentReverse.blocks; - - alignment->tAlignedSeqPos = extendedAlignmentReverse.tAlignedSeqPos; - alignment->tPos = extendedAlignmentReverse.tPos; - - - alignment->qAlignedSeqPos = extendedAlignmentReverse.qAlignedSeqPos; - alignment->qAlignedSeq.length = readPrefix.length + alignment->qAlignedSeq.length; - alignment->qPos = extendedAlignmentReverse.qPos; - alignment->qAlignedSeq.seq = readPrefix.seq; - // - // Make sure the two ways of accounting for aligned sequence - // length are in sync. This needs to go. 
- // - if (alignment->blocks.size() > 0) { - int lastBlock = alignment->blocks.size() - 1; - alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; - alignment->tAlignedSeqLength = alignment->tAlignedSeq.length; - } - else { - alignment->qAlignedSeqLength = alignment->qAlignedSeq.length = 0; - alignment->tAlignedSeqLength = alignment->tAlignedSeq.length = 0; - } - } // end of if (reverseScore < 0 ) - readSuffix.Free(); - readPrefix.Free(); - genomePrefix.Free(); - genomeSuffix.Free(); - } - tSubseq.Free(); - } - - if (params.verbosity > 0) { - cout << "interval align score: " << alignScore << endl; - StickPrintAlignment(*alignment, - (DNASequence&) alignment->qAlignedSeq, - (DNASequence&) alignment->tAlignedSeq, - cout, - 0, alignment->tAlignedSeqPos); - - } - ComputeAlignmentStats(*alignment, - alignment->qAlignedSeq.seq, - alignment->tAlignedSeq.seq, - distScoreFn2); - //SMRTDistanceMatrix, ins, del ); - - - intvIt++; - } while (intvIt != weightedIntervals.end()); -} - - -template -void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, - int k, - MappingParameters ¶ms, T_AlignmentCandidate &alignment, - MappingBuffers &mappingBuffers, - AlignmentType alignType) { - // - // Perform a pairwise alignment between qSeq and tSeq, but choose - // the pairwise alignment method based on the parameters. The - // options for pairwise alignment are: - // - Affine KBanded alignment: usually used for sequences with no - // quality information. - // - KBanded alignment: For sequences with quality information. - // Gaps are scored with quality values. 
- // - QualityValueScoreFunction scoreFn; - scoreFn.del = params.indel; - scoreFn.ins = params.indel; - - DistanceMatrixScoreFunction distScoreFn2( - SMRTDistanceMatrix, params.indel, params.indel); - - IDSScoreFunction idsScoreFn; - idsScoreFn.ins = params.insertion; - idsScoreFn.del = params.deletion; - idsScoreFn.substitutionPrior = params.substitutionPrior; - idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; - idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); - - int kbandScore; - int qvAwareScore; - if (params.ignoreQualities || qSeq.qual.Empty() || !ReadHasMeaningfulQualityValues(qSeq) ) { - - kbandScore = AffineKBandAlign(qSeq, tSeq, SMRTDistanceMatrix, - params.indel+2, params.indel - 3, // homopolymer insertion open and extend - params.indel+2, params.indel - 1, // any insertion open and extend - params.indel, // deletion - k*1.2, - mappingBuffers.scoreMat, mappingBuffers.pathMat, - mappingBuffers.hpInsScoreMat, mappingBuffers.hpInsPathMat, - mappingBuffers.insScoreMat, mappingBuffers.insPathMat, - alignment, Global); - - alignment.score = kbandScore; - if (params.verbosity >= 2) { - cout << "align score: " << kbandScore << endl; - } - } - else { - - - if (qSeq.insertionQV.Empty() == false) { - qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, - params.indel+2, // ins - params.indel+2, // del - k, - mappingBuffers.scoreMat, mappingBuffers.pathMat, - alignment, idsScoreFn, alignType); - if (params.verbosity >= 2) { - cout << "ids score fn score: " << qvAwareScore << endl; - } - } - else { - qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, - params.indel+2, // ins - params.indel+2, // del - k, - mappingBuffers.scoreMat, mappingBuffers.pathMat, - alignment, scoreFn, alignType); - if (params.verbosity >= 2) { - cout << "qv score fn score: " << qvAwareScore << endl; - } - } - alignment.sumQVScore = qvAwareScore; - alignment.score = qvAwareScore; - alignment.probScore = 0; - } - // Compute stats and assign a default alignment 
score using an edit distance. - ComputeAlignmentStats(alignment, qSeq.seq, tSeq.seq, distScoreFn2); - - if (params.scoreType == 1) { - alignment.score = alignment.sumQVScore; - } -} - -// Extend target aligned sequence of the input alignement to both ends -// by flankSize bases. Update alignment->tAlignedSeqPos, -// alignment->tAlignedSeqLength and alignment->tAlignedSeq. -void FlankTAlignedSeq(T_AlignmentCandidate * alignment, - SequenceIndexDatabase &seqdb, - DNASequence & genome, - int flankSize) { - assert(alignment != NULL and alignment->tIsSubstring); - - UInt forwardTPos, newTAlignedSeqPos, newTAlignedSeqLen; - // New aligned start position relative to this chromosome, with - // the same direction as alignment->tStrand. - newTAlignedSeqPos = UInt((alignment->tAlignedSeqPos > UInt(flankSize))? - (alignment->tAlignedSeqPos - flankSize): 0); - newTAlignedSeqLen = min(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength + - flankSize, alignment->tLength) - newTAlignedSeqPos; - - if (alignment->tStrand ==0) { - forwardTPos = newTAlignedSeqPos; - } else { - forwardTPos = alignment->tLength - newTAlignedSeqPos - 1; - } - - // Find where this chromosome is in the genome. - int seqIndex = seqdb.GetIndexOfSeqName(alignment->tName); - assert(seqIndex != -1); - UInt newGenomePos = seqdb.ChromosomePositionToGenome(seqIndex, forwardTPos); - - if (alignment->tIsSubstring == false) { - alignment->tAlignedSeq.Free(); - } - alignment->tAlignedSeqPos = newTAlignedSeqPos; - alignment->tAlignedSeqLength = newTAlignedSeqLen; - if (alignment->tStrand == 0) { - alignment->tAlignedSeq.ReferenceSubstring(genome, newGenomePos, newTAlignedSeqLen); - } else { - // Copy and then reverse complement. - genome.MakeRC(alignment->tAlignedSeq, - newGenomePos + 1 - alignment->tAlignedSeqLength, - alignment->tAlignedSeqLength); - alignment->tIsSubstring = false; - } -} - -// Align a subread of a SMRT sequence to target sequence of an alignment. 
-// Input: -// subread - a subread of a SMRT sequence. -// unrolledRead - the full SMRT sequence. -// alignment - an alignment. -// passDirection - whether or not the subread has the -// same direction as query of the alignment. -// 0 = true, 1 = false. -// subreadInterval - [start, end) interval of the subread in the -// SMRT read. -// subreadIndex - index of the subread in allReadAlignments. -// params - mapping paramters. -// Output: -// allReadAlignments - where the sequence and alignments of the -// subread are saved. -// threadOut - an out stream for debugging the current thread. -void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, - SMRTSequence & subread, SMRTSequence & unrolledRead, - T_AlignmentCandidate * alignment, - int passDirection, ReadInterval & subreadInterval, - int subreadIndex, - MappingParameters & params, - MappingBuffers & mappingBuffers, - ostream & threadOut) { - assert(passDirection == 0 or passDirection == 1); - // - // Determine where in the genome the subread has mapped. - // - DNASequence alignedForwardRefSequence, alignedReverseRefSequence; - - if (alignment->tStrand == 0) { - // This needs to be changed -- copy copies RHS into LHS, - // CopyAsRC copies LHS into RHS - alignedForwardRefSequence.Copy(alignment->tAlignedSeq); - alignment->tAlignedSeq.CopyAsRC(alignedReverseRefSequence); - } - else { - alignment->tAlignedSeq.CopyAsRC(alignedForwardRefSequence); - alignedReverseRefSequence.Copy(alignment->tAlignedSeq); - } - - IDSScoreFunction idsScoreFn; - idsScoreFn.ins = params.insertion; - idsScoreFn.del = params.deletion; - idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); - idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; - idsScoreFn.substitutionPrior = params.substitutionPrior; - - DistanceMatrixScoreFunction distScoreFn2( - SMRTDistanceMatrix, params.indel, params.indel); - // - // Determine the strand to align the subread to. 
- // - T_AlignmentCandidate exploded; - bool sameAlignmentPassDirection = (alignment->tStrand == passDirection); - bool computeProbIsFalse = false; - DNASequence & alignedRefSequence = (sameAlignmentPassDirection? - alignedForwardRefSequence:alignedReverseRefSequence); - // - // In the original code, parameters: bandSize=10, alignType=Global, - // sdpTupleSize=4 (instead of 12, Local and 6) were used when - // alignment & pass have different directions. - // - int explodedScore = GuidedAlign(subread, alignedRefSequence, - idsScoreFn, 12, params.sdpIns, params.sdpDel, - params.indelRate, mappingBuffers, exploded, - Local, computeProbIsFalse, 6); - - if (params.verbosity >= 3) { - threadOut << "zmw " << unrolledRead.zmwData.holeNumber - << ", subreadIndex " << subreadIndex - << ", passDirection " << passDirection - << ", subreadInterval [" << subreadInterval.start - << ", " << subreadInterval.end << ")" << endl - << "StickPrintAlignment subread-reference alignment which has" - << " the " << (sameAlignmentPassDirection?"same":"different") - << " direction as the ccs-reference (or the " - << "longestSubread-reference) alignment. " << endl - << "subread: " << endl; - static_cast(&subread)->PrintSeq(threadOut); - threadOut << endl; - threadOut << "alignedRefSeq: " << endl; - static_cast(&alignedRefSequence)->PrintSeq(threadOut); - StickPrintAlignment(exploded, (DNASequence&) subread, - (DNASequence&) alignedRefSequence, - threadOut, exploded.qAlignedSeqPos, - exploded.tAlignedSeqPos); - } - - if (exploded.blocks.size() > 0) { - DistanceMatrixScoreFunction distScoreFn( - SMRTDistanceMatrix, params.indel, params.indel); - ComputeAlignmentStats(exploded, subread.seq, - alignedRefSequence.seq, - distScoreFn2); - //SMRTDistanceMatrix, params.indel, params.indel); - if (exploded.score <= params.maxScore) { - // - // The coordinates of the alignment should be - // relative to the reference sequence (the specified chromosome, - // not the whole genome). 
- // - exploded.qStrand = 0; - exploded.tStrand = sameAlignmentPassDirection?0:1; - exploded.qLength = unrolledRead.length; - exploded.tLength = alignment->tLength; - exploded.tAlignedSeq.Copy(alignedRefSequence); - exploded.tAlignedSeqPos = (passDirection == 0)? - (alignment->tAlignedSeqPos): - (exploded.tLength - alignment->tAlignedSeqPos - - alignment->tAlignedSeqLength); - exploded.tAlignedSeqLength = alignment->tAlignedSeqLength; - - exploded.qAlignedSeq.ReferenceSubstring(subread); - exploded.qAlignedSeqPos = subreadInterval.start; - exploded.qAlignedSeqLength = subreadInterval.end - subreadInterval.start; - exploded.mapQV = alignment->mapQV; - exploded.tName = alignment->tName; - exploded.tIndex = alignment->tIndex; - - stringstream namestrm; - namestrm << "/" << subreadInterval.start - << "_" << subreadInterval.end; - exploded.qName = string(unrolledRead.title) + namestrm.str(); - - // - // Don't call AssignRefContigLocation as the coordinates - // of the alignment is already relative to the chromosome coordiantes. - // - // Save this alignment for printing later. - // - T_AlignmentCandidate *alignmentPtr = new T_AlignmentCandidate; - *alignmentPtr = exploded; - allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); - } // End of exploded score <= maxScore. - if (params.verbosity >= 3) { - threadOut << "exploded score: " << exploded.score << endl - << "exploded alignment: "<< endl; - exploded.Print(threadOut); - threadOut << endl; - } - } // End of exploded.blocks.size() > 0. -} - -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrHeaders.h blasr-5.3/include/BlasrHeaders.h --- blasr-0~20151014+git8e668be/include/BlasrHeaders.h 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrHeaders.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,149 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. 
- -#ifndef _BLASR_HEADERS_H_ -#define _BLASR_HEADERS_H_ - -#ifdef __linux__ -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_PHRED_SCORE 254 -#define MAPQV_END_ALIGN_WIGGLE 5 - -using namespace std; - -#include "libconfig.h" -#ifdef USE_PBBAM -#include -#endif - -#include "CCSSequence.hpp" -#include "SMRTSequence.hpp" -#include "FASTASequence.hpp" -#include "FASTAReader.hpp" -#include "SeqUtils.hpp" -#include "defs.h" -#include "utils.hpp" - - -#include "tuples/DNATuple.hpp" -#include "tuples/HashedTupleList.hpp" -#include "algorithms/compare/CompareStrings.hpp" -#include "algorithms/alignment/AffineKBandAlign.hpp" -#include "algorithms/alignment/GuidedAlign.hpp" -#include "algorithms/alignment/AffineGuidedAlign.hpp" -#include "algorithms/alignment/FullQVAlign.hpp" -#include "algorithms/alignment/ExtendAlign.hpp" -#include "algorithms/alignment/OneGapAlignment.hpp" -#include "algorithms/alignment/AlignmentUtils.hpp" -#include "algorithms/alignment/QualityValueScoreFunction.hpp" -#include "algorithms/alignment/IDSScoreFunction.hpp" -#include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" -#include "algorithms/alignment/StringToScoreMatrix.hpp" -#include "algorithms/alignment/AlignmentFormats.hpp" -#include "algorithms/anchoring/LISPValue.hpp" -#include "algorithms/anchoring/LISPValueWeightor.hpp" -#include "algorithms/anchoring/LISSizeWeightor.hpp" -#include "algorithms/anchoring/LISQValueWeightor.hpp" -#include "algorithms/anchoring/FindMaxInterval.hpp" -#include "algorithms/anchoring/MapBySuffixArray.hpp" -#include "datastructures/anchoring/ClusterList.hpp" -#include "algorithms/anchoring/ClusterProbability.hpp" -#include "algorithms/anchoring/BWTSearch.hpp" -#include "metagenome/SequenceIndexDatabase.hpp" -#include "metagenome/TitleTable.hpp" -#include "suffixarray/SharedSuffixArray.hpp" -#include "suffixarray/SuffixArrayTypes.hpp" -#include "tuples/TupleCountTable.hpp" 
-#include "datastructures/anchoring/WeightedInterval.hpp" -#include "datastructures/anchoring/AnchorParameters.hpp" -#include "datastructures/alignment/AlignmentCandidate.hpp" -#include "datastructures/alignment/AlignmentContext.hpp" -#include "MappingMetrics.hpp" -#include "reads/ReadInterval.hpp" -#include "utils/FileOfFileNames.hpp" -#include "utils/RegionUtils.hpp" -#include "utils/TimeUtils.hpp" -#include "utils/SMRTTitle.hpp" -#include "qvs/QualityTransform.hpp" -#include "files/ReaderAgglomerate.hpp" -#include "files/CCSIterator.hpp" -#include "files/FragmentCCSIterator.hpp" -#include "HDFRegionTableReader.hpp" -#include "bwt/BWT.hpp" -#include "PackedDNASequence.hpp" -#include "CommandLineParser.hpp" -#include "qvs/QualityValue.hpp" -#include "statistics/VarianceAccumulator.hpp" -#include "statistics/pdfs.hpp" -#include "statistics/cdfs.hpp" -#include "statistics/StatUtils.hpp" -#include "statistics/LookupAnchorDistribution.hpp" -#include "format/StickAlignmentPrinter.hpp" -#include "format/SAMPrinter.hpp" -#include "format/XMLPrinter.hpp" -#include "format/CompareSequencesPrinter.hpp" -#include "format/VulgarPrinter.hpp" -#include "format/IntervalPrinter.hpp" -#include "format/SummaryPrinter.hpp" -#include "format/SAMHeaderPrinter.hpp" -#include "format/BAMPrinter.hpp" - -#include "MappingIPC.h" -#include "MappingSemaphores.h" -#include "MappingBuffers.hpp" -#include "ReadAlignments.hpp" - - -typedef SMRTSequence T_Sequence; -typedef FASTASequence T_GenomeSequence; -typedef DNASuffixArray T_SuffixArray; -typedef DNATuple T_Tuple; -typedef LISPValueWeightor > PValueWeightor; -typedef LISSMatchFrequencyPValueWeightor > MultiplicityPValueWeightor; -typedef MappingData MappingIPC; - -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrMiscs.hpp blasr-5.3/include/BlasrMiscs.hpp --- blasr-0~20151014+git8e668be/include/BlasrMiscs.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrMiscs.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,100 +0,0 
@@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. 
- -// Author: Mark Chaisson -#ifndef _BLASR_MISCS_HPP_ -#define _BLASR_MISCS_HPP_ - -#include "BlasrHeaders.h" - -//-------------------------Fetch Reads----------------------------// -template -bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, - MappingParameters ¶ms, - T_Sequence &read, - string & readGroupId, - int & associatedRandInt, - MappingSemaphores & semaphores); - -//---------------------MAKE & CHECK READS-------------------------// -//FIXME: move to SMRTSequence -bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence); - -//FIXME: Move to SMRTSequence -// Given a SMRT sequence and a subread interval, make the subread. -// Input: -// smrtRead - a SMRT sequence -// subreadInterval - a subread interval -// params - mapping parameters -// Output: -// subreadSequence - the constructed subread -void MakeSubreadOfInterval(SMRTSequence & subreadSequence, - SMRTSequence & smrtRead, - ReadInterval & subreadInterval, - MappingParameters & params); - -//FIXME: Move to SMRTSequence -// Given a SMRT sequence and one of its subreads, make the -// reverse complement of the subread in the coordinate of the -// reverse complement sequence of the SMRT sequence. -// Input: -// smrtRead - a SMRT read -// subreadSequence - a subread of smrtRead -// Output: -// subreadSequenceRC - the reverse complement of the subread -// in the coordinate of the reverse -// complement of the SMRT read. -void MakeSubreadRC(SMRTSequence & subreadSequenceRC, - SMRTSequence & subreadSequence, - SMRTSequence & smrtRead); - -// Make a virtual SMRTSequence (polymerase reads) given all subreads. -// NO QVs will be copied at this point. 
-void MakeVirtualRead(SMRTSequence & smrtRead, - const vector & subreads); - -// Construct subreads invervals from subreads -void MakeSubreadIntervals(vector & subreads, - vector & subreadIntervals); - -// Get index of median length interval -int GetIndexOfMedian(const vector & subreadIntervals); - -//-------------------------MISC-----------------------------------// -int CountZero(unsigned char *ptr, int length); - -#include "BlasrMiscsImpl.hpp" - -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrMiscsImpl.hpp blasr-5.3/include/BlasrMiscsImpl.hpp --- blasr-0~20151014+git8e668be/include/BlasrMiscsImpl.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrMiscsImpl.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,223 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - -// Author: Mark Chaisson - -#ifndef _BLASR_MISCS_IMPL_HPP_ -#define _BLASR_MISCS_IMPL_HPP_ -#include "utils/SMRTTitle.hpp" - -template -bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, - MappingParameters ¶ms, - T_Sequence &read, - string & readGroupId, - int & associatedRandInt, - MappingSemaphores & semaphores) -{ - // Wait on a semaphore - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_wait(semaphores.reader); -#else - sem_wait(&semaphores.reader); -#endif - } - - bool returnValue = true; - // - // CCS Reads are read differently from other reads. Do static casting here - // of this. - // - if (reader.GetNext(read, associatedRandInt) == 0) { - returnValue = false; - } - - // - // Set the read group id before releasing the semaphore, since other - // threads may change the reader object to a new read group before - // sending this alignment out to printing. 
- readGroupId = reader.readGroupId; - - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_post(semaphores.reader); -#else - sem_post(&semaphores.reader); -#endif - } - return returnValue; -} - - - -bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence) -{ - if (sequence.qual.Empty() == true) { - return 0; - } - else { - int numZero=0, numNonZero=0; - if (sequence.qual.data == NULL) { - return false; - } - numZero = CountZero(sequence.qual.data, sequence.length); - numNonZero = sequence.length - numZero; - int subNumZero = 0, subNonZero = 0; - - if (sequence.substitutionQV.data == NULL) { - return false; - } - subNumZero = CountZero(sequence.substitutionQV.data, sequence.length); - subNonZero = sequence.length - subNumZero; - - if (numZero < 0.5*numNonZero and subNumZero < 0.5 * subNonZero) { - return true; - } - else { - return false; - } - } -} - -// Given a SMRT sequence and a subread interval, make the subread. -// Input: -// smrtRead - a SMRT sequence -// subreadInterval - a subread interval -// params - mapping parameters -// Output: -// subreadSequence - the constructed subread -void MakeSubreadOfInterval(SMRTSequence & subreadSequence, - SMRTSequence & smrtRead, - ReadInterval & subreadInterval, - MappingParameters & params) -{ - int start = subreadInterval.start; - int end = subreadInterval.end; - - assert(smrtRead.length >= subreadSequence.length); - smrtRead.MakeSubreadAsMasked(subreadSequence, start, end); - - if (!params.preserveReadTitle) { - smrtRead.SetSubreadTitle(subreadSequence, - subreadSequence.SubreadStart(), - subreadSequence.SubreadEnd()); - } - else { - subreadSequence.CopyTitle(smrtRead.title); - } - subreadSequence.zmwData = smrtRead.zmwData; -} - -// Given a SMRT sequence and one of its subreads, make the -// reverse complement of the subread in the coordinate of the -// reverse complement sequence of the SMRT sequence. 
-// Input: -// smrtRead - a SMRT read -// subreadSequence - a subread of smrtRead -// Output: -// subreadSequenceRC - the reverse complement of the subread -// in the coordinate of the reverse -// complement of the SMRT read. -void MakeSubreadRC(SMRTSequence & subreadSequenceRC, - SMRTSequence & subreadSequence, - SMRTSequence & smrtRead) -{ - assert(smrtRead.length >= subreadSequence.length); - // Reverse complement sequence of the subread. - subreadSequence.MakeRC(subreadSequenceRC); - // Update start and end positions of subreadSequenceRC in the - // coordinate of reverse compelement sequence of the SMRT read. - subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd()); - subreadSequenceRC.SubreadEnd (smrtRead.length - subreadSequence.SubreadStart()); - subreadSequenceRC.zmwData = smrtRead.zmwData; -} - -int CountZero(unsigned char *ptr, int length) -{ - int i; - int nZero = 0; - for (i = 0; i < length; i++) { - if (ptr[i] == 0) { ++nZero; } - } - return nZero; -} - -void MakeVirtualRead(SMRTSequence & smrtRead, - const vector & subreads) -{ - assert(subreads.size() > 0); - DNALength hqStart = 0, hqEnd = 0; - for(auto subread: subreads) { - hqStart = min(DNALength(subread.SubreadStart()), hqStart); - hqEnd = max(DNALength(subread.SubreadEnd()), hqEnd); - } - smrtRead.Free(); - smrtRead.Allocate(hqEnd); - smrtRead.lowQualityPrefix = hqStart; - smrtRead.lowQualitySuffix = smrtRead.length - hqEnd; - smrtRead.highQualityRegionScore = subreads[0].highQualityRegionScore; - stringstream ss; - ss << SMRTTitle(subreads[0].GetTitle()).MovieName() << "/" << subreads[0].HoleNumber(); - smrtRead.CopyTitle(ss.str()); - for (auto subread: subreads) { - memcpy(&smrtRead.seq[subread.SubreadStart()], - &subread.seq[0], sizeof(char) * subread.length); - } -} - -void MakeSubreadIntervals(vector & subreads, - vector & subreadIntervals) -{ - subreadIntervals.clear(); - for (auto subread: subreads) { - 
subreadIntervals.push_back(ReadInterval(subread.SubreadStart(), - subread.SubreadEnd(), subread.highQualityRegionScore)); - } -} - -int GetIndexOfMedian(const vector & subreadIntervals) -{ - vector intervals = subreadIntervals; - size_t n = intervals.size() / 2; - nth_element(intervals.begin(), intervals.begin() + n, intervals.end(), - [](const ReadInterval & a, const ReadInterval & b) -> bool - {a.end - a.start < b.end - b.start;}); - auto it = std::find(subreadIntervals.begin(), subreadIntervals.end(), intervals[n]); - int pos = int(std::distance(subreadIntervals.begin(), it)); - return pos; -} - -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrUtils.hpp blasr-5.3/include/BlasrUtils.hpp --- blasr-0~20151014+git8e668be/include/BlasrUtils.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrUtils.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,199 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - -// Author: Mark Chaisson - - -#ifndef _BLASR_INC_UTILS_HPP_ -#define _BLASR_INC_UTILS_HPP_ - -#include "BlasrHeaders.h" - -//----------------------MODIFY ALIGNMENTS--------------------------// -//FIXME: refactor class SequenceIndexDatabase -void AssignRefContigLocation(T_AlignmentCandidate &alignment, - SequenceIndexDatabase &seqdb, - DNASequence &genome); - -//FIXME: refactor class SequenceIndexDatabase -void AssignRefContigLocations(vector &alignmentPtrs, - SequenceIndexDatabase &seqdb, - DNASequence &genome); - -template -//FIXME: refactor class SequenceIndexDatabase -void AssignGenericRefContigName(vector &alignmentPtrs, - T_RefSequence &genome); - -//FIXME: move to class ReadAlignments -void StoreRankingStats(vector &alignments, - VarianceAccumulator &accumPValue, - VarianceAccumulator &accumWeight); - -//FIXME: mapQV should be assigned when alignments are created. 
-void AssignMapQV(vector &alignmentPtrs); - -//FIXME: move to class ReadAlignments -void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, - MappingParameters ¶ms); - -void StoreMapQVs(SMRTSequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms); - - -//--------------------SEARCH & CHECK ALIGNMENTS-------------------// -//FIXME: move to class ReadAlignments -template -bool CheckForSufficientMatch(T_Sequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms); - -//FIXME: move to class ReadAlignments -int FindMaxLengthAlignment(vector alignmentPtrs, - int &maxLengthIndex); - -//FIXME: move to class T_AlignmentCandidate -void SumMismatches(SMRTSequence &read, - T_AlignmentCandidate &alignment, - int mismatchScore, - int fullIntvStart, int fullIntvEnd, - int &sum); - -//FIXME: move to class T_AlignmentCandidate -/// \returns whether two alignments overlap by more than minPcercentOverlap% -bool AlignmentsOverlap(T_AlignmentCandidate &alnA, - T_AlignmentCandidate &alnB, - float minPercentOverlap); - -/// \Partition overlapping alignments. -void PartitionOverlappingAlignments(vector &alignmentPtrs, - vector > &partitions, - float minOverlap); - - -//--------------------FILTER ALIGNMENTS---------------------------// -//FIXME: move to class T_AlignmentCandidate and ReadAlignments -int RemoveLowQualitySDPAlignments(int readLength, - vector &alignmentPtrs, - MappingParameters ¶ms); - -//FIXME: move to class ReadAlignments -template -int RemoveLowQualityAlignments(T_Sequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms); - -//FIXME: move to class ReadAlignments -int RemoveOverlappingAlignments(vector &alignmentPtrs, - MappingParameters ¶ms); - -// FIXME: move to class ReadAlignments -// Delete all alignments from index startIndex in vector, inclusive. 
-void DeleteAlignments(vector &alignmentPtrs, - int startIndex=0); - -//--------------------REFINE ALIGNMENTS---------------------------// -template -void RefineAlignment(vector &bothQueryStrands, - T_RefSequence &genome, - T_AlignmentCandidate &alignmentCandidate, - MappingParameters ¶ms, - MappingBuffers &mappingBuffers); - - -template -void RefineAlignments(vector &bothQueryStrands, - T_RefSequence &genome, - vector &alignmentPtrs, - MappingParameters ¶ms, - MappingBuffers &mappingBuffers); - - -//--------------------PRINT ALIGNMENTS---------------------------// -vector -SelectAlignmentsToPrint(vector alignmentPtrs, - MappingParameters & params, - const int & associatedRandInt); - -// -// The full read is not the subread, and does not have masked off characters. -// -void PrintAlignment(T_AlignmentCandidate &alignment, - SMRTSequence &fullRead, - MappingParameters ¶ms, - AlignmentContext &alignmentContext, - ostream &outFile -#ifdef USE_PBBAM - , PacBio::BAM::BamWriter * bamWriterPtr -#endif - ); - -// Print all alignments in vector alignmentPtrs -void PrintAlignments(vector alignmentPtrs, - SMRTSequence &read, - MappingParameters ¶ms, ostream &outFile, - AlignmentContext alignmentContext, -#ifdef USE_PBBAM - PacBio::BAM::BamWriter * bamWriterPtr, -#endif - MappingSemaphores & semaphores); - -void PrintAlignmentPtrs(vector & alignmentPtrs, - ostream & out = cout); - -// Print all alignments for subreads in allReadAlignments. -// Input: -// allReadAlignments - contains a set of subreads, each of which -// is associated with a group of alignments. -// alignmentContext - an alignment context of each subread used -// for printing in SAM format. -// params - mapping parameters. -// Output: -// outFilePtr - where to print alignments for subreads. -// unalignedFilePtr - where to print sequences for unaligned subreads. 
-void PrintAllReadAlignments(ReadAlignments & allReadAlignments, - AlignmentContext & alignmentContext, - ostream & outFilePtr, - ostream & unalignedFilePtr, - MappingParameters & params, - vector & subreads, -#ifdef USE_PBBAM - PacBio::BAM::BamWriter * bamWriterPtr, -#endif - MappingSemaphores & semaphores); - -#include "BlasrUtilsImpl.hpp" -#endif diff -Nru blasr-0~20151014+git8e668be/include/BlasrUtilsImpl.hpp blasr-5.3/include/BlasrUtilsImpl.hpp --- blasr-0~20151014+git8e668be/include/BlasrUtilsImpl.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/BlasrUtilsImpl.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,1225 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - -// Author: Mark Chaisson -#ifndef _BLASR_INC_UTILS_IMPL_HPP_ -#define _BLASR_INC_UTILS_IMPL_HPP_ -#include "BlasrAlign.hpp" - - -//----------------------MODIFY ALIGNMENTS--------------------------// -void AssignRefContigLocation(T_AlignmentCandidate &alignment, - SequenceIndexDatabase &seqdb, - DNASequence &genome) -{ - // - // If the sequence database is used, the start position of - // the alignment is relative to the start of the chromosome, - // not the entire index. Subtract off the start position of - // the chromosome to get the true position. - // - DNALength forwardTPos; - int seqDBIndex; - if (alignment.tStrand == 0) { - forwardTPos = alignment.tAlignedSeqPos; - seqDBIndex = seqdb.SearchForIndex(forwardTPos); - alignment.tAlignedSeqPos -= seqdb.seqStartPos[seqDBIndex]; - } - else { - // - // Flip coordinates into forward strand in order to find the boundaries - // of the contig, then reverse them in order to find offset. - // - - // Find the reverse complement coordinate of the index of the last aligned base. - assert(alignment.tAlignedSeqLength > 0); - forwardTPos = genome.MakeRCCoordinate(alignment.tAlignedSeqPos + alignment.tAlignedSeqLength - 1); - seqDBIndex = seqdb.SearchForIndex(forwardTPos); - - - // - // Find the reverse comlement coordinate of the last base of this - // sequence. 
This would normally be the start of the next contig - // -1 to get the length, but since an 'N' is added between every - // pair of sequences, this is -2. - // - DNALength reverseTOffset; - reverseTOffset = genome.MakeRCCoordinate(seqdb.seqStartPos[seqDBIndex+1]-2); - alignment.tAlignedSeqPos -= reverseTOffset; - } -} - -void AssignRefContigLocations(vector &alignmentPtrs, - SequenceIndexDatabase &seqdb, - DNASequence &genome) -{ - - UInt i; - for (i = 0; i < alignmentPtrs.size(); i++) { - T_AlignmentCandidate *aref = alignmentPtrs[i]; - AssignRefContigLocation(*aref, seqdb, genome); - } -} - -template -void AssignGenericRefContigName(vector &alignmentPtrs, - T_RefSequence &genome) { - UInt i; - for (i = 0; i < alignmentPtrs.size(); i++) { - T_AlignmentCandidate *aref = alignmentPtrs[i]; - aref->tName = genome.title; - } -} - - -void StoreRankingStats(vector &alignments, - VarianceAccumulator &accumPValue, - VarianceAccumulator &accumWeight) { - int i; - for (i = 0; i < int(alignments.size()); i++) { - alignments[i]->pvalVariance = accumPValue.GetVariance(); - alignments[i]->pvalNStdDev = accumPValue.GetNStdDev(alignments[i]->clusterScore); - alignments[i]->weightVariance = accumWeight.GetVariance(); - alignments[i]->weightNStdDev = accumWeight.GetNStdDev(alignments[i]->clusterWeight); - } -} - -void AssignMapQV(vector &alignmentPtrs) { - int i; - int mapQV = 1; - if (alignmentPtrs.size() > 1 and alignmentPtrs[0]->score == alignmentPtrs[1]->score) { - // the top two alignments have the same score, don't consider them as mapped. 
- mapQV = 0; - } - - for (i = 0; i < int(alignmentPtrs.size()); i++) { - alignmentPtrs[i]->mapQV = mapQV; - } -} - -void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, - MappingParameters ¶ms) -{ - if (alignment.numSignificantClusters > int(params.nCandidates)) { - alignment.mapQV = Phred((1-InversePhred(alignment.mapQV))* ((float)params.nCandidates / alignment.numSignificantClusters)); - } - else if (alignment.numSignificantClusters == 0) { - alignment.mapQV = 0; - } -} - -void StoreMapQVs(SMRTSequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms) -{ - // - // Only weight alignments for mapqv against eachother if they are overlapping. - // - int a; - vector > partitions; // Each set contains alignments that overlap on the read. - DistanceMatrixScoreFunction distScoreFn; - distScoreFn.del = params.deletion; - distScoreFn.ins = params.insertion; - // bug 24363, set affineOpen and affineExtend for distScoreFn - distScoreFn.affineOpen = params.affineOpen; - distScoreFn.affineExtend = params.affineExtend; - distScoreFn.InitializeScoreMatrix(SMRTLogProbMatrix); - IDSScoreFunction idsScoreFn; - idsScoreFn.ins = params.insertion; - idsScoreFn.del = params.deletion; - idsScoreFn.affineExtend = params.affineExtend; - idsScoreFn.affineOpen = params.affineOpen; - idsScoreFn.substitutionPrior = params.substitutionPrior; - idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; - - // - // Rescore the alignment so that it uses probabilities. - // - for (a = 0; a < int(alignmentPtrs.size()); a++) { - if (params.ignoreQualities == false) { - // bug 24363, pass -affineAlign to compute correct alignment score. 
- alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], - alignmentPtrs[a]->qAlignedSeq, - alignmentPtrs[a]->tAlignedSeq, - idsScoreFn, - params.affineAlign) / 10.0; - } - else { - alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], - alignmentPtrs[a]->qAlignedSeq, - alignmentPtrs[a]->tAlignedSeq, - distScoreFn, - params.affineAlign) / 10.0; - } - } - PartitionOverlappingAlignments(alignmentPtrs, partitions, params.minFractionToBeConsideredOverlapping); - - int p; - set::iterator partIt, partEnd; - - // - // For each partition, store where on the read it begins, and where - // it ends. - // - vector partitionBeginPos, partitionEndPos; - partitionBeginPos.resize(partitions.size()); - partitionEndPos.resize(partitions.size()); - fill(partitionBeginPos.begin(), partitionBeginPos.end(), -1); - fill(partitionEndPos.begin(), partitionEndPos.end(), -1); - vector assigned; - assigned.resize( alignmentPtrs.size()); - fill(assigned.begin(), assigned.end(), false); - - for (p = 0; p < int(partitions.size()); p++) { - partEnd = partitions[p].end(); - int alnStart, alnEnd; - - if (partitions[p].size() > 0) { - partIt = partitions[p].begin(); - alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); - partitionBeginPos[p] = alnStart; - partitionEndPos[p] = alnEnd; - ++partIt; - partEnd = partitions[p].end(); - for (; partIt != partEnd; ++partIt) { - // Comment out because all reads are now in the forward strand. - // alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); - alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); - if (alnEnd - alnStart > partitionEndPos[p] - partitionBeginPos[p]) { - partitionBeginPos[p] = alnStart; - partitionEndPos[p] = alnEnd; - } - } - } - } - - // - // For each partition, determine the widest parts of the read that - // are aligned in the partition. All alignments will be extended to - // the end of the widest parts of the partition. 
- // - const static bool convertToForwardStrand = true; - - UInt i; - - // - // For now, just use the alignment score as the probability score. - // Although it is possible to use the full forward probability, for - // the most part it is pretty much the same as the Vitterbi - // probability, but it takes a lot longer to compute. - // - - // - // Now estimate what the alignment scores would be if they were - // extended past the ends of their current alignment. - // - - for (p = 0; p < int(partitions.size()); p++) { - partEnd = partitions[p].end(); - int alnStart, alnEnd; - for (partIt = partitions[p].begin(); partitions[p].size() > 0 and partIt != partEnd; ++partIt) { - int mismatchSum = 0; - alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); - if (alnStart - partitionBeginPos[p] > MAPQV_END_ALIGN_WIGGLE or - partitionEndPos[p] - alnEnd > MAPQV_END_ALIGN_WIGGLE) { - // bug 24363, use updated SumMismatches to compute mismatch score when - // no QV is available. 
- SumMismatches(read, *alignmentPtrs[*partIt], 15, - partitionBeginPos[p], partitionEndPos[p], mismatchSum); - } - // - // Random sequence can be aligned with about 50% similarity due - // to optimization, so weight the qv sum - // - alignmentPtrs[*partIt]->probScore += -(mismatchSum) * 0.5; - } - } - - // - // Determine mapqv by summing qvscores in partitions - - float mapQVDenominator = 0; - for (p = 0; p < int(partitions.size()); p++) { - set::iterator nextIt; - if (partitions[p].size() == 0) { - continue; - } - int index = *partitions[p].begin(); - - mapQVDenominator = alignmentPtrs[index]->probScore; - - if (partitions[p].size() > 1) { - partIt = partitions[p].begin(); - partEnd = partitions[p].end(); - ++partIt; - - for (; partIt != partEnd; ++partIt) { - index = *partIt; - mapQVDenominator = LogSumOfTwo(mapQVDenominator, alignmentPtrs[index]->probScore); - } - } - - - for (partIt = partitions[p].begin(); - partIt != partitions[p].end(); ++partIt) { - // - // If only one alignment is found, assume maximum mapqv. - // - assigned[*partIt] = true; - if (partitions[p].size() == 1) { - alignmentPtrs[*partIt]->mapQV = MAX_PHRED_SCORE; - } - - // - // Look for overflow. 
- // - else if (alignmentPtrs[*partIt]->probScore - mapQVDenominator < -20) { - alignmentPtrs[*partIt]->mapQV = 0; - } - else { - double log10 = log(10); - double sub = alignmentPtrs[*partIt]->probScore - mapQVDenominator; - double expo = exp(log10*sub); - double diff = 1.0 - expo; - int phredValue; - - if (expo == 0) { - phredValue = 0; - } - else if (diff == 0) { - phredValue = MAX_PHRED_SCORE; - } - else { - phredValue = Phred(diff); - } - if (phredValue > MAX_PHRED_SCORE) { - phredValue = MAX_PHRED_SCORE; - } - - alignmentPtrs[*partIt]->mapQV = phredValue; - assigned[*partIt]=true; - } - - if (params.scaleMapQVByNumSignificantClusters) { - ScaleMapQVByClusterSize(*alignmentPtrs[*partIt], params); - } - } - } - - for (i = 0; i < assigned.size(); i++) { - assert(assigned[i]); - } -} - -//--------------------SEARCH & CHECK ALIGNMENTS-------------------// -template -bool CheckForSufficientMatch(T_Sequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms) -{ - if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore) { - return true; - } - else { - return false; - } -} - -int FindMaxLengthAlignment(vector alignmentPtrs, - int &maxLengthIndex) -{ - int i; - int maxLength = 0; - maxLengthIndex = -1; - - for (i = 0; i < int(alignmentPtrs.size()); i++) { - int qStart, qEnd; - alignmentPtrs[i]->GetQInterval(qStart, qEnd); - if (qEnd - qStart > maxLength) { - maxLengthIndex = i; - maxLength = qEnd - qStart; - } - } - return (maxLength != -1); -} - -void SumMismatches(SMRTSequence &read, - T_AlignmentCandidate &alignment, - int mismatchScore, - int fullIntvStart, int fullIntvEnd, - int &sum) -{ - int alnStart, alnEnd; - alignment.GetQIntervalOnForwardStrand(alnStart, alnEnd); - int p; - sum = 0; - if (read.substitutionQV.Empty() == false) { - for (p = fullIntvStart; p < alnStart; p++) { - sum += read.substitutionQV[p]; - } - for (p = alnEnd; p < fullIntvEnd; p++) { - sum += read.substitutionQV[p]; - } - } else { - // bug 24363, compute mismatch 
score when QV is not available. - sum += mismatchScore * ((alnStart - fullIntvStart) + (fullIntvEnd - alnEnd)); - } -} - - -bool AlignmentsOverlap(T_AlignmentCandidate &alnA, - T_AlignmentCandidate &alnB, - float minPercentOverlap) -{ - int alnAStart, alnAEnd, alnBStart, alnBEnd; - bool useForwardStrand=true; - alnA.GetQInterval(alnAStart, alnAEnd, useForwardStrand); - alnB.GetQInterval(alnBStart, alnBEnd, useForwardStrand); - // Look if one alignment encompasses the other - int ovp = 0; - if (alnAStart <= alnBStart and alnAEnd >= alnBEnd) { - return true; - } - else if (alnBStart <= alnAStart and alnBEnd >= alnAEnd) { - return true; - //ovp = alnAEnd - alnAStart; - } - else { - // - // Look to see if the alignments overlap - // - - if (alnAEnd >= alnBStart and alnAEnd <= alnBEnd) { - ovp = alnAEnd - alnBStart; - } - else if (alnAStart >= alnBStart and alnAStart <= alnBEnd) { - ovp = alnBEnd - alnAStart; - } - } - - // float ovpPercent = (2.0*ovp) / ((alnAEnd - alnAStart) + (alnBEnd - alnBStart)); - float ovpPercent = 0; - if (alnAEnd - alnAStart > 0 and alnBEnd - alnBStart > 0) { - // overlap percentage: maximum overlap percent in A and B. - ovpPercent = max(float(ovp)/float(alnAEnd - alnAStart), - float(ovp)/float(alnBEnd - alnBStart)); - } - - // returns true when an overlap is found. 
- return (ovpPercent > minPercentOverlap); -} - -void PartitionOverlappingAlignments(vector &alignmentPtrs, - vector > &partitions, - float minOverlap) { - if (alignmentPtrs.size() == 0) { - partitions.clear(); - return; - } - - set::iterator setIt, setEnd; - int i, p; - bool overlapFound = false; - for (i = 0; i < int(alignmentPtrs.size()); i++) { - overlapFound = false; - for (p = 0; p < int(partitions.size()) and overlapFound == false; p++) { - setEnd = partitions[p].end(); - for (setIt = partitions[p].begin(); setIt != partitions[p].end() and overlapFound == false; ++setIt) { - if (AlignmentsOverlap(*alignmentPtrs[i], *alignmentPtrs[*setIt], minOverlap) or - ((alignmentPtrs[i]->QAlignStart() <= alignmentPtrs[*setIt]->QAlignStart()) and - (alignmentPtrs[i]->QAlignEnd() > alignmentPtrs[*setIt]->QAlignEnd()))) { - partitions[p].insert(i); - overlapFound = true; - } - } - } - // - // If this alignment does not overlap any other, create a - // partition with it as the first element. - // - if (overlapFound == false) { - partitions.push_back(set()); - partitions[partitions.size()-1].insert(i); - } - } -} - -//--------------------FILTER ALIGNMENTS---------------------------// -int RemoveLowQualitySDPAlignments(int readLength, - vector &alignmentPtrs, - MappingParameters ¶ms) -{ - // Just a hack. For now, assume there is at least 1 match per 50 bases. 
- int totalBasesMatched = 0; - int a; - for (a = 0; a < int(alignmentPtrs.size()); a++) { - int b; - for (b = 0; b < int(alignmentPtrs[a]->blocks.size()); b++) { - totalBasesMatched += alignmentPtrs[a]->blocks[b].length; - } - int expectedMatches = params.sdpTupleSize/50.0 * readLength; - if (totalBasesMatched < expectedMatches) { - delete alignmentPtrs[a]; - alignmentPtrs[a] = NULL; - } - } - int packedAlignmentIndex = 0; - for (a = 0; a < int(alignmentPtrs.size()); a++) { - if (alignmentPtrs[a] != NULL) { - alignmentPtrs[packedAlignmentIndex] = alignmentPtrs[a]; - packedAlignmentIndex++; - } - } - alignmentPtrs.resize(packedAlignmentIndex); - return packedAlignmentIndex; -} - -template -int RemoveLowQualityAlignments(T_Sequence &read, - vector &alignmentPtrs, - MappingParameters ¶ms) -{ - if (params.verbosity > 0) { - cout << "checking at least " << alignmentPtrs.size() << " alignments to see if they are accurate." << endl; - } - UInt i; - for (i = 0; i < MIN(params.nCandidates, alignmentPtrs.size()); i++) { - if (params.verbosity > 0) { - cout << "Quality check " << i << " " << alignmentPtrs[i]->score << endl; - } - if (alignmentPtrs[i]->blocks.size() == 0 or - alignmentPtrs[i]->score > params.maxScore) { - // - // Since the alignments are sorted according to alignment - // score, once one of the alignments is too low of a score, - // all remaining alignments are also too low, and should be - // removed as well. Do that all at once. - // - if (alignmentPtrs[i]->blocks.size() == 0 and params.verbosity > 0) { - cout << "Removing empty alignment " << alignmentPtrs[i]->qName << endl; - } - if (params.verbosity > 0) { - cout << alignmentPtrs[i]->qName << " alignment " << i << " is too low of a score." 
<< alignmentPtrs[i]->score << endl; - } - int deletedIndex = i; - for (; deletedIndex < alignmentPtrs.size(); deletedIndex++) { - delete alignmentPtrs[deletedIndex]; - alignmentPtrs[deletedIndex] = NULL; - } - alignmentPtrs.erase(i + alignmentPtrs.begin(), alignmentPtrs.end()); - break; - } - else { - if (params.verbosity > 0) { - cout << "Keeping alignment " << i << " " << alignmentPtrs[i]->qPos << " " << alignmentPtrs[i]->qLength - << " " << alignmentPtrs[i]->tName << " " << alignmentPtrs[i]->tPos << " " << alignmentPtrs[i]->tLength - << " from score: " << alignmentPtrs[i]->score << endl; - } - } - } - return alignmentPtrs.size(); -} - - -//FIXME: move to class ReadAlignments -int RemoveOverlappingAlignments(vector &alignmentPtrs, - MappingParameters ¶ms) -{ - vector alignmentIsContained; - alignmentIsContained.resize(alignmentPtrs.size()); - std::fill(alignmentIsContained.begin(), alignmentIsContained.end(), false); - - int j; - int numContained = 0; - int curNotContained = 0; - - if (alignmentPtrs.size() > 0) { - UInt i; - for (i = 0; i < alignmentPtrs.size()-1; i++ ){ - T_AlignmentCandidate *aref = alignmentPtrs[i]; - if (aref->pctSimilarity < params.minPctSimilarity) { - continue; - } - for (j = i + 1; j < int(alignmentPtrs.size()); j++ ){ - // - // Make sure this alignment isn't already removed. - // - if (alignmentIsContained[j]) { - continue; - } - - // - // Only check for containment if the two sequences are from the same contig. - // - if (alignmentPtrs[i]->tIndex != alignmentPtrs[j]->tIndex) { - continue; - } - - // - // Check for an alignment that is fully overlapping another - // alignment. - if (aref->GenomicTBegin() <= alignmentPtrs[j]->GenomicTBegin() and - aref->GenomicTEnd() >= alignmentPtrs[j]->GenomicTEnd() and - alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { - // - // Alignment i is contained in j is only true if it has a worse score. 
- // - if (aref->score <= alignmentPtrs[j]->score) { - alignmentIsContained[j] = true; - } - if (params.verbosity >= 2) { - cout << "alignment " << i << " is contained in " << j << endl; - cout << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos << " " - << aref->tAlignedSeqPos + aref->tAlignedSeqLength << " " - << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << endl; - } - } - else if (alignmentPtrs[j]->GenomicTBegin() <= aref->GenomicTBegin() and - alignmentPtrs[j]->GenomicTEnd() >= aref->GenomicTEnd() and - alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { - if (params.verbosity >= 2) { - cout << "ALIGNMENT " << j << " is contained in " << i << endl; - cout << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos << " " - << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << " " - << aref->tAlignedSeqPos + aref->tAlignedSeqLength << endl; - } - if (alignmentPtrs[j]->score <= aref->score) { - alignmentIsContained[i] = true; - } - } - } - } - for (i = 0; i < alignmentPtrs.size(); i++) { - T_AlignmentCandidate *aref = alignmentPtrs[i]; - if (alignmentIsContained[i]) { - delete alignmentPtrs[i]; - alignmentPtrs[i] = NULL; - numContained++; - } - else { - alignmentPtrs[curNotContained] = aref; - ++curNotContained; - } - } - alignmentPtrs.resize(alignmentPtrs.size() - numContained); - } - return alignmentPtrs.size(); -} - -// Delete all alignments from index startIndex in vector, inclusive. 
-void DeleteAlignments(vector &alignmentPtrs, - int startIndex) -{ - int i; - for (i = startIndex; i < int(alignmentPtrs.size()); i++ ) { - delete alignmentPtrs[i]; - } - alignmentPtrs.resize(0); -} - - -//--------------------REFINE ALIGNMENTS---------------------------// -template -void RefineAlignment(vector &bothQueryStrands, - T_RefSequence &genome, - T_AlignmentCandidate &alignmentCandidate, - MappingParameters ¶ms, - MappingBuffers &mappingBuffers) -{ - FASTQSequence qSeq; - DNASequence tSeq; - DistanceMatrixScoreFunction distScoreFn( - SMRTDistanceMatrix, params.deletion, params.insertion); - - DistanceMatrixScoreFunction distScoreFn2( - SMRTDistanceMatrix, params.indel, params.indel); - - QualityValueScoreFunction scoreFn; - IDSScoreFunction idsScoreFn; - idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); - scoreFn.del = params.indel; - scoreFn.ins = params.indel; - idsScoreFn.ins = params.insertion; - idsScoreFn.del = params.deletion; - idsScoreFn.affineExtend = params.affineExtend; - idsScoreFn.affineOpen = params.affineOpen; - idsScoreFn.substitutionPrior = params.substitutionPrior; - idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; - - if (params.doGlobalAlignment) { - SMRTSequence subread; - subread.ReferenceSubstring(*bothQueryStrands[0], - bothQueryStrands[0]->SubreadStart(), - (bothQueryStrands[0]->SubreadLength())); - - int drift = ComputeDrift(alignmentCandidate); - T_AlignmentCandidate refinedAlignment; - - KBandAlign(subread, alignmentCandidate.tAlignedSeq, SMRTDistanceMatrix, - params.insertion, params.deletion, - drift, - mappingBuffers.scoreMat, mappingBuffers.pathMat, - refinedAlignment, idsScoreFn, Global); - refinedAlignment.RemoveEndGaps(); - ComputeAlignmentStats(refinedAlignment, - subread.seq, - alignmentCandidate.tAlignedSeq.seq, - distScoreFn2); - //idsScoreFn); - - alignmentCandidate.blocks = refinedAlignment.blocks; - alignmentCandidate.gaps = refinedAlignment.gaps; - alignmentCandidate.tPos = refinedAlignment.tPos; 
- alignmentCandidate.qPos = refinedAlignment.qPos + bothQueryStrands[0]->SubreadStart(); - alignmentCandidate.score = refinedAlignment.score; - subread.Free(); - } - else if (params.useGuidedAlign) { - T_AlignmentCandidate refinedAlignment; - int lastBlock = alignmentCandidate.blocks.size() - 1; - - - if (alignmentCandidate.blocks.size() > 0) { - - /* - * Refine the alignment without expanding past the current - * boundaries of the sequences that are already aligned. - */ - - // - // NOTE** this only makes sense when - // alignmentCandidate.blocks[0].tPos == 0. Otherwise the length - // of the sequence is not correct. - // - tSeq.Copy(alignmentCandidate.tAlignedSeq, - alignmentCandidate.tPos, - (alignmentCandidate.blocks[lastBlock].tPos + - alignmentCandidate.blocks[lastBlock].length - - alignmentCandidate.blocks[0].tPos)); - - // qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, - qSeq.ReferenceSubstring(*bothQueryStrands[0], - alignmentCandidate.qAlignedSeqPos + alignmentCandidate.qPos, - (alignmentCandidate.blocks[lastBlock].qPos + - alignmentCandidate.blocks[lastBlock].length)); - - if (!params.ignoreQualities && ReadHasMeaningfulQualityValues(alignmentCandidate.qAlignedSeq)) { - if (params.affineAlign) { - AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, - idsScoreFn, params.bandSize, - mappingBuffers, - refinedAlignment, Global, false); - } - else { - GuidedAlign(qSeq, tSeq, alignmentCandidate, - idsScoreFn, params.guidedAlignBandSize, - mappingBuffers, - refinedAlignment, Global, false); - } - } - else { - if (params.affineAlign) { - AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, - distScoreFn, params.bandSize, - mappingBuffers, - refinedAlignment, Global, false); - } - else { - GuidedAlign(qSeq, tSeq, alignmentCandidate, - distScoreFn, params.guidedAlignBandSize, - mappingBuffers, - refinedAlignment, Global, false); - } - } - ComputeAlignmentStats(refinedAlignment, - qSeq.seq, - tSeq.seq, - distScoreFn2, params.affineAlign); - // - // Copy the 
refine alignment, which may be a subsequence of the - // alignmentCandidate into the alignment candidate. - // - - // First copy the alignment block and gap (the description of - // the base by base alignment). - - alignmentCandidate.blocks.clear(); - alignmentCandidate.blocks = refinedAlignment.blocks; - - alignmentCandidate.CopyStats(refinedAlignment); - - alignmentCandidate.gaps = refinedAlignment.gaps; - alignmentCandidate.score = refinedAlignment.score; - alignmentCandidate.nCells = refinedAlignment.nCells; - - // Next copy the information that describes what interval was - // aligned. Since the reference sequences of the alignment - // candidate have been modified, they are reassigned. - alignmentCandidate.tAlignedSeq.Free(); - alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); - alignmentCandidate.ReassignQSequence(qSeq); - alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; - alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; - - // - // tPos and qPos are the positions within the interval where the - // alignment begins. The refined alignment has adifferent tPos - // and qPos from the alignment candidate. - alignmentCandidate.tPos = refinedAlignment.tPos; - alignmentCandidate.qPos = refinedAlignment.qPos; - - // The lengths of the newly aligned sequences may differ, update those. - alignmentCandidate.tAlignedSeqLength = tSeq.length; - alignmentCandidate.qAlignedSeqLength = qSeq.length; - } - } - else { - - - // - // This assumes an SDP alignment has been performed to create 'alignmentCandidate'. - - // - // Recompute the alignment using a banded smith waterman to - // get rid of any spurious effects of usign the seeded gaps. - // - - // - // The k-banded alignment is over a subsequence of the first - // (sparse dynamic programming, SDP) alignment. The SDP - // alignment is over a large window that may contain the - // candidate sequence. The k-band alignment is over a tighter - // region. 
- - int drift = ComputeDrift(alignmentCandidate); - - // - // Rescore the alignment with a banded alignment that has a - // better model of sequencing error. - // - - if (alignmentCandidate.blocks.size() == 0 ){ - alignmentCandidate.score = 0; - return; - } - int lastBlock = alignmentCandidate.blocks.size() - 1; - - // - // Assign the sequences that are going to be realigned using - // banded alignment. The SDP alignment does not give that great - // of a score, but it does do a good job at finding a backbone - // alignment that closely defines the sequence that is aligned. - // Reassign the subsequences for alignment with a tight bound - // around the beginning and ending of each sequence, so that - // global banded alignment may be performed. - // - - // - // This section needs to be cleaned up substantially. Right now it - // copies a substring from the ref to a temp, then from the temp - // back to the ref. It may be possible to just keep one pointer per - // read to the memory that was allocated, then allow the seq - // parameter to float around. The reason for all the copying is - // that in case there is a compressed version of the genome the - // seqences must be transformed before alignment. 
- // - - if (alignmentCandidate.qIsSubstring) { - qSeq.ReferenceSubstring(*bothQueryStrands[0], // the original sequence - alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, - alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length); - } - else { - qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, // the subsequence that the alignment points to - alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, - alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].qPos); - } - - tSeq.Copy(alignmentCandidate.tAlignedSeq, // the subsequence the alignment points to - alignmentCandidate.tPos, // ofset into the subsequence - alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos); - - T_AlignmentCandidate refinedAlignment; - - // - // When the parameter bandSize is 0, set the alignment band size - // to the drift off the diagonal, plus a little more for wiggle - // room. When the parameteris nonzero, use that as a fixed band. 
- // - int k; - if (params.bandSize == 0) { - k = abs(drift) * 1.5; - } - else { - k = params.bandSize; - } - if (params.verbosity > 0) { - cout << "drift: " << drift << " qlen: " << alignmentCandidate.qAlignedSeq.length << " tlen: " << alignmentCandidate.tAlignedSeq.length << " k: " << k << endl; - cout << "aligning in " << k << " * " << alignmentCandidate.tAlignedSeq.length << " " << k * alignmentCandidate.tAlignedSeq.length << endl; - } - if (k < 10) { - k = 10; - } - - alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; - - VectorIndex lastSDPBlock = alignmentCandidate.blocks.size() - 1; - - if (alignmentCandidate.blocks.size() > 0) { - DNALength prevLength = alignmentCandidate.tAlignedSeqLength -= alignmentCandidate.tPos; - alignmentCandidate.tAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].tPos - + alignmentCandidate.blocks[lastSDPBlock].length - - alignmentCandidate.blocks[0].tPos); - } - else { - alignmentCandidate.tAlignedSeqLength = 0; - } - - alignmentCandidate.tPos = 0; - alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; - - if (alignmentCandidate.blocks.size() > 0) { - DNALength prevLength = alignmentCandidate.qAlignedSeqLength -= alignmentCandidate.qPos; - alignmentCandidate.qAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].qPos - + alignmentCandidate.blocks[lastSDPBlock].length - - alignmentCandidate.blocks[0].qPos); - } - else { - alignmentCandidate.qAlignedSeqLength = 0; - } - alignmentCandidate.qPos = 0; - - alignmentCandidate.blocks.clear(); - alignmentCandidate.tAlignedSeq.Free(); - alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); - alignmentCandidate.ReassignQSequence(qSeq); - - if (params.verbosity >= 2) { - cout << "refining target: " << endl; - alignmentCandidate.tAlignedSeq.PrintSeq(cout); - cout << "refining query: " << endl; - static_cast(&alignmentCandidate.qAlignedSeq)->PrintSeq(cout); - cout << endl; - } - PairwiseLocalAlign(qSeq, tSeq, k, params, alignmentCandidate, mappingBuffers, Fit); 
- } -} - -template -void RefineAlignments(vector &bothQueryStrands, - T_RefSequence &genome, - vector &alignmentPtrs, MappingParameters ¶ms, MappingBuffers &mappingBuffers) { - - - UInt i; - for (i = 0; i < alignmentPtrs.size(); i++ ) { - RefineAlignment(bothQueryStrands, genome, *alignmentPtrs[i], params, mappingBuffers); - } - // - // It's possible the alignment references change their order after running - // the local alignments. This is made into a parameter rather than resorting - // every time so that the performance gain by resorting may be measured. - // - if (params.sortRefinedAlignments) { - std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); - } -} - -vector -SelectAlignmentsToPrint(vector alignmentPtrs, - MappingParameters & params, - const int & associatedRandInt) { - if (params.placeRandomly) {assert(params.hitPolicy.IsRandombest());} - - if (alignmentPtrs.size() == 0) {return vector({});} - - std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), - SortAlignmentPointersByScore()); - - // Apply filter criteria and hit policy. - // Shallow copy AlignmentCandidate pointers. - vector filtered; - for (auto ptr: alignmentPtrs) { - if (params.filterCriteria.Satisfy(ptr)) { - filtered.push_back(ptr); - if (filtered.size() == params.nBest) break; - } - } - - return params.hitPolicy.Apply(filtered, false, associatedRandInt); -} - -// The full read is not the subread, and does not have masked off characters. 
-void PrintAlignment(T_AlignmentCandidate &alignment, - SMRTSequence &fullRead, - MappingParameters ¶ms, - AlignmentContext &alignmentContext, - ostream &outFile -#ifdef USE_PBBAM - , PacBio::BAM::BamWriter * bamWriterPtr -#endif - ) { - try { - int lastBlock = alignment.blocks.size() - 1; - if (params.printFormat == StickPrint) { - PrintAlignmentStats(alignment, outFile); - StickPrintAlignment(alignment, - (DNASequence&) alignment.qAlignedSeq, - (DNASequence&) alignment.tAlignedSeq, - outFile, - alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); - } - else if (params.printFormat == SAM) { - SAMOutput::PrintAlignment(alignment, fullRead, outFile, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch); - } - else if (params.printFormat == BAM) { -#ifdef USE_PBBAM - BAMOutput::PrintAlignment(alignment, fullRead, *bamWriterPtr, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch); -#else - REQUIRE_PBBAM_ERROR(); -#endif - } - else if (params.printFormat == CompareXML) { - XMLOutput::Print(alignment, - (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, - outFile, - alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); - } - else if (params.printFormat == Vulgar) { - PrintAlignmentStats(alignment, outFile); - VulgarOutput::Print(alignment, outFile); - } - else if (params.printFormat == CompareSequencesParsable) { - CompareSequencesOutput::Print(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, outFile); - } - else if (params.printFormat == Interval) { - if (alignment.blocks.size() > 0) { - IntervalOutput::Print(alignment, outFile); - } - } - else if (params.printFormat == SummaryPrint) { - if (alignment.blocks.size() > 0) { - SummaryOutput::Print(alignment, outFile); - } - } - } - catch (ostream::failure f) { - cout << "ERROR writing to output file. The output drive may be full, or you " << endl; - cout << "may not have proper write permissions." 
<< endl; - exit(1); - } -} - -// Print all alignments in vector alignmentPtrs -void PrintAlignments(vector alignmentPtrs, - SMRTSequence &read, - MappingParameters ¶ms, ostream &outFile, - AlignmentContext alignmentContext, -#ifdef USE_PBBAM - PacBio::BAM::BamWriter * bamWriterPtr, -#endif - MappingSemaphores & semaphores) { - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_wait(semaphores.writer); -#else - sem_wait(&semaphores.writer); -#endif - } - for (int i = 0; i < int(alignmentPtrs.size()); i++) { - T_AlignmentCandidate *aref = alignmentPtrs[i]; - - if (aref->blocks.size() == 0) { - - // - // If the SDP alignment finds nothing, there will be no - // blocks. This may happen if the sdp block size is larger - // than the anchor size found with the suffix array. When no - // blocks are found there is no alignment, so zero-out the - // score and continue. - // - aref->score = 0; - if (params.verbosity > 0) { - cout << "Zero blocks found for " << aref->qName << " " << aref->qAlignedSeqPos << " " << aref->tAlignedSeqPos << endl; - } - continue; - } - - // - // Configure some of the alignment context before printing. 
- // - if (i > 0 and params.placeRandomly == false) { - alignmentContext.isPrimary = false; - } - else { - alignmentContext.isPrimary = true; - } - - if (params.printSAM or params.printBAM) { - DistanceMatrixScoreFunction editdistScoreFn(EditDistanceMatrix, 1, 1); - T_AlignmentCandidate & alignment = *alignmentPtrs[i]; - alignmentContext.editDist = ComputeAlignmentScore(alignment, - alignment.qAlignedSeq, - alignment.tAlignedSeq, - editdistScoreFn); - } - - PrintAlignment(*alignmentPtrs[i], read, - params, alignmentContext, outFile -#ifdef USE_PBBAM - , bamWriterPtr -#endif - ); - } - - if (params.nProc > 1) { -#ifdef __APPLE__ - sem_post(semaphores.writer); -#else - sem_post(&semaphores.writer); -#endif - } - -} - -void PrintAlignmentPtrs(vector & alignmentPtrs, - ostream & out) { - for(int alignmentIndex = 0; - alignmentIndex < int(alignmentPtrs.size()); - alignmentIndex++) { - out << "["<< alignmentIndex << "/" - << alignmentPtrs.size() << "]" << endl; - T_AlignmentCandidate *alignment = alignmentPtrs[alignmentIndex]; - alignment->Print(out); - } - out << endl; -} - -// Print all alignments for subreads in allReadAlignments. -// Input: -// allReadAlignments - contains a set of subreads, each of which -// is associated with a group of alignments. -// alignmentContext - an alignment context of each subread used -// for printing in SAM format. -// params - mapping parameters. -// Output: -// outFilePtr - where to print alignments for subreads. -// unalignedFilePtr - where to print sequences for unaligned subreads. 
-void PrintAllReadAlignments(ReadAlignments & allReadAlignments, - AlignmentContext & alignmentContext, - ostream & outFilePtr, - ostream & unalignedFilePtr, - MappingParameters & params, - vector & subreads, -#ifdef USE_PBBAM - PacBio::BAM::BamWriter * bamWriterPtr, -#endif - MappingSemaphores & semaphores) -{ - int subreadIndex; - int nAlignedSubreads = allReadAlignments.GetNAlignedSeq(); - - // - // Initialize the alignemnt context with information applicable to SAM output. - // - alignmentContext.alignMode = allReadAlignments.alignMode; - for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { - if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { - alignmentContext.numProperlyAlignedSubreads++; - } - } - - if (alignmentContext.numProperlyAlignedSubreads == int(allReadAlignments.subreadAlignments.size())) { - alignmentContext.allSubreadsProperlyAligned = true; - } - alignmentContext.nSubreads = nAlignedSubreads; - - for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { - alignmentContext.subreadIndex = subreadIndex; - if (subreadIndex < nAlignedSubreads-1 and allReadAlignments.subreadAlignments[subreadIndex+1].size() > 0) { - alignmentContext.nextSubreadPos = allReadAlignments.subreadAlignments[subreadIndex+1][0]->QAlignStart(); - alignmentContext.nextSubreadDir = allReadAlignments.subreadAlignments[subreadIndex+1][0]->qStrand; - alignmentContext.rNext = allReadAlignments.subreadAlignments[subreadIndex+1][0]->tName; - alignmentContext.hasNextSubreadPos = true; - } else { - alignmentContext.nextSubreadPos = 0; - alignmentContext.nextSubreadDir = 0; - alignmentContext.rNext = ""; - alignmentContext.hasNextSubreadPos = false; - } - SMRTSequence & sourceSubread = allReadAlignments.subreads[subreadIndex]; - if (subreads.size() == allReadAlignments.subreads.size()) { - sourceSubread = subreads[subreadIndex]; - } - if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { - 
PrintAlignments(allReadAlignments.subreadAlignments[subreadIndex], - sourceSubread, - // for these alignments - params, outFilePtr,//*mapData->outFilePtr, - alignmentContext, -#ifdef USE_PBBAM - bamWriterPtr, -#endif - semaphores); - } else { - // - // Print the unaligned sequences. - // - if (params.printUnaligned == true) { - if (params.nProc == 1) { - //allReadAlignments.subreads[subreadIndex].PrintSeq(*mapData->unalignedFilePtr); - allReadAlignments.subreads[subreadIndex].PrintSeq(unalignedFilePtr); - } - else { -#ifdef __APPLE__ - sem_wait(semaphores.unaligned); -#else - sem_wait(&semaphores.unaligned); -#endif - //allReadAlignments.subreads[subreadIndex].PrintSeq(*mapData->unalignedFilePtr); - allReadAlignments.subreads[subreadIndex].PrintSeq(unalignedFilePtr); -#ifdef __APPLE__ - sem_post(semaphores.unaligned); -#else - sem_post(&semaphores.unaligned); -#endif - } // End of nproc > 1. - } // End of printing unaligned sequences. - } // End of finding no alignments for the subread with subreadIndex. - } // End of printing and processing alignmentContext for each subread. -} - - -#endif diff -Nru blasr-0~20151014+git8e668be/include/MappingBuffers.hpp blasr-5.3/include/MappingBuffers.hpp --- blasr-0~20151014+git8e668be/include/MappingBuffers.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/MappingBuffers.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - -// Author: Mark Chaisson -#ifndef __BLASR_MAPPING_BUFFERS__ -#define __BLASR_MAPPING_BUFFERS__ - -#include -#include "tuples/DNATuple.hpp" -#include "tuples/TupleList.hpp" -#include "algorithms/alignment/sdp/SDPFragment.hpp" -#include "algorithms/anchoring/BasicEndpoint.hpp" -#include "datastructures/anchoring/ClusterList.hpp" -#include "datastructures/anchoring/MatchPos.hpp" - -using namespace std; - - -// -// Define a list of buffers that are meant to grow to high-water -// marks, and not shrink down past that. The memory is reused rather -// than having multiple calls to new. 
-// -class MappingBuffers { -public: - vector hpInsScoreMat, insScoreMat; - vector kbandScoreMat; - vector hpInsPathMat, insPathMat; - vector kbandPathMat; - vector scoreMat; - vector pathMat; - vector affineScoreMat; - vector affinePathMat; - vector matchPosList; - vector rcMatchPosList; - vector > globalChainEndpointBuffer; - vector sdpFragmentSet, sdpPrefixFragmentSet, sdpSuffixFragmentSet; - TupleList sdpCachedTargetTupleList; - TupleList sdpCachedTargetPrefixTupleList; - TupleList sdpCachedTargetSuffixTupleList; - std::vector sdpCachedMaxFragmentChain; - vector probMat; - vector optPathProbMat; - vector lnSubPValueMat; - vector lnInsPValueMat; - vector lnDelPValueMat; - vector lnMatchPValueMat; - vector clusterNumBases; - ClusterList clusterList; - ClusterList revStrandClusterList; - - void Reset(void); -}; - - -inline void MappingBuffers::Reset(void) { - vector().swap(hpInsScoreMat); - vector().swap(insScoreMat); - vector().swap(kbandScoreMat); - vector().swap(hpInsPathMat); - vector().swap(insPathMat); - vector().swap(kbandPathMat); - vector().swap(scoreMat); - vector().swap(pathMat); - vector().swap(matchPosList); - vector().swap(rcMatchPosList); - vector >().swap(globalChainEndpointBuffer); - vector().swap(sdpFragmentSet); - vector().swap(sdpPrefixFragmentSet); - vector().swap(sdpSuffixFragmentSet); - sdpCachedTargetTupleList.Reset(); - sdpCachedTargetPrefixTupleList.Reset(); - sdpCachedTargetSuffixTupleList.Reset(); - vector().swap(sdpCachedMaxFragmentChain); - vector().swap(probMat); - vector().swap(optPathProbMat); - vector().swap(lnSubPValueMat); - vector().swap(lnInsPValueMat); - vector().swap(lnDelPValueMat); - vector().swap(lnMatchPValueMat); - vector().swap(clusterNumBases); -} - -#endif diff -Nru blasr-0~20151014+git8e668be/include/MappingIPC.h blasr-5.3/include/MappingIPC.h --- blasr-0~20151014+git8e668be/include/MappingIPC.h 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/MappingIPC.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,105 +0,0 
@@ -#ifndef MAPPING_IPC_H_ -#define MAPPING_IPC_H_ - -#include - -#include "MappingParameters.h" - -#include "FASTASequence.hpp" -#include "FASTQSequence.hpp" -#include "tuples/TupleList.hpp" -#include "tuples/DNATuple.hpp" -#include "tuples/CompressedDNATuple.hpp" -#include "tuples/TupleCountTable.hpp" -#include "files/ReaderAgglomerate.hpp" -#include "MappingMetrics.hpp" -#include "suffixarray/SuffixArrayTypes.hpp" -#include "metagenome/SequenceIndexDatabase.hpp" -#include "reads/RegionTable.hpp" -#include "bwt/BWT.hpp" -/* - * This structure contains pointers to all required data structures - * for mapping reads to a suffix array and evaluating the significance - * of the matches. - */ - -template -class MappingData { -public: - T_SuffixArray *suffixArrayPtr; - BWT *bwtPtr; - T_GenomeSequence *referenceSeqPtr; - SequenceIndexDatabase *seqDBPtr; - TupleCountTable *ctabPtr; - MappingParameters params; - MappingMetrics metrics; - RegionTable *regionTablePtr; - ReaderAgglomerate *reader; - ostream *outFilePtr; - ostream *unalignedFilePtr; - ostream *anchorFilePtr; - ostream *clusterFilePtr; - ostream *lcpBoundsOutPtr; - - // Declare a semaphore for blocking on reading from the same hdhf file. 
- - void ShallowCopySuffixArray(T_SuffixArray &dest) { - dest.index = suffixArrayPtr->index; - dest.length = suffixArrayPtr->length; - dest.target = suffixArrayPtr->target; - dest.startPosTable = suffixArrayPtr->startPosTable; - dest.endPosTable = suffixArrayPtr->endPosTable; - dest.lookupTableLength = suffixArrayPtr->lookupTableLength; - dest.lookupPrefixLength = suffixArrayPtr->lookupPrefixLength; - dest.tm = suffixArrayPtr->tm; - dest.deleteStructures = false; - // dest.useLCPTable = suffixArrayPtr->useLCPTable; - } - - void ShallowCopySequenceIndexDatabase(SequenceIndexDatabase &dest) { - dest.nSeqPos = seqDBPtr->nSeqPos; - dest.seqStartPos = seqDBPtr->seqStartPos; - dest.nameLengths = seqDBPtr->nameLengths; - dest.names = seqDBPtr->names; - dest.deleteStructures = false; - } - - void ShallowCopyTupleCountTable( TupleCountTable &dest) { - dest.countTable = ctabPtr->countTable; - dest.countTableLength = ctabPtr->countTableLength; - dest.nTuples = ctabPtr->nTuples; - dest.tm = ctabPtr->tm; - dest.deleteStructures = false; - } - - void ShallowCopyReferenceSequence(T_GenomeSequence &refSeq) { - refSeq.ShallowCopy(*referenceSeqPtr); - refSeq.deleteOnExit = false; - } - - void Initialize(T_SuffixArray *saP, T_GenomeSequence *refP, - SequenceIndexDatabase *seqDBP, - TupleCountTable *ctabP, - ReverseCompressIndex *rciP, - MappingParameters ¶msP, - ReaderAgglomerate *readerP, - RegionTable *regionTableP, - ostream *outFileP, - ostream *unalignedFileP, - ostream *anchorFilePtrP, - ostream *clusterFilePtrP=NULL) { - suffixArrayPtr = saP; - referenceSeqPtr = refP; - seqDBPtr = seqDBP; - ctabPtr = ctabP; - regionTablePtr = regionTableP; - params = paramsP; - reader = readerP; - outFilePtr = outFileP; - unalignedFilePtr = unalignedFileP; - anchorFilePtr = anchorFilePtrP; - clusterFilePtr= clusterFilePtrP; - } -}; - -#endif diff -Nru blasr-0~20151014+git8e668be/include/MappingParameters.h blasr-5.3/include/MappingParameters.h --- 
blasr-0~20151014+git8e668be/include/MappingParameters.h 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/MappingParameters.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,671 +0,0 @@ -#ifndef MAPPING_PARAMETERS_H_ -#define MAPPING_PARAMETERS_H_ - -#define REQUIRE_PBBAM_ERROR() \ -assert("blasr must be compiled with lib pbbam to perform IO on bam." == 0); - -#include - -#include "reads/ReadType.hpp" -#include "utils/FileOfFileNames.hpp" -#include "utils/RangeUtils.hpp" -#include "tuples/TupleMetrics.hpp" -#include "datastructures/anchoring/AnchorParameters.hpp" -#include "qvs/QualityValue.hpp" -#include "format/SAMPrinter.hpp" -#include "algorithms/alignment/AlignmentFormats.hpp" -#include "files/BaseSequenceIO.hpp" -#include "datastructures/alignment/FilterCriteria.hpp" - -class MappingParameters { -public: - // - // Parameters for global substitution, insertion, and deletion priors. - // - float minFractionToBeConsideredOverlapping; - float indelRate; - float minRatio; - int indel; - int idsIndel; - int sdpIndel; - int sdpIns, sdpDel; - int insertion; - int deletion; - int mismatch; - int sdpTupleSize; - int match; - int showAlign; - int refineAlign; - bool useScoreCutoff; - int maxScore; - int argi; - int nProc; - int globalChainType; - SAMOutput::Clipping clipping; - string clippingString; - QVScale qvScaleType; - vector readsFileNames; // = queryFileNames, genomeFileName - vector queryFileNames; - string genomeFileName; - // Query file type: FASTA/FASTQ/HDF*/PBBAM, - // Note that mixed query file types is not allowed. - FileType queryFileType; - // Query read type, SUBREAD, CCS or UNKNOWN - // Note that mixed read types is not allowed. 
- ReadType::ReadTypeEnum queryReadType; - vector regionTableFileNames; - vector ccsFofnFileNames; - string tupleListName; - string posTableName; - string outFileName; - string suffixArrayFileName; - string bwtFileName; - string indexFileName; - string anchorFileName; - string clusterFileName; - VectorIndex nBest; - int printWindow; - int doCondense; - int do4BitComp; - int cutoff; - int useSuffixArray; - int useBwt; - int useReverseCompressIndex; - int useTupleList; - int useSeqDB; - string seqDBName; - int useCountTable; - string countTableName; - int minMatchLength; - int listTupleSize; - int printFormat; - int maxExpand, minExpand; - int startRead; - int stride; - int pValueType; - float subsample; - int sortRefinedAlignments; - int verbosity; - bool printSAM; - bool cigarUseSeqMatch; - bool printBAM; - bool storeMapQV; - bool useRandomSeed; - int randomSeed; - bool placeRandomly; - bool printHeader; - bool samplePaths; - bool warp, nowarp; - //bool usePrefixLookupTable; - bool doSensitiveSearch; - bool emulateNucmer; - bool refineBetweenAnchorsOnly; - bool byAdapter; - bool extendDenovoCCSSubreads; - TupleMetrics saTupleMetrics; - TupleMetrics sdpTupleMetrics; - int lookupTableLength; - //int branchQualityThreshold; - int qualityLowerCaseThreshold; - AnchorParameters anchorParameters; - int readsFileIndex; - //int numBranches; - bool storeMetrics; - bool ignoreQualities; - bool extendFrontAlignment; - bool extendAlignments; - int maxExtendDropoff; - int minReadLength; - int maxReadLength; - int minSubreadLength; - int minRawSubreadScore; - int minAvgQual; - bool overlap; - bool advanceHalf; - int advanceExactMatches; - float approximateMaxInsertionRate; - float minPctSimilarity; // [0, 100] - float minPctAccuracy; // [0, 100] - bool refineAlignments; - int nCandidates; - bool doGlobalAlignment; - string tempDirectory; - bool useTitleTable; - string titleTableName; - bool readSeparateRegionTable; - bool readSeparateCcsFofn; - string regionTableFileName; - string 
ccsFofnFileName; - //float averageMismatchScore; - bool mapSubreadsSeparately; - bool concordant; - int flankSize; - bool useRegionTable; - bool useHQRegionTable; - bool printUnaligned; - string unalignedFileName; - string metricsFileName; - string lcpBoundsFileName; - string fullMetricsFileName; - bool printSubreadTitle; - bool useCcs; - bool useAllSubreadsInCcs; - bool useCcsOnly; - bool detailedSDPAlignment, nouseDetailedSDPAlignment; - int chunkSize; - int sdpFilterType; - bool useGuidedAlign; - int guidedAlignBandSize; - int bandSize; - int extendBandSize; - bool useQVScore; - int scoreType; - bool printVerboseHelp; - bool printDiscussion; - float sdpBypassThreshold; - bool computeAlignProbability; - float qvMatchWeight; - float qvMismatchWeight; - float qvInsWeight; - float qvDelWeight; - float readAccuracyPrior; - bool printVersion; - int substitutionPrior; - int globalDeletionPrior; - bool outputByThread; - int recurseOver; - bool forPicard; - bool separateGaps; - string scoreMatrixString; - bool printDotPlots; - bool preserveReadTitle; - bool forwardOnly; - bool printOnlyBest; - bool affineAlign; - int affineExtend; - int affineOpen; - bool scaleMapQVByNumSignificantClusters; - int limsAlign; - string holeNumberRangesStr; - Ranges holeNumberRanges; - int minAlnLength; - bool printSAMQV; - vector samQV; - SupplementalQVList samQVList; - bool fastMaxInterval; - bool aggressiveIntervalCut; - bool fastSDP; - string concordantTemplate; - bool concordantAlignBothDirections; - FilterCriteria filterCriteria; - string hitPolicyStr; - HitPolicy hitPolicy; - bool enableHiddenPaths; - - void Init() { - qvMatchWeight = 1.0; - qvMismatchWeight = 1.0; - qvInsWeight = 1.0; - qvDelWeight = 1.0; - minFractionToBeConsideredOverlapping = 0.75; - minRatio = 0.25; - indelRate = 0.3; - indel = 5; - insertion = 4; // asymmetric indel parameters - deletion = 5; - idsIndel = 15; - sdpIndel = 5; - sdpIns = 5; - sdpDel = 10; - sdpTupleSize = 11; - match = 0; - mismatch = 0; - 
showAlign = 1; - refineAlign = 1; - useScoreCutoff = false; - maxScore = -200; - argi = 1; - nProc = 1; - readsFileNames.clear(); - queryFileNames.clear(); - genomeFileName = ""; - queryReadType = ReadType::UNKNOWN; - queryFileType = FileType::None; - tupleListName = ""; - posTableName = ""; - suffixArrayFileName= ""; - bwtFileName = ""; - indexFileName = ""; - anchorFileName = ""; - outFileName = ""; - nBest = 10; - nCandidates = 10; - printWindow = 0; - doCondense = 0; - do4BitComp = 0; - pValueType = 0; - cutoff = 0; - useSuffixArray = 0; - useBwt = 0; - useReverseCompressIndex = 0; - useTupleList = 0; - useSeqDB = 0; - seqDBName = ""; - useCountTable = 0; - countTableName = ""; - lookupTableLength = 8; - anchorParameters.minMatchLength = minMatchLength = 12; - printFormat = SummaryPrint; - maxExpand = 0; - minExpand = 0; - startRead = 0; - stride = 1; - subsample = 1.1; - listTupleSize = 6; - sortRefinedAlignments = 1; - anchorParameters.verbosity = verbosity = 0; - saTupleMetrics.Initialize(listTupleSize); - sdpTupleMetrics.Initialize(sdpTupleSize); - qualityLowerCaseThreshold = 0; - anchorParameters.branchQualityThreshold = 0; - readsFileIndex = 0; - printSAM = false; - printBAM = false; - useRandomSeed = false; - randomSeed = 0; - placeRandomly = false; - samplePaths = false; - nowarp = false; - storeMapQV = true; - warp = true; - extendDenovoCCSSubreads = false; - storeMetrics = false; - ignoreQualities = true; - extendFrontAlignment = false; - extendAlignments = false; - maxExtendDropoff = 10; - minReadLength = 50; - maxReadLength = 0; // means no max read length - minSubreadLength = 0; - minRawSubreadScore = -1; // raw subread score in region table should be in range [0, 1000]. 
- minAvgQual = 0; - overlap = false; - advanceHalf = false; - refineAlignments = true; - anchorParameters.advanceExactMatches = advanceExactMatches = 0; - approximateMaxInsertionRate = 1.30; - minPctSimilarity = 0; - minPctAccuracy = 0; - doGlobalAlignment = false; - tempDirectory = ""; - useTitleTable = false; - titleTableName = ""; - readSeparateRegionTable = false; - readSeparateCcsFofn = false; - regionTableFileName = ""; - ccsFofnFileName = ""; - mapSubreadsSeparately=true; - concordant=false; - flankSize=40; - useRegionTable = true; - useHQRegionTable=true; - printUnaligned = false; - unalignedFileName = ""; - globalChainType = 0; - metricsFileName = ""; - fullMetricsFileName = ""; - doSensitiveSearch = false; - emulateNucmer = false; - refineBetweenAnchorsOnly = false; - printSubreadTitle = true; - detailedSDPAlignment = true; - nouseDetailedSDPAlignment = false; - useCcs = false; - useCcsOnly = false; - useAllSubreadsInCcs = false; - chunkSize = 10000000; - sdpFilterType = 0; - anchorParameters.stopMappingOnceUnique = true; - useGuidedAlign = true; - bandSize = 0; - extendBandSize = 10; - guidedAlignBandSize = 10; - useQVScore = false; - printVerboseHelp = false; - printDiscussion = false; - sdpBypassThreshold = 1000000.0; - scoreType = 0; - byAdapter = false; - qvScaleType = PHRED; - printHeader = false; - computeAlignProbability = false; - readAccuracyPrior = 0.85; - printVersion = false; - clipping = SAMOutput::none; - clippingString = ""; - substitutionPrior = 20; - globalDeletionPrior = 13; - outputByThread = false; - recurseOver = 10000; - forPicard = false; - separateGaps = false; - scoreMatrixString = ""; - printDotPlots = false; - preserveReadTitle = false; - forwardOnly = false; - printOnlyBest = false; - affineAlign = false; - affineExtend = 0; - affineOpen = 10; - scaleMapQVByNumSignificantClusters = false; - limsAlign = 0; - holeNumberRangesStr = ""; - minAlnLength = 0; - printSAMQV = false; - cigarUseSeqMatch = false; - samQV.clear(); - 
samQVList.clear(); - fastMaxInterval = false; - aggressiveIntervalCut = false; - fastSDP = false; - concordantTemplate = "mediansubread"; // typicalsubread or longestsubread - concordantAlignBothDirections = false; - - hitPolicyStr = "all"; - ResetFilterAndHit(); - enableHiddenPaths = false; //turn off hidden paths. - } - - MappingParameters() - : filterCriteria(0, 0, 0, false, Score(0, ScoreSign::NEGATIVE)) - , hitPolicy("all", ScoreSign::NEGATIVE) - { - Init(); - } - - void MakeSane() { - // Expand FOFN - FileOfFileNames::ExpandFileNameList(readsFileNames); - - // Must have at least a query and a genome - if (readsFileNames.size() <= 1) { - cout << "Error, you must provide at least one reads file and a genome file." < 1) { - cerr << "Warning: using new filter method for SDP alignments. The parameter is " << endl - << "either 0 or 1, but " << sdpFilterType << " was specified." << endl; - sdpFilterType = 1; - } - if (sdpFilterType == 0) { - detailedSDPAlignment = true; - nouseDetailedSDPAlignment = false; - } - if (detailedSDPAlignment == false) { - sdpFilterType = 1; - } - if (useGuidedAlign == true and bandSize == 0) { - bandSize = 16; - } - anchorParameters.minMatchLength = minMatchLength; - if (suffixArrayFileName != "") { - useSuffixArray = true; - } - if (bwtFileName != "") { - useBwt = true; - } - if (useBwt and useSuffixArray) { - cout << "ERROR, sa and bwt must be used independently." 
<< endl; - exit(1); - } - if (countTableName != "") { - useCountTable = true; - } - if (metricsFileName != "" or fullMetricsFileName != "") { - storeMetrics = true; - } - if (useCcsOnly) { - useCcs = true; - } - if (useAllSubreadsInCcs == true) { - useCcs = true; - } - if (titleTableName != "") { - useTitleTable = true; - } - if (unalignedFileName != "") { - printUnaligned = true; - } - if (regionTableFileName != "") { - useRegionTable = true; - readSeparateRegionTable = true; - } - if (ccsFofnFileName != "") { - readSeparateCcsFofn = true; - } - if (nouseDetailedSDPAlignment == true) { - detailedSDPAlignment = false; - } - if (nouseDetailedSDPAlignment == false) { - detailedSDPAlignment = true; - } - if (anchorParameters.maxLCPLength != 0 and anchorParameters.maxLCPLength < anchorParameters.minMatchLength) { - cerr << "ERROR: maxLCPLength is less than minLCPLength, which will result in no hits." << endl; - } - if (subsample < 1 and stride > 1) { - cout << "ERROR, subsample and stride must be used independently." << endl; - exit(1); - } - - - if (emulateNucmer) { - SetEmulateNucmer(); - } - - if (randomSeed != 0) { - useRandomSeed = true; - } - if (printSAM) { - printFormat = SAM; - forPicard = true; - } - // - // Parse the clipping. - // - if (clippingString == "soft") { - clipping = SAMOutput::soft; - } - else if (clippingString == "hard") { - clipping = SAMOutput::hard; - } - else if (clippingString == "none") { - clipping = SAMOutput::none; - } - else if (clippingString == "subread") { - clipping = SAMOutput::subread; - } - else if (clippingString != "") { - cout << "ERROR, clipping should either be soft, hard, or none." 
<< endl; - exit(1); - } - - if (printBAM) { -#ifndef USE_PBBAM - REQUIRE_PBBAM_ERROR(); -#else - cigarUseSeqMatch = true; // ALWAYS true for BAM - printFormat = BAM; - forPicard = true; - printSAM = false; - samQVList.SetDefaultQV(); - printSAMQV = true; - if (clipping != SAMOutput::soft) { - // Only support two clipping methods: soft or subread. - clipping = SAMOutput::subread; - } - if (queryFileType != PBBAM and not enableHiddenPaths) { - // bax|fasta|fastq -> bam paths are turned off by default - cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM files." << endl; - exit(1); - } - if (outFileName == "") { - cout << "ERROR, BAM output file must be specified." << endl; - exit(1); - } - if (outputByThread) { - cout << "ERROR, could not output alignments by threads in BAM format." << endl; - exit(1); - } -#endif - } - - if (limsAlign != 0) { - mapSubreadsSeparately = false; - forwardOnly = true; - } - - if (holeNumberRangesStr.size() > 0) { - if (not holeNumberRanges.setRanges(holeNumberRangesStr)) { - cout << "ERROR, could not parse hole number ranges: " - << holeNumberRangesStr << "." << endl; - exit(1); - } - } - - if (printSAMQV) { - if (samQV.size() == 0) { - samQVList.SetDefaultQV(); - } - else { - samQVList.UseQV(samQV); - } - } - - if (minRawSubreadScore > 1000) { - cout << "ERROR, minimum raw subread score should be less than 1000." << endl; - exit(1); - } - if (minRawSubreadScore != -1 and byAdapter) { - cout << "ERROR, minRawSubreadScore and byAdapter should not be used together." 
<< endl; - exit(1); - } - // Determine query read type - queryReadType = DetermineQueryReadType(); - // Pass verbosity - anchorParameters.verbosity = verbosity; - - // Set filter criteria and hit policy - ResetFilterAndHit(); - } - void ResetFilterAndHit(void) { - filterCriteria = FilterCriteria(minAlnLength, minPctSimilarity, - minPctAccuracy, true, - Score(static_cast(maxScore), ScoreSign::NEGATIVE)); - hitPolicy = HitPolicy(hitPolicyStr, ScoreSign::NEGATIVE); - } - - ReadType::ReadTypeEnum DetermineQueryReadType() { - if (useCcsOnly or queryFileType == HDFCCSONLY) { - return ReadType::CCS; - } - if (queryFileType == PBBAM) { - // Read type in BAM may be CCS, SUBREAD, HQREGION or POLYMERASE. - // Determine it later. - return ReadType::UNKNOWN; - } - if (mapSubreadsSeparately) { - return ReadType::SUBREAD; - } else { - if (useHQRegionTable) { - return ReadType::HQREGION; - } else { - return ReadType::POLYMERASE; - } - } - } - - void SetEmulateNucmer() { - anchorParameters.stopMappingOnceUnique = true; - anchorParameters.advanceExactMatches = 30; - anchorParameters.maxAnchorsPerPosition = 1; - sdpBypassThreshold = 0.75; - sdpTupleSize = 15; - anchorParameters.minMatchLength = 30; - useGuidedAlign = true; - refineAlignments = false; - } - - void SetForSensitivity() { - advanceExactMatches = 0; - anchorParameters.numBranches = 1; - anchorParameters.maxAnchorsPerPosition = 10000; - } -}; - - -#endif diff -Nru blasr-0~20151014+git8e668be/include/MappingSemaphores.h blasr-5.3/include/MappingSemaphores.h --- blasr-0~20151014+git8e668be/include/MappingSemaphores.h 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/MappingSemaphores.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ -#ifndef ALIGNMENT_MAPPING_SEMAPHORE_H_ -#define ALIGNMENT_MAPPING_SEMAPHORE_H_ -#include -#include -#include - -#ifndef __APPLE__ -class MappingSemaphores { - public: - sem_t reader; - sem_t writer; - sem_t unaligned; - sem_t hitCluster; - MappingSemaphores& 
operator=(MappingSemaphores &rhs) { - return *this; - } - - void InitializeAll() { - sem_init(&reader, 0, 1); - sem_init(&writer, 0, 1); - sem_init(&unaligned, 0, 1); - sem_init(&hitCluster, 0, 1); - } -}; -#else -class MappingSemaphores { - public: - sem_t *reader; - sem_t *writer; - sem_t *unaligned; - sem_t *hitCluster; - MappingSemaphores& operator=(MappingSemaphores &rhs) { - return *this; - } - - void InitializeAll() { - reader = sem_open("/reader", O_CREAT, 0644, 1); - writer = sem_open("/writer", O_CREAT, 0644, 1); - unaligned = sem_open("/unaligned", O_CREAT, 0644, 1); - hitCluster = sem_open("/hitCluster", O_CREAT, 0644, 1); - } -}; -#endif - -#endif diff -Nru blasr-0~20151014+git8e668be/include/ReadAlignments.hpp blasr-5.3/include/ReadAlignments.hpp --- blasr-0~20151014+git8e668be/include/ReadAlignments.hpp 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/ReadAlignments.hpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,199 +0,0 @@ -// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted (subject to the limitations in the -// disclaimer below) provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// -// * Neither the name of Pacific Biosciences nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC -// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. - -// Author: Mark Chaisson - -#ifndef __BLASR_READ_ALIGNMENTS__ -#define __BLASR_READ_ALIGNMENTS__ - -#include -#include -#include -#include "SMRTSequence.hpp" -#include "datastructures/alignment/AlignmentCandidate.hpp" - -using namespace std; - -class ReadAlignments { -public: -/* - This class stores the alignments from a read. A read may be - aligned in several different modes: - 1. Fullread - Treat the read as a unit from start to end - 2. Subread - Align each subread independently - 3. CCSDeNovo - Only align the CCS sequence from a read - 4. CCSAllPass - Align the de novo ccs sequences and then the - subreads to where the denovo ccs aligned. - 5. CCSFullPass - Same as allpass, except using only complete - subreads. - 6. ZmwSubreads - Align subreads of each zmw to where the longest - subread of the zmw aligned to. - - The alignments are a raggad array of n sequences; n is 1 for cases - 1 and 3, the number of subreads for cases 2 and 4, and the number - of full length passes for case 5. - - A ReadAligments class must only have alignments for a single type - of read in it. 
- -*/ - vector > subreadAlignments; - vector subreads; - AlignMode alignMode; - SMRTSequence read; - - inline int GetNAlignedSeq(); - - inline bool AllSubreadsHaveAlignments(); - - inline void Clear(); - - inline void Resize(int nSeq); - - inline void CheckSeqIndex(int seqIndex); - - inline void SetSequence(int seqIndex, SMRTSequence &seq); - - inline void AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr); - - inline void AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs); - - // Copy all T_AlignmentCandidate objects (to which subreadAlignment[seqIndex] - // is pointing) to newly created objects, and then return pointers to the new - // objects. - inline vector CopySubreadAlignments(int seqIndex); - - inline void Print(ostream &out=cout); - - inline ~ReadAlignments(); -}; - - -inline int ReadAlignments::GetNAlignedSeq() { - return subreadAlignments.size(); -} - -inline bool ReadAlignments::AllSubreadsHaveAlignments() { - int i, nAlignedSeq; - nAlignedSeq = subreadAlignments.size(); - for (i = 0; i < nAlignedSeq; i++) { - if (subreadAlignments[i].size() == 0) { - return false; - } - } - return true; -} - -inline void ReadAlignments::Clear() { - int i; - int nAlignedSeq; - for (i = 0, nAlignedSeq = subreadAlignments.size(); i < nAlignedSeq; i++) { - int nAlignments; - int a; - for (a = 0, nAlignments = subreadAlignments[i].size(); a < nAlignments; a++) { - delete subreadAlignments[i][a]; - } - subreadAlignments[i].clear(); - } - - for (i = 0, nAlignedSeq = subreads.size(); i< nAlignedSeq; i++) { - subreads[i].Free(); - } - subreadAlignments.clear(); - read.Free(); -} - -inline void ReadAlignments::Resize(int nSeq) { - subreadAlignments.resize(nSeq); - subreads.resize(nSeq); -} - -inline void ReadAlignments::CheckSeqIndex(int seqIndex) { - if ( seqIndex < 0 or seqIndex >= int(subreads.size()) ) { - cout << "ERROR, adding a sequence to an unallocated position." 
- << endl; - assert(0); - } -} - -inline void ReadAlignments::SetSequence(int seqIndex, SMRTSequence &seq) { - CheckSeqIndex(seqIndex); - subreads[seqIndex] = seq; -} - -inline void ReadAlignments::AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr) { - CheckSeqIndex(seqIndex); - subreadAlignments[seqIndex].push_back(alignmentPtr); -} - -inline void ReadAlignments::AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs) { - CheckSeqIndex(seqIndex); - subreadAlignments[seqIndex].insert(subreadAlignments[seqIndex].end(), seqAlignmentPtrs.begin(), seqAlignmentPtrs.end()); -} - -inline vector ReadAlignments::CopySubreadAlignments(int seqIndex) { - vector ret; - for (int i=0; iPrint(out); - } - } - out << " read: "; - read.Print(out); - out << endl << endl; -} - -inline ReadAlignments::~ReadAlignments() { - read.Free(); -} - -#endif diff -Nru blasr-0~20151014+git8e668be/include/RegisterBlasrOptions.h blasr-5.3/include/RegisterBlasrOptions.h --- blasr-0~20151014+git8e668be/include/RegisterBlasrOptions.h 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/include/RegisterBlasrOptions.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,525 +0,0 @@ -/* - * ============================================================================ - * - * Filename: RegisterOptions.hpp - * - * Description: - * - * Version: 1.0 - * Created: 04/29/2015 04:48:26 PM - * Revision: none - * Compiler: gcc - * - * Author: Yuan Li (yli), yli@pacificbiosciences.com - * Company: Pacific Biosciences - * - * ============================================================================ - */ - -#include "libconfig.h" -#include "CommandLineParser.hpp" -#include "MappingParameters.h" -#include "RegisterFilterOptions.h" -#include -using namespace std; - -void RegisterBlasrOptions(CommandLineParser & clp, MappingParameters & params) { - int trashbinInt; - float trashbinFloat; - bool trashbinBool; - clp.RegisterStringOption("sa", ¶ms.suffixArrayFileName, ""); - clp.RegisterStringOption("ctab", 
¶ms.countTableName, "" ); - clp.RegisterStringOption("regionTable", ¶ms.regionTableFileName, ""); - clp.RegisterStringOption("ccsFofn", ¶ms.ccsFofnFileName, ""); - clp.RegisterIntOption("bestn", (int*) ¶ms.nBest, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("limsAlign", ¶ms.limsAlign, "", CommandLineParser::PositiveInteger); - clp.RegisterFlagOption("printOnlyBest", ¶ms.printOnlyBest, ""); - clp.RegisterFlagOption("outputByThread", ¶ms.outputByThread, ""); - clp.RegisterFlagOption("rbao", ¶ms.refineBetweenAnchorsOnly, ""); - clp.RegisterFlagOption("allowAdjacentIndels", ¶ms.forPicard, ""); - clp.RegisterFlagOption("onegap", ¶ms.separateGaps, ""); - clp.RegisterFlagOption("allowAdjacentIndels", ¶ms.forPicard, ""); - clp.RegisterFlagOption("placeRepeatsRandomly", ¶ms.placeRandomly, ""); - clp.RegisterIntOption("randomSeed", ¶ms.randomSeed, "", CommandLineParser::Integer); - clp.RegisterFlagOption("extend", ¶ms.extendAlignments, ""); - clp.RegisterIntOption("branchExpand", ¶ms.anchorParameters.branchExpand, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("maxExtendDropoff", ¶ms.maxExtendDropoff, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("nucmer", ¶ms.emulateNucmer, ""); - clp.RegisterIntOption("maxExpand", ¶ms.maxExpand, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("minExpand", ¶ms.minExpand, "", CommandLineParser::NonNegativeInteger); - clp.RegisterStringOption("seqdb", ¶ms.seqDBName, ""); - clp.RegisterStringOption("anchors", ¶ms.anchorFileName, ""); - clp.RegisterStringOption("clusters", ¶ms.clusterFileName, ""); - clp.RegisterFlagOption("samplePaths", (bool*) ¶ms.samplePaths, ""); - clp.RegisterFlagOption("noStoreMapQV", ¶ms.storeMapQV, ""); - clp.RegisterFlagOption("nowarp", (bool*) ¶ms.nowarp, ""); - clp.RegisterFlagOption("noRefineAlign", (bool*) ¶ms.refineAlign, ""); - clp.RegisterFlagOption("guidedAlign", (bool*)¶ms.useGuidedAlign, ""); - 
clp.RegisterFlagOption("useGuidedAlign", (bool*)&trashbinBool, ""); - clp.RegisterFlagOption("noUseGuidedAlign", (bool*)¶ms.useGuidedAlign, ""); - clp.RegisterFlagOption("header", (bool*)¶ms.printHeader, ""); - clp.RegisterIntOption("bandSize", ¶ms.bandSize, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("extendBandSize", ¶ms.extendBandSize, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("guidedAlignBandSize", ¶ms.guidedAlignBandSize, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("maxAnchorsPerPosition", ¶ms.anchorParameters.maxAnchorsPerPosition, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("stopMappingOnceUnique", (int*) ¶ms.anchorParameters.stopMappingOnceUnique, "", CommandLineParser::NonNegativeInteger); - clp.RegisterStringOption("out", ¶ms.outFileName, ""); - clp.RegisterIntOption("match", ¶ms.match, "", CommandLineParser::Integer); - clp.RegisterIntOption("mismatch", ¶ms.mismatch, "", CommandLineParser::Integer); - clp.RegisterIntOption("minMatch", ¶ms.minMatchLength, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("maxMatch", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("maxLCPLength", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("indel", ¶ms.indel, "", CommandLineParser::Integer); - clp.RegisterIntOption("insertion", ¶ms.insertion, "", CommandLineParser::Integer); - clp.RegisterIntOption("deletion", ¶ms.deletion, "", CommandLineParser::Integer); - clp.RegisterIntOption("idsIndel", ¶ms.idsIndel, "", CommandLineParser::Integer); - clp.RegisterIntOption("sdpindel", ¶ms.sdpIndel, "", CommandLineParser::Integer); - clp.RegisterIntOption("sdpIns", ¶ms.sdpIns, "", CommandLineParser::Integer); - clp.RegisterIntOption("sdpDel", ¶ms.sdpDel, "", CommandLineParser::Integer); - clp.RegisterFloatOption("indelRate", ¶ms.indelRate, "", 
CommandLineParser::NonNegativeFloat); - clp.RegisterFloatOption("minRatio", ¶ms.minRatio, "", CommandLineParser::NonNegativeFloat); - clp.RegisterFloatOption("sdpbypass", ¶ms.sdpBypassThreshold, "", CommandLineParser::NonNegativeFloat); - clp.RegisterFloatOption("minFrac", &trashbinFloat, "", CommandLineParser::NonNegativeFloat); - clp.RegisterIntOption("maxScore", ¶ms.maxScore, "", CommandLineParser::Integer); - clp.RegisterStringOption("bwt", ¶ms.bwtFileName, ""); - clp.RegisterIntOption("m", ¶ms.printFormat, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("sam", ¶ms.printSAM, ""); -#ifdef USE_PBBAM - clp.RegisterFlagOption("bam", ¶ms.printBAM, ""); -#endif - clp.RegisterStringOption("clipping", ¶ms.clippingString, ""); - clp.RegisterIntOption("sdpTupleSize", ¶ms.sdpTupleSize, "", CommandLineParser::PositiveInteger); - clp.RegisterIntOption("pvaltype", ¶ms.pValueType, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("start", ¶ms.startRead, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("stride", ¶ms.stride, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFloatOption("subsample", ¶ms.subsample, "", CommandLineParser::PositiveFloat); - clp.RegisterIntOption("nproc", ¶ms.nProc, "", CommandLineParser::PositiveInteger); - clp.RegisterFlagOption("sortRefinedAlignments",(bool*) ¶ms.sortRefinedAlignments, ""); - clp.RegisterIntOption("quallc", ¶ms.qualityLowerCaseThreshold, "", CommandLineParser::Integer); - clp.RegisterFlagOption("v", (bool*) ¶ms.verbosity, ""); - clp.RegisterIntOption("V", ¶ms.verbosity, "Specify a level of verbosity.", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("contextAlignLength", ¶ms.anchorParameters.contextAlignLength, "", CommandLineParser::PositiveInteger); - clp.RegisterFlagOption("skipLookupTable", ¶ms.anchorParameters.useLookupTable, ""); - clp.RegisterStringOption("metrics", ¶ms.metricsFileName, ""); - clp.RegisterStringOption("lcpBounds", 
¶ms.lcpBoundsFileName, ""); - clp.RegisterStringOption("fullMetrics", ¶ms.fullMetricsFileName, ""); - clp.RegisterIntOption("nbranch", ¶ms.anchorParameters.numBranches, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("divideByAdapter", ¶ms.byAdapter, ""); - clp.RegisterFlagOption("useQuality", ¶ms.ignoreQualities, ""); - clp.RegisterFlagOption("noFrontAlign", ¶ms.extendFrontAlignment, ""); - clp.RegisterIntOption("minReadLength", ¶ms.minReadLength, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("maxReadLength", ¶ms.maxReadLength, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("minSubreadLength", ¶ms.minSubreadLength, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("minRawSubreadScore", ¶ms.minRawSubreadScore, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("minAvgQual", ¶ms.minAvgQual, "", CommandLineParser::Integer); - clp.RegisterFlagOption("advanceHalf", ¶ms.advanceHalf, ""); - clp.RegisterIntOption("advanceExactMatches", ¶ms.anchorParameters.advanceExactMatches, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("useccs", ¶ms.useCcs, ""); - clp.RegisterFlagOption("useccsdenovo", ¶ms.useCcsOnly, ""); - clp.RegisterFlagOption("useccsall", ¶ms.useAllSubreadsInCcs, ""); - clp.RegisterFlagOption("extendDenovoCCSSubreads", ¶ms.extendDenovoCCSSubreads, ""); - clp.RegisterFlagOption("noRefineAlignments", ¶ms.refineAlignments, ""); - clp.RegisterIntOption("nCandidates", ¶ms.nCandidates, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("useTemp", (bool*) ¶ms.tempDirectory, ""); - clp.RegisterFlagOption("noSplitSubreads", ¶ms.mapSubreadsSeparately, ""); - clp.RegisterFlagOption("concordant", ¶ms.concordant, ""); - // When -concordant is turned on, blasr first selects a subread (e.g., the median length full-pass subread) - // of a zmw as template, maps the template subread to a reference, then infers directions of all other subreads - 
// of the same zmw based on direction of the template, and finally maps all other subreads to the same - // genomic coordinates as the template. When -concordantAlignBothDirections is turned on, blasr will align - // all other subreads both forwardly and backwardly, without infering their directions. This is a hidden - // diagnostic option only useful for analyzing movies which have lots of un-identified or missed adapters such - // that directions of subreads can not be inferred accurately. - clp.RegisterFlagOption("concordantAlignBothDirections", ¶ms.concordantAlignBothDirections, ""); - clp.RegisterIntOption("flankSize", ¶ms.flankSize, "", CommandLineParser::NonNegativeInteger); - clp.RegisterStringOption("titleTable", ¶ms.titleTableName, ""); - clp.RegisterFlagOption("useSensitiveSearch", ¶ms.doSensitiveSearch, ""); - clp.RegisterFlagOption("ignoreRegions", ¶ms.useRegionTable, ""); - clp.RegisterFlagOption("ignoreHQRegions", ¶ms.useHQRegionTable, ""); - clp.RegisterFlagOption("computeAlignProbability", ¶ms.computeAlignProbability, ""); - clp.RegisterStringOption("unaligned", ¶ms.unalignedFileName, ""); - clp.RegisterFlagOption("global", ¶ms.doGlobalAlignment, ""); - clp.RegisterIntOption("globalChainType", ¶ms.globalChainType, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("noPrintSubreadTitle", (bool*) ¶ms.printSubreadTitle, ""); - clp.RegisterIntOption("saLookupTableLength", ¶ms.lookupTableLength, "", CommandLineParser::PositiveInteger); - clp.RegisterFlagOption("useDetailedSDP", ¶ms.detailedSDPAlignment, ""); - clp.RegisterFlagOption("nouseDetailedSDP", &trashbinBool, ""); - clp.RegisterIntOption("sdpFilterType", ¶ms.sdpFilterType, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("scoreType", ¶ms.scoreType, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("h", ¶ms.printVerboseHelp, ""); - clp.RegisterFlagOption("help", ¶ms.printDiscussion, ""); - clp.RegisterFloatOption("accuracyPrior", 
¶ms.readAccuracyPrior, "", CommandLineParser::NonNegativeFloat); - // holeNumberRangesStr is a string of comma-delimited hole number ranges, such as '1,2,3,10-15'. - // Blasr only analyzes reads whose hole numbers are in the specified hole number ranges. - clp.RegisterStringOption("holeNumbers", ¶ms.holeNumberRangesStr, ""); - clp.RegisterIntOption("substitutionPrior", ¶ms.substitutionPrior, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("deletionPrior", ¶ms.globalDeletionPrior, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("recurseOver", ¶ms.recurseOver, "", CommandLineParser::NonNegativeInteger); - clp.RegisterStringOption("scoreMatrix", ¶ms.scoreMatrixString, ""); - clp.RegisterFlagOption("printDotPlots", ¶ms.printDotPlots, ""); - clp.RegisterFlagOption("preserveReadTitle", ¶ms.preserveReadTitle,""); - clp.RegisterFlagOption("forwardOnly", ¶ms.forwardOnly,""); - clp.RegisterFlagOption("affineAlign", ¶ms.affineAlign, ""); - clp.RegisterIntOption("affineOpen", ¶ms.affineOpen, "", CommandLineParser::NonNegativeInteger); - clp.RegisterIntOption("affineExtend", ¶ms.affineExtend, "", CommandLineParser::NonNegativeInteger); - clp.RegisterFlagOption("scaleMapQVByNClusters", ¶ms.scaleMapQVByNumSignificantClusters, "", false); - clp.RegisterFlagOption("printSAMQV", ¶ms.printSAMQV, "", false); - clp.RegisterFlagOption("cigarUseSeqMatch", ¶ms.cigarUseSeqMatch, ""); - clp.RegisterStringListOption("samQV", ¶ms.samQV, ""); - clp.RegisterFlagOption("fastMaxInterval", ¶ms.fastMaxInterval, "", false); - clp.RegisterFlagOption("aggressiveIntervalCut", ¶ms.aggressiveIntervalCut, "", false); - clp.RegisterFlagOption("fastSDP", ¶ms.fastSDP, "", false); - clp.RegisterStringOption("concordantTemplate", ¶ms.concordantTemplate, "typicalsubread"); - - RegisterFilterOptions(clp, params.minAlnLength, params.minPctSimilarity, params.minPctAccuracy, - params.hitPolicyStr, trashbinBool=true, trashbinInt, params.maxScore); -} - -const string 
BlasrHelp(MappingParameters & params) { - stringstream helpStream; - helpStream << " Options for blasr " << endl - << " Basic usage: 'blasr reads.{bam|fasta|bax.h5|fofn} genome.fasta [-options] " << endl - << " option\tDescription (default_value)." << endl << endl - << " Input Files." << endl - << " reads.bam is a PacBio BAM file of reads." << endl - << " This is the preferred input to blasr because rich quality" << endl - << " value (insertion,deletion, and substitution quality values) information is " << endl - << " maintained. The extra quality information improves variant detection and mapping"< 3." << endl - << " -maxMatch l (inf)" << endl - << " Stop mapping a read to the genome when the lcp length reaches l. " << endl - << " This is useful when the query is part of the reference, for example when " < -using namespace std; - -/// Register options for filtering alignments. -void RegisterFilterOptions(CommandLineParser & clp, int & minAlnLength, - float & minPctSimilarity, float & minPctAccuracy, - string & hitPolicyStr, bool & useScoreCutoff, - int & scoreSignInt, int & scoreCutoff) { - ScoreSign ss = static_cast(scoreSignInt); - Score sc(static_cast(scoreCutoff), ss); - FilterCriteria fc(static_cast(minAlnLength), - minPctSimilarity, minPctAccuracy, - useScoreCutoff, sc); - - HitPolicy hp("randombest", ScoreSign::NEGATIVE); - - clp.RegisterIntOption("minAlnLength", &minAlnLength, - fc.MinAlnLengthHelp(), - CommandLineParser::PositiveInteger); - clp.RegisterIntOption("minAlignLength", &minAlnLength, - "Alias of -minAlnLength", - CommandLineParser::PositiveInteger); - clp.RegisterIntOption("minLength", &minAlnLength, - "Alias of -minAlnLength", - CommandLineParser::PositiveInteger); - - clp.RegisterFloatOption("minPctSimilarity", &minPctSimilarity, - fc.MinPctSimilarityHelp(), - CommandLineParser::PositiveFloat); - clp.RegisterFloatOption("minPctIdentity", &minPctSimilarity, - "Alias of -minPctSimilarity", - CommandLineParser::PositiveFloat); - - 
clp.RegisterFloatOption("minPctAccuracy", &minPctAccuracy, - fc.MinPctAccuracyHelp(), - CommandLineParser::PositiveFloat); - clp.RegisterFloatOption("minAccuracy", &minPctAccuracy, - "Alias of -minPctAccuracy", - CommandLineParser::PositiveFloat); - - clp.RegisterStringOption("hitPolicy", &hitPolicyStr, hp.Help()); - - clp.RegisterIntOption("scoreSign", &scoreSignInt, - fc.ScoreSignHelp(), - CommandLineParser::Integer); - - clp.RegisterIntOption("scoreCutoff", &scoreCutoff, - fc.ScoreCutoffHelp(), - CommandLineParser::Integer); -} diff -Nru blasr-0~20151014+git8e668be/LICENSE blasr-5.3/LICENSE --- blasr-0~20151014+git8e668be/LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/LICENSE 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,34 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. diff -Nru blasr-0~20151014+git8e668be/makefile blasr-5.3/makefile --- blasr-0~20151014+git8e668be/makefile 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/makefile 2016-09-12 08:56:50.000000000 +0000 @@ -9,13 +9,14 @@ echo $(MAKEFILE_LIST) echo ${SRCDIR} -CXXFLAGS += -O3 -g +GET_SHA1 := $(shell git -C ${SRCDIR} describe --always --dirty='*') +CXXFLAGS += -O3 -g -DSHA1_7=\"${GET_SHA1}\" CXXOPTS += \ -std=c++0x -pedantic \ - -Wall -Wuninitialized -Wno-div-by-zero \ - -MMD -MP -w -fpermissive + -Wall -Wextra -Wno-div-by-zero -Wno-overloaded-virtual \ + -MMD -MP GCXXFLAGS := -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer -CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} +override CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} #INC_DIRS:=${LIBBLASR_INC} ${LIBPBIHDF_INC} ${LIBPBDATA_INC} ${PBBAM_INC} ${HTSLIB_INC} ${HDF5_INC} ${ZLIB_INC} #LIB_DIRS:=${LIBBLASR_LIB} ${LIBPBIHDF_LIB} ${LIBPBDATA_LIB} ${PBBAM_LIB} ${HTSLIB_LIB} ${HDF5_LIB} ${ZLIB_LIB} #LDLIBS := \ @@ -25,13 +26,14 @@ # HDF5 needs -ldl, but mobs does not pass it in. 
-CPPFLAGS:=-I${SRCDIR}/include ${CPPFLAGS} - SRCS := Blasr.cpp OBJS := ${SRCS:.cpp=.o} DEPS := ${SRCS:.cpp=.d} -LD_LIBRARY_PATH:=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB}:${LD_LIBRARY_PATH} +override BLASR_PATH=${SRCDIR}/ +export BLASR_PATH + +override LD_LIBRARY_PATH:=${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB}:${HDF5_LIB}:${HTSLIB_LIB}:${PBBAM_LIB}:${ZLIB_LIB}:${LD_LIBRARY_PATH} export LD_LIBRARY_PATH # Note: On macosx, this would be DYLD_LIBRARY_PATH. @@ -39,17 +41,19 @@ init-submodule: ${MAKE} update-submodule + ${MAKE} configure-submodule ${MAKE} build-submodule update-submodule: git submodule update --init +configure-submodule: + ${MAKE} -f ${SRCDIR}/sub.mk configure-submodule + build-submodule: - # DON'T use pbbam which is not on github. - cd libcpp && NOPBBAM=true HDF5_LIB=${HDF5_LIB} HDF5_INC=${HDF5_INC} ./configure.py ${MAKE} -C libcpp -submodule-clean: +distclean-submodule: ${RM} -r libcpp # The rules above must be run separately. @@ -64,20 +68,12 @@ makeextrautils: ${MAKE} -C extrautils -CTESTS := \ -ctest/affineAlign.t ctest/bamOut.t ctest/ccsH5.t ctest/filtercriteria.t ctest/m0-5.t ctest/samNM.t \ -ctest/aggressiveIntervalCut.t ctest/bug25328.t ctest/concordant.t ctest/fofn.t ctest/multipart.t ctest/useccsallBestN1.t \ -ctest/alignScore.t ctest/bug25741.t ctest/ecoli.t ctest/hitpolicy.t ctest/noSplitSubreads.t ctest/useccsallLargeGenome.t\ -ctest/bamIn.t ctest/bug25766.t ctest/fastMaxInterval.t ctest/holeNumbers.t ctest/open_fail.t ctest/verbose.t - -SLOW_CTESTS := ctest/bug25328.t ctest/useccsallLargeGenome.t - cramtests: blasr utils - cram -v --shell=/bin/bash ${CTESTS} + ${MAKE} -f cram.mk cramtests ${MAKE} -C utils cramtests cramfast: blasr utils - cram -v --shell=/bin/bash $(filter-out ${SLOW_CTESTS},${CTESTS}) + ${MAKE} -f cram.mk cramfast ${MAKE} -C utils cramfast gtest: blasr diff -Nru blasr-0~20151014+git8e668be/README.INSTALL.md blasr-5.3/README.INSTALL.md --- blasr-0~20151014+git8e668be/README.INSTALL.md 2015-10-14 
07:00:07.000000000 +0000 +++ blasr-5.3/README.INSTALL.md 2016-09-12 08:56:50.000000000 +0000 @@ -1,5 +1,9 @@ ## Installation +### See a step by step Blasr installation example on Blasr wiki page + + https://github.com/PacificBiosciences/blasr/wiki/Step-by-step-blasr-installation-example + ### Download source code * To pull this project from git hub to your local system: @@ -8,7 +12,11 @@ * To sync your code with the latest git code base: - git pull -u origin master && git submodule update --init + git pull --rebase origin master && git submodule update --init + +* To update the submodule: + + make update-submodule ### Requirements @@ -37,6 +45,10 @@ export HDF5_INC=path_to_your_hdf5_include && export HDF5_LIB=path_to_your_hdf5_lib +* To configure submodule: + + make configure-submodule + ### Build * To make the 'libcpp' libraries: @@ -71,6 +83,16 @@ make blasr ./blasr +### CXXFLAGS + +* For optimized builds: + + ./configure.py CXXFLAGS=-O3 ... + +* For debug builds: + + ./configure.py CXXFLAGS=-g ... + ## Other issues ### Static binaries If you want static binaries, drop `--shared` when you run configure.py. In that case, you diff -Nru blasr-0~20151014+git8e668be/rules.mk blasr-5.3/rules.mk --- blasr-0~20151014+git8e668be/rules.mk 2015-10-14 07:00:07.000000000 +0000 +++ blasr-5.3/rules.mk 2016-09-12 08:56:50.000000000 +0000 @@ -1,16 +1,16 @@ INCDIRS := \ - ${BLASR_INC} \ ${LIBBLASR_INC} \ - ${LIBPBDATA_INC} \ ${LIBPBIHDF_INC} \ + ${LIBPBDATA_INC} +SYSINCDIRS := \ ${PBBAM_INC} \ ${HDF5_INC} \ ${HTSLIB_INC} \ ${BOOST_INC} LIBDIRS := \ ${LIBBLASR_LIB} \ - ${LIBPBDATA_LIB} \ ${LIBPBIHDF_LIB} \ + ${LIBPBDATA_LIB} \ ${PBBAM_LIB} \ ${HDF5_LIB} \ ${HTSLIB_LIB} \ @@ -20,8 +20,8 @@ LDLIBS+= \ ${LIBPBIHDF_LIBFLAGS} \ ${LIBBLASR_LIBFLAGS} \ - ${LIBPBDATA_LIBFLAGS} \ ${LIBPBIHDF_LIBFLAGS} \ + ${LIBPBDATA_LIBFLAGS} \ ${PBBAM_LIBFLAGS} \ ${HDF5_LIBFLAGS} \ ${HTSLIB_LIBFLAGS} \ @@ -33,4 +33,5 @@ # We repeat LIBPBIHDF_LIBFLAGS because of a circular dependency. See #77. 
CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) +CPPFLAGS+=$(patsubst %,-I%,${SYSINCDIRS}) LDFLAGS+=$(patsubst %,-L%,${LIBDIRS}) diff -Nru blasr-0~20151014+git8e668be/sub.mk blasr-5.3/sub.mk --- blasr-0~20151014+git8e668be/sub.mk 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/sub.mk 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,9 @@ + +SRCDIR:=$(dir $(realpath $(firstword ${MAKEFILE_LIST}))) +-include ${CURDIR}/defines.mk +-include ${SRCDIR}/rules.mk + +export + +configure-submodule: + cd libcpp && ./configure.py diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/BUILD.txt blasr-5.3/utils/bam2bax/BUILD.txt --- blasr-0~20151014+git8e668be/utils/bam2bax/BUILD.txt 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/BUILD.txt 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,48 @@ +Build instructions for developers: +Assuming that blasr and blaser_libcpp is placed under //depot/software/smrtanalysis/bioinformatics/ext/pi + + $ cd + $ module load boost + $ mkdir build; cd build; cmake .. 
+ $ make + $ ../tests/bin/test_bam2bax # to test bam2bax exe + + +Build instructions for users: + $ cd + $ mkdir build; cd build; + $ cmake -DPacBioBAM_INCLUDE_DIRS= \ + -DHTSLIB_INCLUDE_DIRS= \ + -DPacBioBAM_LIBRARIES= \ + -DHTSLIB_LIBRARIES= \ + -DPBDATA_INCLUDE_DIRS= + -DPBDATA_LIBRARIES= \ + -DPBIHDF_INCLUDE_DIRS= + -DPBIHDF_LIBRARIES= \ + -DBLASR_INCLUDE_DIRS= + -DBLASR_LIBRARIES= \ + -DHDF5_INCLUDE_DIRS= + -DHDF5_CPP_LIBRARIES= \ + -DHDF5_LIBRARIES= \ + -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ + ../ + + $ make + $ ../tests/bin/test_bam2bax # to test bam2bax exe + +# e.g., +#cmake -DPacBioBAM_INCLUDE_DIRS=$smrtanalysis/bioinformatics/lib/cpp/pbbam/include \ +# -DHTSLIB_INCLUDE_DIRS=$smrtanalysis/bioinformatics/lib/cpp/htslib \ +# -DPacBioBAM_LIBRARIES=$smrtanalysis/bioinformatics/lib/cpp/pbbam/lib/libpbbam.a \ +# -DHTSLIB_LIBRARIES=$smrtanalysis/bioinformatics/lib/cpp/htslib/libhts.a \ +# -DPBDATA_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata \ +# -DPBDATA_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata/libpbdata.a \ +# -DPBIHDF_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf \ +# -DPBIHDF_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf/libpbihdf.a \ +# -DBLASR_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/ \ +# -DBLASR_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/libblasr.a \ +# -DHDF5_INCLUDE_DIRS=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/include \ +# -DHDF5_CPP_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5_cpp.a \ +# -DHDF5_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5.a \ +# -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ +# ../ diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/CMakeLists.txt blasr-5.3/utils/bam2bax/CMakeLists.txt --- blasr-0~20151014+git8e668be/utils/bam2bax/CMakeLists.txt 1970-01-01 
00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/CMakeLists.txt 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,133 @@ +######################################################################## +# CMake build script for Bam2Bax executable. +######################################################################## + +project(Bam2Bax CXX C) +cmake_minimum_required(VERSION 2.8) + +# project version +set(Bam2Bax_MAJOR_VERSION 0) +set(Bam2Bax_MINOR_VERSION 0) +set(Bam2Bax_PATCH_VERSION 1) +set(Bam2Bax_VERSION + "${Bam2Bax_MAJOR_VERSION}.${Bam2Bax_MINOR_VERSION}.${Bam2Bax_PATCH_VERSION}" +) + +# build-time options +option(Bam2Bax_build_tests "Build Bam2Bax's unit tests." ON) + +# main project paths +set(Bam2Bax_RootDir ${Bam2Bax_SOURCE_DIR}) +set(Bam2Bax_DocsDir ${Bam2Bax_RootDir}/docs) +set(Bam2Bax_SourceDir ${Bam2Bax_RootDir}/src) +set(Bam2Bax_TestsDir ${Bam2Bax_RootDir}/tests) +set(Bam2Bax_ThirdPartyDir ${Bam2Bax_RootDir}/third-party) + +if (NOT Bam2Bax_OutputDir) + set(Bam2Bax_OutputDir ${Bam2Bax_RootDir}) +endif() + +set(Bam2Bax_BinDir ${Bam2Bax_OutputDir}/bin) +file(MAKE_DIRECTORY ${Bam2Bax_BinDir}) + +# shared & third-party paths + +if (NOT PBDATA_ROOT_DIR) + set(PBDATA_ROOT_DIR ${Bam2Bax_RootDir}/../../../blasr_libcpp) +endif() + +# find (existing) libraries needed by executable and tests +if (NOT BLASR_INCLUDE_DIRS OR NOT BLASR_LIBRARIES) + find_library(BLASR_LIBRARIES blasr ${PBDATA_ROOT_DIR}/alignment) + set(BLASR_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/alignment) +endif() + +if (NOT PBIHDF_INCLUDE_DIRS OR NOT PBIHDF_LIBRARIES) + find_library(PBIHDF_LIBRARIES pbihdf ${PBDATA_ROOT_DIR}/hdf) + set(PBIHDF_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/hdf) +endif() + +if (NOT PBDATA_INCLUDE_DIRS OR NOT PBDATA_LIBRARIES) + find_library(PBDATA_LIBRARIES pbdata ${PBDATA_ROOT_DIR}/pbdata) + set(PBDATA_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/pbdata) +endif() + +if (NOT HDF5_INCLUDE_DIRS OR NOT HDF5_LIBRARIES) + if (NOT HDF5_RootDir) + set(HDF5_RootDir 
${Bam2Bax_RootDir}/../../../../../../prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404) + endif() + + set(HDF5_INCLUDE_DIRS ${HDF5_RootDir}/include) + set(HDF5_LibDir ${HDF5_RootDir}/lib) + + find_library(HDF5_LIBRARIES hdf5 ${HDF5_LibDir} NO_CMAKE_SYSTEM_PATH) + find_library(HDF5_CPP_LIBRARIES hdf5_cpp ${HDF5_LibDir} NO_CMAKE_SYSTEM_PATH) +endif() + +if (NOT PacBioBAM_INCLUDE_DIRS OR NOT PacBioBAM_LIBRARIES + OR NOT HTSLIB_INCLUDE_DIRS OR NOT HTSLIB_LIBRARIES) + set(PacBioBAM_RootDir ${Bam2Bax_RootDir}/../../../../../lib/cpp/pbbam) + add_subdirectory(${PacBioBAM_RootDir} external/build/pbbam) +endif() + +if (NOT Boost_INCLUDE_DIRS) + find_package(Boost REQUIRED) +endif() + +if (NOT ZLIB_LIBRARIES OR NOT ZLIB_INCLUDE_DIRS) + find_package(ZLIB REQUIRED) +endif() + +# shared CXX flags for src & tests +include(CheckCXXCompilerFlag) +set(Bam2Bax_CXX_FLAGS "-g -std=c++11 -Wall") + +# quash warnings from pbdata +check_cxx_compiler_flag("-Wno-overloaded-virtual" HAS_NO_OVERLOADED_VIRTUAL) +if(HAS_NO_OVERLOADED_VIRTUAL) + set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-overloaded-virtual") +endif() +#check_cxx_compiler_flag("-Wno-unused-private-field" HAS_NO_UNUSED_PRIVATE_FIELD) +#if(HAS_NO_UNUSED_PRIVATE_FIELD) +# set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-private-field") +#endif() +check_cxx_compiler_flag("-Wno-unused-variable" HAS_NO_UNUSED_VARIABLE) +if(HAS_NO_UNUSED_VARIABLE) + set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-variable") +endif() +check_cxx_compiler_flag("-Wno-uninitialized" HAS_NO_UNINITIALIZED) +if(HAS_NO_UNINITIALIZED) + set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-uninitialized") +endif() +check_cxx_compiler_flag("-Wunused-but-set-variable" HAS_UNUSED_BUT_SET_VARIABLE) +if(HAS_UNUSED_BUT_SET_VARIABLE) + set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wunused-but-set-variable") +endif() +check_cxx_compiler_flag("-Wno-deprecated-declarations" HAS_NO_DEPRECATED_DECLARATIONS) +if(HAS_NO_DEPRECATED_DECLARATIONS) + set(Bam2Bax_CXX_FLAGS 
"${Bam2Bax_CXX_FLAGS} -Wno-deprecated-declarations") +endif() +# NOTE: -Wno-unused-local-typedefs used to quash clang warnings w/ Boost +check_cxx_compiler_flag("-Wno-unused-local-typedef" HAS_NO_UNUSED_LOCAL_TYPEDEF) +if(HAS_NO_UNUSED_LOCAL_TYPEDEF) + set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-local-typedef") +endif() + +SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${Bam2Bax_LINKER_FLAGS}" ) + +# main exe src +add_subdirectory(src) + +# testing +if(Bam2Bax_build_tests) + + enable_testing() + + if (NOT GTEST_SRC_DIR) + set(GTEST_SRC_DIR ../gtest) + endif() + + add_subdirectory(${GTEST_SRC_DIR} external/gtest/build) + add_subdirectory(tests) +endif() + diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/makefile blasr-5.3/utils/bam2bax/makefile --- blasr-0~20151014+git8e668be/utils/bam2bax/makefile 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/makefile 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,30 @@ +.PHONY=all + +SRCDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +-include ${CURDIR}/../../defines.mk +include ${SRCDIR}/../../rules.mk + +all: ${CURDIR}/src/*.cpp ${CURDIR}/src/*.h ${CURDIR}/tests/src/*.cpp ${CURDIR}/tests/src/*.h + @mkdir -p ${CURDIR}/build && \ + cd ${CURDIR}/build && \ + cmake -DBOOST_ROOT=${BOOST_ROOT} \ + -DPacBioBAM_INCLUDE_DIRS=${PBBAM_INC} \ + -DHTSLIB_INCLUDE_DIRS=${HTSLIB_INC} \ + -DPacBioBAM_LIBRARIES=${PBBAM_LIB}/libpbbam${SH_LIB_EXT} \ + -DHTSLIB_LIBRARIES=${HTSLIB_LIB}/libhts${SH_LIB_EXT} \ + -DPBDATA_INCLUDE_DIRS=${LIBPBDATA_INC} \ + -DPBDATA_LIBRARIES=${LIBPBDATA_LIB}/libpbdata${SH_LIB_EXT} \ + -DPBIHDF_INCLUDE_DIRS=${LIBPBIHDF_INC} \ + -DPBIHDF_LIBRARIES=${LIBPBIHDF_LIB}/libpbihdf${SH_LIB_EXT} \ + -DBLASR_INCLUDE_DIRS=${LIBBLASR_INC}/ \ + -DBLASR_LIBRARIES=${LIBBLASR_LIB}/libblasr${SH_LIB_EXT} \ + -DHDF5_INCLUDE_DIRS=${HDF5_INC} \ + -DHDF5_CPP_LIBRARIES=${HDF5_LIB}/libhdf5_cpp${SH_LIB_EXT} \ + -DHDF5_LIBRARIES=${HDF5_LIB}/libhdf5${SH_LIB_EXT} \ + 
-DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ + ../ && \ + make + +clean: + @rm -rf ${CURDIR}/bin/ + @rm -rf ${CURDIR}/build diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/README.md blasr-5.3/utils/bam2bax/README.md --- blasr-0~20151014+git8e668be/utils/bam2bax/README.md 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/README.md 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,11 @@ +#bam2bax# + +##Usage## + + bam2bax movie.subreads.bam movie.scraps.bam -o movie + + # movie.bax.h5 will be generated. + +##Example## + + tests/example/end-to-end.sh diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxConverter.h blasr-5.3/utils/bam2bax/src/Bam2BaxConverter.h --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxConverter.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2BaxConverter.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,38 @@ +// Author: Yuan Li +#ifndef _BAM2BAXCONVERTER_H_ +#define _BAM2BAXCONVERTER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "HDFFile.hpp" +#include "RegionsAdapter.h" +#include "IConverter.h" + + + +template +class Bam2BaxConverter : public IConverter +{ +public: + Bam2BaxConverter(Settings & settings) + :IConverter(settings) {} + + ~Bam2BaxConverter(void) {} + + bool Run(void) {return ConvertFile();} + +protected: + bool ConvertFile(void); +}; + +#include "Bam2BaxConverterImpl.hpp" +#endif diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxConverterImpl.hpp blasr-5.3/utils/bam2bax/src/Bam2BaxConverterImpl.hpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxConverterImpl.hpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2BaxConverterImpl.hpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,57 @@ +// Author: Yuan Li +#ifndef BAM2BAX_CONVERTER_IMPL_HPP +#define BAM2BAX_CONVERTER_IMPL_HPP +#include +#include "MetadataWriter.h" +#include 
"Bam2BaxInternal.h" +#include +#include + + +template +bool Bam2BaxConverter::ConvertFile(void) { + + + // Write metadata.xml to parent directory of Bax.h5. + if (not settings_.outputMetadataFilename.empty()) + MetadataWriter metaWriter_(settings_.outputMetadataFilename, + rg, + settings_.outputAnalysisDirname); + + T_HDFWRITER writer(outfn, + rg.BasecallerVersion(), + scandata.BaseMap(), + qvs, + Bam2BaxDefaults::Bax_Regions_RegionTypes); + + if (settings_.traceFilename.empty()) { + writer.WriteScanData(scandata); + } else { + HDFFile traceFile; + traceFile.Open(settings_.traceFilename, H5F_ACC_RDONLY); + writer.CopyObject(traceFile, "/ScanData"); + traceFile.Close(); + } + + if (not settings_.subreadsBamFilename.empty() and + not settings_.scrapsBamFilename.empty()) { + + // Stitch subreads and scraps in order to reconstruct polymerase reads. + PacBio::BAM::VirtualPolymeraseReader reader(settings_.subreadsBamFilename, + settings_.scrapsBamFilename); + while(reader.HasNext()) { + // FIXME: pbbam should not crash when reading internal pulse features. 
+ const PacBio::BAM::VirtualPolymeraseBamRecord & record = reader.Next(); + SMRTSequence smrt; + smrt.Copy(record, true); + std::vector ras = RegionsAdapter::ToRegionAnnotations(record, regionTypes); + if (not writer.WriteOneZmw(smrt, ras) or not writer.Errors().empty()) { break; } + writer.Flush(); + } + if (not settings_.ignoreQV) writer.WriteFakeDataSets(); + for (auto error: writer.Errors()) { AddErrorMessage(error); } + } + + return errors_.empty(); +} +#endif diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2Bax.cpp blasr-5.3/utils/bam2bax/src/Bam2Bax.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2Bax.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2Bax.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,29 @@ +// Author: Yuan Li + +#include // getcwd +#include +#include + +#include "Bam2Bax.h" +#include "Converter.h" + +using namespace std; + +int Bam2Bax::Run(Settings& settings) { + + bool success = false; + Converter converter(settings); + + if (converter.Run()) { + success = true; + } + + // return success/fail + if (success) + return EXIT_SUCCESS; + else { + for (const string& e : converter.Errors()) + cerr << "ERROR: " << e << endl; + return EXIT_FAILURE; + } +} diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2Bax.h blasr-5.3/utils/bam2bax/src/Bam2Bax.h --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2Bax.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2Bax.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,13 @@ +// Author: Yuan Li +#ifndef BAM2BAX_H +#define BAM2BAX_H + +class Settings; + +class Bam2Bax +{ +public: + static int Run(Settings& settings); +}; + +#endif // BAM2BAX_H diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxInternal.h blasr-5.3/utils/bam2bax/src/Bam2BaxInternal.h --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxInternal.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2BaxInternal.h 2016-09-12 
08:56:50.000000000 +0000 @@ -0,0 +1,78 @@ +// Author: Yuan Li + +#ifndef _BAM2BAXINTERNAL_H_ +#define _BAM2BAXINTERNAL_H_ +#include + +//namespace internal +namespace internal { + /// \name \{ + static const std::vector QVEnums = { + PacBio::BAM::BaseFeature::DELETION_QV + , PacBio::BAM::BaseFeature::DELETION_TAG + , PacBio::BAM::BaseFeature::INSERTION_QV + , PacBio::BAM::BaseFeature::MERGE_QV + , PacBio::BAM::BaseFeature::SUBSTITUTION_QV + , PacBio::BAM::BaseFeature::SUBSTITUTION_TAG + , PacBio::BAM::BaseFeature::IPD + , PacBio::BAM::BaseFeature::PULSE_WIDTH + , PacBio::BAM::BaseFeature::PKMID + , PacBio::BAM::BaseFeature::PKMEAN + , PacBio::BAM::BaseFeature::LABEL + , PacBio::BAM::BaseFeature::LABEL_QV + , PacBio::BAM::BaseFeature::ALT_LABEL + , PacBio::BAM::BaseFeature::ALT_LABEL_QV + , PacBio::BAM::BaseFeature::PULSE_MERGE_QV + , PacBio::BAM::BaseFeature::PULSE_CALL + , PacBio::BAM::BaseFeature::START_FRAME + , PacBio::BAM::BaseFeature::PULSE_CALL_WIDTH + }; + + /// \returns QVs contained by read group rg. + /// FIXME: this function should be provided by pbbam.ReadGroupInfo + /// FIXME: pbbam, ReadGroupInfo does not recognize internal pulse features such as AltLabelQV. + inline std::vector + QVEnumsInReadGroup(const PacBio::BAM::ReadGroupInfo & rg) { + std::vector ret; + for (auto it = internal::QVEnums.begin(); it != internal::QVEnums.end(); it++) { + if (rg.HasBaseFeature(*it)) { + ret.push_back(*it); + } + } + return ret; + } + /// \} + + /// \returns QVs contained by the first record if it exists, otherwise, return {} + /// FIXME: this function provides an alternative route to get QVs contained in the bam file now, + /// because pbbam ReadGroupInfo does not recognize internal pulse features such as AltLabelQV. + /// Note: Ignore Label because it is neither base feature nor internal pulse feature. 
+ inline std::vector + QVEnumsInFirstRecord(const PacBio::BAM::BamFile & bamFile) { + std::vector ret; + PacBio::BAM::EntireFileQuery query(bamFile); + for (const PacBio::BAM::BamRecord & record: query) { + if (record.HasDeletionQV()) {ret.push_back(PacBio::BAM::BaseFeature::DELETION_QV);} + if (record.HasDeletionTag()) {ret.push_back(PacBio::BAM::BaseFeature::DELETION_TAG);} + if (record.HasInsertionQV()) {ret.push_back(PacBio::BAM::BaseFeature::INSERTION_QV);} + if (record.HasMergeQV()) {ret.push_back(PacBio::BAM::BaseFeature::MERGE_QV);} + if (record.HasSubstitutionQV()) {ret.push_back(PacBio::BAM::BaseFeature::SUBSTITUTION_QV);} + if (record.HasSubstitutionTag()) {ret.push_back(PacBio::BAM::BaseFeature::SUBSTITUTION_TAG);} + if (record.HasIPD()) {ret.push_back(PacBio::BAM::BaseFeature::IPD);} + if (record.HasPulseWidth()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_WIDTH);} + if (record.HasPkmid()) {ret.push_back(PacBio::BAM::BaseFeature::PKMID);} + if (record.HasPkmean()) {ret.push_back(PacBio::BAM::BaseFeature::PKMEAN);} + if (record.HasLabelQV()) {ret.push_back(PacBio::BAM::BaseFeature::LABEL_QV);} + if (record.HasAltLabelTag()) {ret.push_back(PacBio::BAM::BaseFeature::ALT_LABEL);} + if (record.HasAltLabelQV()) {ret.push_back(PacBio::BAM::BaseFeature::ALT_LABEL_QV);} + if (record.HasPulseMergeQV()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_MERGE_QV);} + if (record.HasPulseCall()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_CALL);} + if (record.HasStartFrame()) {ret.push_back(PacBio::BAM::BaseFeature::START_FRAME);} + if (record.HasPulseCallWidth()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_CALL_WIDTH);} + break; // only use the first record. 
+ } + return ret; + } +}; + +#endif diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxMain.cpp blasr-5.3/utils/bam2bax/src/Bam2BaxMain.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2BaxMain.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2BaxMain.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,75 @@ +// Author: Yuan Li + +#include "Bam2Bax.h" +#include "OptionParser.h" +#include "Settings.h" +#include +#include +#include +using namespace std; + +int main(int argc, char* argv[]) +{ + // setup help & options + optparse::OptionParser parser; + parser.description("bam2bax converts the PacBio BAM format into bax.h5 format."); + parser.prog("bam2bax"); + parser.version("1.0.0.170337"); + parser.add_version_option(true); + parser.add_help_option(true); + + auto ioGroup = optparse::OptionGroup(parser, "Input/output files"); + ioGroup.add_option("") + .dest(Settings::Option::input_) + .metavar("movie.subreads.bam movie.scraps.bam") + .help("A movie.subreads.bam and a movie.scraps.bam"); + ioGroup.add_option("--trace") + .dest(Settings::Option::trace_) + .metavar("movie.trc.h5") + .help("(Optional but recommended) Input trace file to copy ScanData from"); + ioGroup.add_option("-o") + .dest(Settings::Option::output_) + .metavar("STRING") + .help("Prefix of output filenames. 
Movie name will be used if no prefix provided"); + ioGroup.add_option("--metadata") + .dest(Settings::Option::metadata_) + .action("store_true") + .help("Write metadata.xml to the upper directory of output file."); + parser.add_option_group(ioGroup); + + auto modeGroup = optparse::OptionGroup(parser, "Output file types (mutually exclusive:)"); + modeGroup.add_option("--base") + .dest(Settings::Option::baseMode_) + .metavar("") + .action("store_true") + .help("Output bax.h5 (default)"); + modeGroup.add_option("--pulse") + .dest(Settings::Option::pulseMode_) + .metavar("") + .action("store_true") + .help("Output pls.h5"); + modeGroup.add_option("--baseMap") + .dest(Settings::Option::baseMap_) + .metavar(Settings::OptionValue::baseMap_) + .help("Set /ScanData/DyeSet/BaseMap, mapping channels to bases."); + modeGroup.add_option("--ignoreQV") + .dest(Settings::Option::ignoreQV_) + .metavar("") + .action("store_true") + .help("Don't save QVs in ouptut file."); + parser.add_option_group(modeGroup); + + // parse command line + Settings settings = Settings::FromCommandLine(parser, argc, argv); + if (!settings.errors_.empty()) { + cerr << endl; + for (const auto e : settings.errors_) + cerr << "ERROR: " << e << endl; + cerr << endl; + parser.print_help(); + return EXIT_FAILURE; + } + + // main conversion + return Bam2Bax::Run(settings); +} diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2PlxMain.cpp blasr-5.3/utils/bam2bax/src/Bam2PlxMain.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Bam2PlxMain.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Bam2PlxMain.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,65 @@ +// Author: Yuan Li + +#include "Bam2Bax.h" +#include "OptionParser.h" +#include "Settings.h" +#include +#include +#include +using namespace std; + +int main(int argc, char* argv[]) +{ + // setup help & options + optparse::OptionParser parser; + parser.description("bam2plx converts the PacBio Internal BAM format into plx.h5 
format."); + parser.prog("bam2plx"); + parser.version("1.0.0.170337"); + parser.add_version_option(true); + parser.add_help_option(true); + + auto ioGroup = optparse::OptionGroup(parser, "Input/output files"); + ioGroup.add_option("") + .dest(Settings::Option::input_) + .metavar("movie.subreads.bam movie.scraps.bam") + .help("A movie.subreads.bam and a movie.scraps.bam"); + ioGroup.add_option("-o") + .dest(Settings::Option::output_) + .metavar("STRING") + .help("Prefix of output filenames. Movie name will be used if no prefix provided"); + ioGroup.add_option("--metadata") + .dest(Settings::Option::metadata_) + .action("store_true") + .help("Write metadata.xml to the upper directory of output file."); + parser.add_option_group(ioGroup); + + auto modeGroup = optparse::OptionGroup(parser, "Output file types (mutually exclusive:)"); + modeGroup.add_option("--baseMap") + .dest(Settings::Option::baseMap_) + .metavar(Settings::OptionValue::baseMap_) + .help("Set /ScanData/DyeSet/BaseMap, mapping channels to bases."); + modeGroup.add_option("--ignoreQV") + .dest(Settings::Option::ignoreQV_) + .metavar("") + .action("store_true") + .help("Don't save QVs in ouptut file."); + parser.add_option_group(modeGroup); + + // parse command line + Settings settings = Settings::FromCommandLine(parser, argc, argv, true); + if (!settings.errors_.empty()) { + cerr << endl; + for (const auto e : settings.errors_) + cerr << "ERROR: " << e << endl; + cerr << endl; + parser.print_help(); + return EXIT_FAILURE; + } + + // Reset settings.mode to pulse mode. 
+ settings.mode = Settings::PulseMode; + settings.outputBaxFilename = settings.outputBaxPrefix + ".plx.h5"; + + // main conversion + return Bam2Bax::Run(settings); +} diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/CMakeLists.txt blasr-5.3/utils/bam2bax/src/CMakeLists.txt --- blasr-0~20151014+git8e668be/utils/bam2bax/src/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/CMakeLists.txt 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,79 @@ +include_directories( + . + ${BLASR_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} + ${HDF5_INCLUDE_DIRS} + ${HTSLIB_INCLUDE_DIRS} + ${PacBioBAM_INCLUDE_DIRS} + ${PBDATA_INCLUDE_DIRS} + ${PBDATA_ROOT_DIR} + ${PBIHDF_INCLUDE_DIRS} + ${ZLIB_INCLUDE_DIRS} +) + +set(SOURCES + Settings.h + Settings.cpp + Converter.h + Converter.cpp + Bam2BaxInternal.h + RegionTypeAdapter.h + RegionsAdapter.h + Bam2BaxConverter.h + Bam2BaxConverterImpl.hpp + Bam2Bax.h + Bam2Bax.cpp + OptionParser.h + OptionParser.cpp + MetadataWriter.h + MetadataWriter.cpp +) + +set(BAM2BAX_SOURCES + Bam2BaxMain.cpp + ${SOURCES} +) + +set(BAM2PLX_SOURCES + Bam2PlxMain.cpp + ${SOURCES} +) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Bam2Bax_CXX_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${Bam2Bax_EXE_LINKER_FLAGS}") + +add_executable(bam2bax ${BAM2BAX_SOURCES}) +set_target_properties(bam2bax PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${Bam2Bax_BinDir} +) +if (NOT APPLE) + set(MY_LIBRT -lrt) +else() +endif() +target_link_libraries(bam2bax + ${BLASR_LIBRARIES} + ${PBIHDF_LIBRARIES} + ${PBDATA_LIBRARIES} + ${HDF5_CPP_LIBRARIES} + ${HDF5_LIBRARIES} + ${PacBioBAM_LIBRARIES} + ${HTSLIB_LIBRARIES} + ${ZLIB_LIBRARIES} + ${MY_LIBRT} +) + +add_executable(bam2plx ${BAM2PLX_SOURCES} ) +set_target_properties(bam2plx PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${Bam2Bax_BinDir} +) +target_link_libraries(bam2plx + ${BLASR_LIBRARIES} + ${PBIHDF_LIBRARIES} + ${PBDATA_LIBRARIES} + ${HDF5_CPP_LIBRARIES} + ${HDF5_LIBRARIES} + ${PacBioBAM_LIBRARIES} + 
${HTSLIB_LIBRARIES} + ${ZLIB_LIBRARIES} + ${MY_LIBRT} +) diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Converter.cpp blasr-5.3/utils/bam2bax/src/Converter.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Converter.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Converter.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,130 @@ +#include "Converter.h" + +Converter::Converter(Settings const& settings) +:settings_(settings) { + writer_ = NULL; + scanData_ = NULL; + + std::string infn = settings_.subreadsBamFilename; + + bamfile_ = new PacBio::BAM::BamFile(infn); + PacBio::BAM::BamHeader bamheader = bamfile_->Header(); + + if (bamheader.ReadGroups().size() != 1) { + AddErrorMessage("Bam file must contain reads from exactly one SMRTCell."); + // XXX: Throw initialization exception + } + PacBio::BAM::ReadGroupInfo rg = bamheader.ReadGroups()[0]; + MockScanData(rg); + + // Write metadata.xml to parent directory of Bax.h5. + if (not settings_.outputMetadataFilename.empty()) + MetadataWriter metaWriter_(settings_.outputMetadataFilename, + rg, + settings_.outputAnalysisDirname); + + // FIXME: pbbam needs to provide an API which returns BaseFeatures in read group + std::vector qvs = settings_.ignoreQV ? 
std::vector({}) : internal::QVEnumsInFirstRecord(*bamfile_); + + InitializeWriter(rg.BasecallerVersion(), qvs); +} + +Converter::~Converter(void) { + if (scanData_ != NULL) delete scanData_; + if (writer_ != NULL) delete writer_; + delete bamfile_; +} + +std::vector Converter::Errors(void) const { + return errors_; +} + +bool Converter::Run() { + if (settings_.traceFilename.empty()) { + writer_->WriteScanData(*scanData_); + } else { + HDFFile traceFile; + traceFile.Open(settings_.traceFilename, H5F_ACC_RDONLY); + writer_->CopyObject(traceFile, "/ScanData"); + if (settings_.mode == Settings::PulseMode) { + SetInverseGain(traceFile); + } + traceFile.Close(); + } + + // Regions attribute RegionTypes, which defines supported region types in ORDER. + std::vector regionTypes = + RegionTypeAdapter::ToRegionTypes(Bam2BaxDefaults::Bax_Regions_RegionTypes); + + if (not settings_.subreadsBamFilename.empty() and + not settings_.scrapsBamFilename.empty()) { + + // Stitch subreads and scraps in order to reconstruct polymerase reads. + PacBio::BAM::VirtualPolymeraseReader reader(settings_.subreadsBamFilename, + settings_.scrapsBamFilename); + while(reader.HasNext()) { + // FIXME: pbbam should not crash when reading internal pulse features. 
+ const PacBio::BAM::VirtualPolymeraseBamRecord & record = reader.Next(); + SMRTSequence smrt; + smrt.Copy(record, true); + std::vector ras = RegionsAdapter::ToRegionAnnotations(record, regionTypes); + if (not writer_->WriteOneZmw(smrt, ras) or not writer_->Errors().empty()) { break; } + writer_->Flush(); + } + if (not settings_.ignoreQV) writer_->WriteFakeDataSets(); + for (auto error: writer_->Errors()) { AddErrorMessage(error); } + } + + return errors_.empty(); +} + +void Converter::MockScanData(PacBio::BAM::ReadGroupInfo& rg) { + // Construct AcqParams + AcqParams acqParams(Bam2BaxDefaults::Bax_ScanData_AduGain, + Bam2BaxDefaults::Bax_ScanData_CameraGain, + Bam2BaxDefaults::Bax_ScanData_CameraType, + Bam2BaxDefaults::Bax_ScanData_HotStartFrame, + Bam2BaxDefaults::Bax_ScanData_LaserOnFrame); + + // Construct scandata. + scanData_ = new ScanData(acqParams); + scanData_->PlatformID(Sequel) // assume sequel movie + .MovieName(rg.MovieName()) // should be reliable now + .WhenStarted(rg.Date()) + .RunCode(Bam2BaxDefaults::Bax_ScanData_RunCode) // bam does not contain RunCode + .NumFrames(Bam2BaxDefaults::Bax_ScanData_NumFrames) // bam does not contain NumFrames + .FrameRate(Bam2BaxDefaults::Bax_ScanData_FrameRate) // Ignore bam header FrameRate. + .SequencingKit(rg.SequencingKit()) + .BindingKit(rg.BindingKit()) + .BaseMap(settings_.baseMap); +} + +void Converter::InitializeWriter(const std::string& bcvers, + const std::vector& qvs) + { + std::string outfn = settings_.outputBaxFilename; + Settings::Mode mode = settings_.mode; + + if (mode == Settings::BaseMode) { + std::cout << "Converting BAM to bax.h5." << std::endl; + writer_ = new HDFBaxWriter(outfn, bcvers, + scanData_->BaseMap(), qvs, Bam2BaxDefaults::Bax_Regions_RegionTypes); + } else if (mode == Settings::PulseMode) { + std::cout << "Converting BAM to plx.h5." 
<< std::endl; + writer_ = new HDFPulseWriter(outfn, bcvers, + scanData_->BaseMap(), qvs, Bam2BaxDefaults::Bax_Regions_RegionTypes); + } else { + std::cerr << "UNKNOWN mode." << settings_.mode << std::endl; + throw std::exception(); + } +} + +void Converter::SetInverseGain(HDFFile& traceFile) { + H5::Group acqGrp = traceFile.hdfFile.openGroup("/ScanData/AcqParams"); + H5::Attribute aduAttr = acqGrp.openAttribute("AduGain"); + float igain; + H5::DataType* dt = new H5::DataType(H5::PredType::IEEE_F32LE); + aduAttr.read(*dt, &igain); + HDFPulseWriter* pw = static_cast(writer_); + pw->SetInverseGain(igain); +} diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/Converter.h blasr-5.3/utils/bam2bax/src/Converter.h --- blasr-0~20151014+git8e668be/utils/bam2bax/src/Converter.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/Converter.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,77 @@ +// Author: Yuan Li +#ifndef BAM2BAX_ICONVERTER_H_ +#define BAM2BAX_ICONVERTER_H_ + +#include +#include +#include +#include "pbdata/Enumerations.h" +#include "pbbam/BamFile.h" +#include "pbbam/BamHeader.h" +#include "pbbam/ReadGroupInfo.h" +#include "pbbam/virtual/VirtualPolymeraseReader.h" +#include "pbbam/virtual/VirtualPolymeraseBamRecord.h" +#include "pbbam/virtual/VirtualRegion.h" +#include "pbbam/virtual/VirtualRegionType.h" +#include "pbbam/virtual/VirtualRegionTypeMap.h" +#include "HDFWriterBase.hpp" +#include "HDFBaxWriter.hpp" +#include "HDFPulseWriter.hpp" +#include "RegionsAdapter.h" +#include "Settings.h" +#include "MetadataWriter.h" +#include "Bam2BaxInternal.h" + +namespace Bam2BaxDefaults { + // Default value of attribute /ScanData/AcqParams/NumFrames in Bax. + static const unsigned int Bax_ScanData_NumFrames = 0; + // Default value of attribute /ScanData/AcqParams/AduGain in Bax. + static const float Bax_ScanData_AduGain = 1.0; + // Default value of attribute /ScanData/AcqParams/CameraGain in Bax. 
+ static const float Bax_ScanData_CameraGain = 1.0; + // Default value of attribute /ScanData/AcqParams/CameraType in Bax. + static const int Bax_ScanData_CameraType = 0; + // Default value of attribute /ScanData/AcqParams/HotStartFrame in Bax. + static const UInt Bax_ScanData_HotStartFrame = 0; + // Default value of attribute /ScanData/AcqParams/LaserOnFrame in Bax. + static const UInt Bax_ScanData_LaserOnFrame = 0; + // Default value of attribute /ScanData/AcqParams/FrameRate in Bax. + static const float Bax_ScanData_FrameRate = 80.047035; + + // Default value of attribute /ScanData/RunInfo/RunCode in Bax. + static const std::string Bax_ScanData_RunCode = "Bam2Bax_Run_Code"; + // Default value of attribute /ScanData/DyeSet/BaseMap in Bax. + static const std::string Bax_ScanData_BaseMap = PacBio::AttributeValues::ScanData::DyeSet::basemap; + // Default value of attribute /Regions/RegionTypes in Bax. + static const std::vector Bax_Regions_RegionTypes = PacBio::AttributeValues::Regions::regiontypes; +} + +class Converter { +public: + Converter(Settings const& settings); + ~Converter(void); + +public: + std::vector Errors(void) const; + bool Run(); + +protected: + void AddErrorMessage(const std::string & errmsg) { + errors_.push_back(errmsg); + } + +protected: + // protected variables + Settings const& settings_; + ScanData* scanData_; + HDFWriterBase* writer_; + PacBio::BAM::BamFile* bamfile_; + std::vector errors_; + +private: + void MockScanData(PacBio::BAM::ReadGroupInfo& rg); + void InitializeWriter(const std::string& bcvers, + const std::vector& qvs); + void SetInverseGain(HDFFile& traceFile); +}; +#endif diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/MetadataWriter.cpp blasr-5.3/utils/bam2bax/src/MetadataWriter.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/MetadataWriter.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/MetadataWriter.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,42 @@ +#include "MetadataWriter.h" + 
+std::string internal::Replace(const std::string & in_str, + const std::string & to_find, + const std::string & to_replace) { + // Replace the first occurrence of to_find by to_replace. + std::string ret = in_str; + std::size_t pos = ret.find(to_find); + if (pos != std::string::npos) { + ret.replace(pos, to_find.size(), to_replace); + } + return ret; +} + +MetadataWriter::MetadataWriter(const std::string & filename, + const PacBio::BAM::ReadGroupInfo & rg, + const std::string & analysisDir) { + MetadataWriter(filename, + rg.BasecallerVersion(), + rg.SequencingKit(), + rg.BindingKit(), + analysisDir); +} + +MetadataWriter::MetadataWriter(const std::string & filename, + const std::string & basecallerVersion, + const std::string & sequencingKit, + const std::string & bindingKit, + const std::string & analysisDir) { + assert(analysisDir.find('/') == std::string::npos); + std::ofstream ofile; + ofile.open(filename, std::ofstream::out); + + std::string to_print = internal::META_CONTENT; + to_print = internal::Replace(to_print, "__BASECALLERVERSION__", basecallerVersion); + to_print = internal::Replace(to_print, "__SEQUENCINGKIT__", sequencingKit); + to_print = internal::Replace(to_print, "__BINDINGKIT__", bindingKit); + to_print = internal::Replace(to_print, "__ANALYSISDIR__", analysisDir); + + ofile << to_print << std::endl; + ofile.close(); +} diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/MetadataWriter.h blasr-5.3/utils/bam2bax/src/MetadataWriter.h --- blasr-0~20151014+git8e668be/utils/bam2bax/src/MetadataWriter.h 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/MetadataWriter.h 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,38 @@ +// Author: Yuan Li + +#ifndef _BAM2BAX_METADATA_WRITER_H_ +#define _BAM2BAX_METADATA_WRITER_H_ + +#include +#include +#include +#include + +namespace internal{ + +const std::string DEFAULT_ANALYSIS_DIR = "Analysis_Results"; + +const std::string META_CONTENT = 
+"__BASECALLERVERSION__31__BINDINGKIT____SEQUENCINGKIT__BasecallerV1__ANALYSISDIR__"; + +std::string Replace(const std::string & in_str, + const std::string & to_find, + const std::string & to_replace); +} //namespace internal + +class MetadataWriter { +public: + MetadataWriter(const std::string & filename, + const PacBio::BAM::ReadGroupInfo & rg, + const std::string & analysisDir=internal::DEFAULT_ANALYSIS_DIR); + + MetadataWriter(const std::string & filename, + const std::string & basecallerVersion, + const std::string & sequencingKit, + const std::string & bindingKit, + const std::string & analysisDir); + + ~MetadataWriter(void) {} +}; + +#endif diff -Nru blasr-0~20151014+git8e668be/utils/bam2bax/src/OptionParser.cpp blasr-5.3/utils/bam2bax/src/OptionParser.cpp --- blasr-0~20151014+git8e668be/utils/bam2bax/src/OptionParser.cpp 1970-01-01 00:00:00.000000000 +0000 +++ blasr-5.3/utils/bam2bax/src/OptionParser.cpp 2016-09-12 08:56:50.000000000 +0000 @@ -0,0 +1,562 @@ +/** + * Copyright (C) 2010 Johannes Weißl + * License: your favourite BSD-style license + * + * See OptionParser.h for help. 
+ */ + +#include "OptionParser.h" + +#include +#include +#include +#include + +#if defined(ENABLE_NLS) && ENABLE_NLS +# include +# define _(s) gettext(s) +#else +# define _(s) ((const char *) (s)) +#endif + +using namespace std; + +namespace optparse { + +////////// auxiliary (string) functions { ////////// +class str_wrap { +public: + str_wrap(const string& l, const string& r) : lwrap(l), rwrap(r) {} + str_wrap(const string& w) : lwrap(w), rwrap(w) {} + string operator() (const string& s) { return lwrap + s + rwrap; } + const string lwrap, rwrap; +}; +template +static string str_join_trans(const string& sep, InputIterator begin, InputIterator end, UnaryOperator op) { + string buf; + for (InputIterator it = begin; it != end; ++it) { + if (it != begin) + buf += sep; + buf += op(*it); + } + return buf; +} +template +static string str_join(const string& sep, InputIterator begin, InputIterator end) { + return str_join_trans(sep, begin, end, str_wrap("")); +} +static string& str_replace(string& s, const string& patt, const string& repl) { + size_t pos = 0, n = patt.length(); + while (true) { + pos = s.find(patt, pos); + if (pos == string::npos) + break; + s.replace(pos, n, repl); + pos += repl.size(); + } + return s; +} +static string str_replace(const string& s, const string& patt, const string& repl) { + string tmp = s; + str_replace(tmp, patt, repl); + return tmp; +} +static string str_format(const string& s, size_t pre, size_t len, bool indent_first = true) { + stringstream ss; + string p; + if (indent_first) + p = string(pre, ' '); + + size_t pos = 0, linestart = 0; + size_t line = 0; + while (true) { + bool wrap = false; + + size_t new_pos = s.find_first_of(" \n\t", pos); + if (new_pos == string::npos) + break; + if (s[new_pos] == '\n') { + pos = new_pos + 1; + wrap = true; + } + if (line == 1) + p = string(pre, ' '); + if (wrap || new_pos + pre > linestart + len) { + ss << p << s.substr(linestart, pos - linestart - 1) << endl; + linestart = pos; + line++; + } + 
pos = new_pos + 1; + } + ss << p << s.substr(linestart) << endl; + return ss.str(); +} +static string str_inc(const string& s) { + stringstream ss; + string v = (s != "") ? s : "0"; + long i; + istringstream(v) >> i; + ss << i+1; + return ss.str(); +} +static unsigned int cols() { + unsigned int n = 80; +#ifndef _WIN32 + const char *s = getenv("COLUMNS"); + if (s) + istringstream(s) >> n; +#endif + return n; +} +static string basename(const string& s) { + string b = s; + size_t i = b.find_last_not_of('/'); + if (i == string::npos) { + if (b[0] == '/') + b.erase(1); + return b; + } + b.erase(i+1, b.length()-i-1); + i = b.find_last_of("/"); + if (i != string::npos) + b.erase(0, i+1); + return b; +} +////////// } auxiliary (string) functions ////////// + + +////////// class OptionParser { ////////// +OptionParser::OptionParser() : + _usage(_("%prog [options]")), + _add_help_option(true), + _add_version_option(true), + _interspersed_args(true) {} + +Option& OptionParser::add_option(const string& opt) { + const string tmp[1] = { opt }; + return add_option(vector(&tmp[0], &tmp[1])); +} +Option& OptionParser::add_option(const string& opt1, const string& opt2) { + const string tmp[2] = { opt1, opt2 }; + return add_option(vector(&tmp[0], &tmp[2])); +} +Option& OptionParser::add_option(const string& opt1, const string& opt2, const string& opt3) { + const string tmp[3] = { opt1, opt2, opt3 }; + return add_option(vector(&tmp[0], &tmp[3])); +} +Option& OptionParser::add_option(const vector& v) { + _opts.resize(_opts.size()+1); + Option& option = _opts.back(); + string dest_fallback; + for (vector::const_iterator it = v.begin(); it != v.end(); ++it) { + if (it->substr(0,2) == "--") { + const string s = it->substr(2); + if (option.dest() == "") + option.dest(str_replace(s, "-", "_")); + option._long_opts.insert(s); + _optmap_l[s] = &option; + } else if ( it->empty() ) { + continue; + } else { + const string s = it->substr(1,1); + if (dest_fallback == "") + dest_fallback = s; + 
option._short_opts.insert(s); + _optmap_s[s] = &option; + } + } + if (option.dest() == "") + option.dest(dest_fallback); + return option; +} + +OptionParser& OptionParser::add_option_group(const OptionGroup& group) { + for (list