diff -Nru last-align-961/ChangeLog.txt last-align-963/ChangeLog.txt --- last-align-961/ChangeLog.txt 2018-12-09 23:26:51.000000000 +0000 +++ last-align-963/ChangeLog.txt 2018-12-26 07:28:54.000000000 +0000 @@ -1,8 +1,19 @@ +2018-12-26 Martin C. Frith + + * src/tantan.cc: + Make tantan repeat-finding faster + [44199bff9cef] [tip] + + * doc/Makefile, doc/last-parallel.txt, doc/last-split.txt, doc/last- + tutorial.txt: + Update the documents a bit + [6727ae8b7a12] + 2018-12-10 Martin C. Frith * doc/last-dotplot.txt, scripts/last-dotplot: Add last-dotplot --maxseqs option - [ed0fb9b1eb40] [tip] + [ed0fb9b1eb40] * src/split/last-split.cc: Make last-split -n keep E-values diff -Nru last-align-961/debian/changelog last-align-963/debian/changelog --- last-align-961/debian/changelog 2018-12-17 10:13:03.000000000 +0000 +++ last-align-963/debian/changelog 2019-01-17 10:21:30.000000000 +0000 @@ -1,3 +1,20 @@ +last-align (963-2) unstable; urgency=medium + + * Example files are not compressed any more - adapt autopkgtest + Closes: #919556 + + -- Andreas Tille Thu, 17 Jan 2019 11:21:30 +0100 + +last-align (963-1) unstable; urgency=medium + + * New upstream version + * debhelper 12 + * Standards-Version: 4.3.0 + * debian/tests/control: Replace needs-recommends by explicitly + adding recommended package python-pil to test depends + + -- Andreas Tille Fri, 11 Jan 2019 22:49:18 +0100 + last-align (961-1) unstable; urgency=medium [ Jelmer Vernooij ] diff -Nru last-align-961/debian/compat last-align-963/debian/compat --- last-align-961/debian/compat 2018-12-17 10:13:03.000000000 +0000 +++ last-align-963/debian/compat 2019-01-17 10:21:30.000000000 +0000 @@ -1 +1 @@ -11 +12 diff -Nru last-align-961/debian/control last-align-963/debian/control --- last-align-961/debian/control 2018-12-17 10:13:03.000000000 +0000 +++ last-align-963/debian/control 2019-01-17 10:21:30.000000000 +0000 @@ -4,11 +4,11 @@ Andreas Tille Section: science Priority: optional -Build-Depends: debhelper (>= 11~), +Build-Depends: debhelper (>= 12~), help2man, python-pil, zlib1g-dev -Standards-Version: 4.2.1 +Standards-Version: 4.3.0 Vcs-Browser: https://salsa.debian.org/med-team/last-align Vcs-Git: https://salsa.debian.org/med-team/last-align.git Homepage: http://last.cbrc.jp/ diff -Nru last-align-961/debian/tests/control last-align-963/debian/tests/control --- last-align-961/debian/tests/control 2018-12-17 10:13:03.000000000 +0000 +++ last-align-963/debian/tests/control 2019-01-17 10:21:30.000000000 +0000 @@ -1,3 +1,3 @@ Tests: run-unit-test -Depends: @ -Restrictions: allow-stderr, needs-recommends +Depends: @, python-pil +Restrictions: allow-stderr diff -Nru last-align-961/debian/tests/run-unit-test last-align-963/debian/tests/run-unit-test --- last-align-961/debian/tests/run-unit-test 2018-12-17 10:13:03.000000000 +0000 +++ last-align-963/debian/tests/run-unit-test 2019-01-17 10:21:30.000000000 +0000 @@ -9,7 +9,7 @@ cd $ADTTMP -cp -a /usr/share/doc/${pkg}/examples/*.fa.gz . +cp -a /usr/share/doc/${pkg}/examples/*.fa* . gunzip -r * # first example from tutorial diff -Nru last-align-961/doc/last-parallel.html last-align-963/doc/last-parallel.html --- last-align-961/doc/last-parallel.html 2018-04-20 14:43:38.000000000 +0000 +++ last-align-963/doc/last-parallel.html 2018-12-26 07:28:59.000000000 +0000 @@ -364,11 +364,11 @@

Instead of this:

-lastal -Q1 -D100 db q.fastq | last-split > out.maf
+lastal -Q1 db q.fastq | last-split > out.maf
 

try this:

-parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
 

Instead of this:

diff -Nru last-align-961/doc/last-parallel.txt last-align-963/doc/last-parallel.txt
--- last-align-961/doc/last-parallel.txt	2017-09-22 10:01:15.000000000 +0000
+++ last-align-963/doc/last-parallel.txt	2018-12-26 07:28:37.000000000 +0000
@@ -53,11 +53,11 @@
 
 Instead of this::
 
-  lastal -Q1 -D100 db q.fastq | last-split > out.maf
+  lastal -Q1 db q.fastq | last-split > out.maf
 
 try this::
 
-  parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+  parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
 
 Instead of this::
 
diff -Nru last-align-961/doc/last-split.html last-align-963/doc/last-split.html
--- last-align-961/doc/last-split.html	2018-08-01 11:00:29.000000000 +0000
+++ last-align-963/doc/last-split.html	2018-12-26 07:29:01.000000000 +0000
@@ -334,7 +334,7 @@
 can do the alignment like this:

 lastdb -uNEAR -R01 db genome.fasta
-lastal -Q1 -D100 db q.fastq | last-split > out.maf
+lastal -Q1 db q.fastq | last-split > out.maf
 
@@ -397,7 +397,7 @@

Going faster by parallelization

For example, split alignment of DNA reads to a genome:

-parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
 

This requires GNU parallel to be installed (http://www.gnu.org/software/parallel/).

@@ -558,7 +558,7 @@ splice signals or favouring cis-splices:

 lastdb -uNEAR -R01 db genome.fasta
-lastal -Q1 -D100 db q.fastq | last-split -c0 > out.maf
+lastal -Q1 db q.fastq | last-split -c0 > out.maf
 
diff -Nru last-align-961/doc/last-split.txt last-align-963/doc/last-split.txt --- last-align-961/doc/last-split.txt 2018-08-01 11:00:06.000000000 +0000 +++ last-align-963/doc/last-split.txt 2018-12-26 07:28:37.000000000 +0000 @@ -21,7 +21,7 @@ can do the alignment like this:: lastdb -uNEAR -R01 db genome.fasta - lastal -Q1 -D100 db q.fastq | last-split > out.maf + lastal -Q1 db q.fastq | last-split > out.maf Spliced alignment of RNA reads to a genome ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -78,7 +78,7 @@ For example, split alignment of DNA reads to a genome:: - parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf + parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf This requires GNU parallel to be installed (http://www.gnu.org/software/parallel/). @@ -173,7 +173,7 @@ splice signals or favouring cis-splices:: lastdb -uNEAR -R01 db genome.fasta - lastal -Q1 -D100 db q.fastq | last-split -c0 > out.maf + lastal -Q1 db q.fastq | last-split -c0 > out.maf Options ------- diff -Nru last-align-961/doc/last-tutorial.html last-align-963/doc/last-tutorial.html --- last-align-961/doc/last-tutorial.html 2018-04-20 14:43:41.000000000 +0000 +++ last-align-963/doc/last-tutorial.html 2018-12-26 07:29:03.000000000 +0000 @@ -379,20 +379,28 @@ lastal -F15 protdb dnas.fa
-
-

Example 4: Find short protein alignments

+
+

Example 4: Find high-similarity, and short, protein alignments

LAST uses a scoring scheme to find -similarities. Some scoring schemes are good for long-and-weak -similarities, others for short-and-strong similarities. If we seek -very short similarities, weak ones are hopeless (statistically -insignificant), so we had better focus on strong ones. The PAM30 -scoring scheme may work well:

+similarities. Some scoring schemes are tuned for weak similarities, +others for strong similarities. The PAM30 scoring scheme finds strong +protein similarities:

 lastdb -p -cR01 invdb invertebrate.fa
 lastal -pPAM30 invdb vertebrate.fa
 
-

(How short is "very short"? It depends on the amount of sequence data -we are searching, but perhaps roughly less than 40 amino acids.)

+

This has two advantages:

+
    +
  • It omits weak alignments, or alignment parts (occasionally a strong +similarity is flanked by a weak similarity).

    +
  • +
  • It can find short similarities. If we seek very short similarities, +weak ones are hopeless (statistically insignificant), so we had +better focus on strong ones. (How short is "very short"? It +depends on the amount of sequence data we are searching, but perhaps +roughly less than 40 amino acids.)

    +
  • +

Example 5: Align human DNA sequences to the human genome

@@ -446,13 +454,14 @@

DNA sequences are not always perfectly accurate, and they are sometimes provided in fastq format, which indicates the reliability of each base. LAST can use this information to improve alignment -accuracy. (It assumes the reliabilities reflect substitution errors, -not insertion/deletion errors: if that is not true, it may be better -to use fasta format.) Option -Q1 indicates fastq-sanger format:

+accuracy. Option -Q1 indicates fastq-sanger format:

 lastdb -uNEAR -R01 humandb human/chr*.fa
-lastal -Q1 -D100 humandb queries.fastq | last-split > myalns.maf
+lastal -Q1 humandb queries.fastq | last-split > myalns.maf
 
+

Assumption: LAST assumes the reliabilities reflect substitution +errors, not insertion/deletion errors. If that is not true, you can +tell it to ignore the reliability data with -Q0.

Fastq format confusion

@@ -490,7 +499,19 @@

Example 9: Compare the human and chimp genomes

-

See here.

+

See here. But +that recipe is extremely slow-and-accurate. You can tune it to compare huge, high-similarity genomes with +moderate run time and memory use:

+
    +
  • Omit the sensitivity-boosting lastal -m option.

    +
  • +
  • Add -W99 (or so) to the lastdb options.

    +
  • +
  • If the "reference" genome (the one given to lastdb) is > 4 GB, it's +probably more efficient to use lastdb8 and lastal8, instead of +lastdb and lastal.

    +
  • +

Example 10: Ambiguity of alignment columns

diff -Nru last-align-961/doc/last-tutorial.txt last-align-963/doc/last-tutorial.txt --- last-align-961/doc/last-tutorial.txt 2017-06-02 10:27:21.000000000 +0000 +++ last-align-963/doc/last-tutorial.txt 2018-12-26 07:28:37.000000000 +0000 @@ -70,21 +70,27 @@ lastdb -p -cR01 protdb proteins.fa lastal -F15 protdb dnas.fa -Example 4: Find short protein alignments ----------------------------------------- +Example 4: Find high-similarity, and short, protein alignments +-------------------------------------------------------------- LAST uses a `scoring scheme `_ to find -similarities. Some scoring schemes are good for long-and-weak -similarities, others for short-and-strong similarities. If we seek -very short similarities, weak ones are hopeless (statistically -insignificant), so we had better focus on strong ones. The PAM30 -scoring scheme may work well:: +similarities. Some scoring schemes are tuned for weak similarities, +others for strong similarities. The PAM30 scoring scheme finds strong +protein similarities:: lastdb -p -cR01 invdb invertebrate.fa lastal -pPAM30 invdb vertebrate.fa -(How short is "very short"? It depends on the amount of sequence data -we are searching, but perhaps roughly less than 40 amino acids.) +This has two advantages: + +* It omits weak alignments, or alignment parts (occasionally a strong + similarity is flanked by a weak similarity). + +* It can find short similarities. If we seek very short similarities, + weak ones are hopeless (statistically insignificant), so we had + better focus on strong ones. (How short is "very short"? It + depends on the amount of sequence data we are searching, but perhaps + roughly less than 40 amino acids.) Example 5: Align human DNA sequences to the human genome -------------------------------------------------------- @@ -138,12 +144,14 @@ DNA sequences are not always perfectly accurate, and they are sometimes provided in fastq format, which indicates the reliability of each base. LAST can use this information to improve alignment -accuracy. (It assumes the reliabilities reflect substitution errors, -not insertion/deletion errors: if that is not true, it may be better -to use fasta format.) Option -Q1 indicates fastq-sanger format:: +accuracy. Option -Q1 indicates fastq-sanger format:: lastdb -uNEAR -R01 humandb human/chr*.fa - lastal -Q1 -D100 humandb queries.fastq | last-split > myalns.maf + lastal -Q1 humandb queries.fastq | last-split > myalns.maf + +**Assumption:** LAST assumes the reliabilities reflect substitution +errors, not insertion/deletion errors. If that is not true, you can +tell it to ignore the reliability data with -Q0. Fastq format confusion ---------------------- @@ -183,7 +191,18 @@ Example 9: Compare the human and chimp genomes ---------------------------------------------- -See `here `_. +See `here `_. But +that recipe is *extremely* slow-and-accurate. You can `tune +`_ it to compare huge, high-similarity genomes with +moderate run time and memory use: + +* Omit the sensitivity-boosting lastal -m option. + +* Add -W99 (or so) to the lastdb options. + +* If the "reference" genome (the one given to lastdb) is > 4 GB, it's + probably more efficient to use lastdb8 and lastal8, instead of + lastdb and lastal. Example 10: Ambiguity of alignment columns ------------------------------------------ diff -Nru last-align-961/doc/Makefile last-align-963/doc/Makefile --- last-align-961/doc/Makefile 2018-04-20 14:43:12.000000000 +0000 +++ last-align-963/doc/Makefile 2018-12-26 07:28:37.000000000 +0000 @@ -15,6 +15,7 @@ # Ugh! Is there a better way? RST_CSS = `locate html4css1.css | tail -n1` +#RST_CSS = html4css1.css RSTFLAGS = --initial-header-level=2 --no-compact-lists \ --no-compact-field-lists --option-limit=0 --no-doc-info diff -Nru last-align-961/src/tantan.cc last-align-963/src/tantan.cc --- last-align-961/src/tantan.cc 2015-01-30 10:22:29.000000000 +0000 +++ last-align-963/src/tantan.cc 2018-12-26 07:28:37.000000000 +0000 @@ -20,10 +20,10 @@ } double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) { - if (probMult < 1 || probMult > 1) + if (probMult < 1 || probMult > 1) { return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset)); - else - return 1.0 / maxRepeatOffset; + } + return 1.0 / maxRepeatOffset; } void checkForwardAndBackwardTotals(double fTot, double bTot) { @@ -64,6 +64,7 @@ double b2fLast; // background state to last foreground state double backgroundProb; + std::vector b2fProbs; // background state to each foreground state std::vector foregroundProbs; std::vector insertionProbs; @@ -99,7 +100,7 @@ //f2g = firstGapProb; //g2f = 1 - otherGapProb; oneGapProb = firstGapProb * (1 - otherGapProb); - endGapProb = firstGapProb * 1; + endGapProb = firstGapProb * (maxRepeatOffset > 1); f2f0 = 1 - repeatEndProb; f2f1 = 1 - repeatEndProb - firstGapProb; f2f2 = 1 - repeatEndProb - firstGapProb * 2; @@ -110,9 +111,16 @@ b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset); b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset); + b2fProbs.resize(maxRepeatOffset); foregroundProbs.resize(maxRepeatOffset); insertionProbs.resize(maxRepeatOffset - 1); + double p = b2fFirst; + for (int i = 0; i < maxRepeatOffset; ++i) { + b2fProbs[i] = p; + p *= b2fDecay; + } + scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); } @@ -125,8 +133,7 @@ double forwardTotal() { double fromForeground = std::accumulate(foregroundProbs.begin(), foregroundProbs.end(), 0.0); - fromForeground *= f2b; - double total = backgroundProb * b2b + fromForeground; + double total = backgroundProb * b2b + fromForeground * f2b; assert(total > 0); return total; } @@ -148,36 +155,31 @@ double f = *foregroundPtr; double fromForeground = f; - if (insertionProbs.empty()) { - *foregroundPtr = fromBackground + f * f2f0; - } else { - double *insertionPtr = &insertionProbs.back(); - double i = *insertionPtr; - *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; - double d = f; - --foregroundPtr; - fromBackground *= b2fGrowth; - - while (foregroundPtr > &foregroundProbs.front()) { - f = *foregroundPtr; - fromForeground += f; - i = *(insertionPtr - 1); - *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; - *insertionPtr = f + i * g2g; - d = f + d * g2g; - --foregroundPtr; - --insertionPtr; - fromBackground *= b2fGrowth; - } + double *insertionPtr = &insertionProbs.back(); + double i = *insertionPtr; + *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; + double d = f; + --foregroundPtr; + fromBackground *= b2fGrowth; + while (foregroundPtr > &foregroundProbs.front()) { f = *foregroundPtr; fromForeground += f; - *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; - *insertionPtr = f; + i = *(insertionPtr - 1); + *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; + *insertionPtr = f + i * g2g; + d = f + d * g2g; + --foregroundPtr; + --insertionPtr; + fromBackground *= b2fGrowth; } - fromForeground *= f2b; - backgroundProb = backgroundProb * b2b + fromForeground; + f = *foregroundPtr; + fromForeground += f; + *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; + *insertionPtr = f; + + backgroundProb = backgroundProb * b2b + fromForeground * f2b; } void calcBackwardTransitionProbsWithGaps() { @@ -186,57 +188,48 @@ double f = *foregroundPtr; double toForeground = f; - if (insertionProbs.empty()) { - *foregroundPtr = toBackground + f2f0 * f; - } else { - double *insertionPtr = &insertionProbs.front(); - double i = *insertionPtr; - *foregroundPtr = toBackground + f2f1 * f + i; - double d = endGapProb * f; - ++foregroundPtr; - toForeground *= b2fGrowth; - - while (foregroundPtr < &foregroundProbs.back()) { - f = *foregroundPtr; - toForeground += f; - i = *(insertionPtr + 1); - *foregroundPtr = toBackground + f2f2 * f + (i + d); - double oneGapProb_f = oneGapProb * f; - *insertionPtr = oneGapProb_f + g2g * i; - d = oneGapProb_f + g2g * d; - ++foregroundPtr; - ++insertionPtr; - toForeground *= b2fGrowth; - } + double *insertionPtr = &insertionProbs.front(); + double i = *insertionPtr; + *foregroundPtr = toBackground + f2f1 * f + i; + double d = endGapProb * f; + ++foregroundPtr; + toForeground *= b2fGrowth; + while (foregroundPtr < &foregroundProbs.back()) { f = *foregroundPtr; toForeground += f; - *foregroundPtr = toBackground + f2f1 * f + d; - *insertionPtr = endGapProb * f; + i = *(insertionPtr + 1); + *foregroundPtr = toBackground + f2f2 * f + (i + d); + double oneGapProb_f = oneGapProb * f; + *insertionPtr = oneGapProb_f + g2g * i; + d = oneGapProb_f + g2g * d; + ++foregroundPtr; + ++insertionPtr; + toForeground *= b2fGrowth; } - toForeground *= b2fLast; - backgroundProb = b2b * backgroundProb + toForeground; + f = *foregroundPtr; + toForeground += f; + *foregroundPtr = toBackground + f2f1 * f + d; + *insertionPtr = endGapProb * f; + + backgroundProb = b2b * backgroundProb + b2fLast * toForeground; } void calcForwardTransitionProbs() { if (endGapProb > 0) return calcForwardTransitionProbsWithGaps(); - double fromBackground = backgroundProb * b2fLast; + double b = backgroundProb; double fromForeground = 0; - double *foregroundPtr = END(foregroundProbs); double *foregroundBeg = BEG(foregroundProbs); - while (foregroundPtr > foregroundBeg) { - --foregroundPtr; - double f = *foregroundPtr; + for (int i = 0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; fromForeground += f; - *foregroundPtr = fromBackground + f * f2f0; - fromBackground *= b2fGrowth; + foregroundBeg[i] = b * b2fProbs[i] + f * f2f0; } - fromForeground *= f2b; - backgroundProb = backgroundProb * b2b + fromForeground; + backgroundProb = b * b2b + fromForeground * f2b; } void calcBackwardTransitionProbs() { @@ -244,18 +237,14 @@ double toBackground = f2b * backgroundProb; double toForeground = 0; - double *foregroundPtr = BEG(foregroundProbs); - double *foregroundEnd = END(foregroundProbs); + double *foregroundBeg = BEG(foregroundProbs); - while (foregroundPtr < foregroundEnd) { - toForeground *= b2fGrowth; - double f = *foregroundPtr; - toForeground += f; - *foregroundPtr = toBackground + f2f0 * f; - ++foregroundPtr; + for (int i = 0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; + toForeground += b2fProbs[i] * f; + foregroundBeg[i] = toBackground + f2f0 * f; } - toForeground *= b2fLast; backgroundProb = b2b * backgroundProb + toForeground; } @@ -281,12 +270,17 @@ } } - void calcEmissionProbs() { - const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + bool isNearSeqBeg() { + return seqPtr - seqBeg < maxRepeatOffset; + } - bool isNearSeqBeg = (seqPtr - seqBeg < maxRepeatOffset); - const uchar *seqStop = isNearSeqBeg ? seqBeg : seqPtr - maxRepeatOffset; + const uchar *seqFurthestBack() { + return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset; + } + void calcEmissionProbs() { + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + const uchar *seqStop = seqFurthestBack(); double *foregroundPtr = BEG(foregroundProbs); const uchar *offsetPtr = seqPtr; diff -Nru last-align-961/src/version.hh last-align-963/src/version.hh --- last-align-961/src/version.hh 2018-12-09 23:26:30.000000000 +0000 +++ last-align-963/src/version.hh 2018-12-26 07:28:37.000000000 +0000 @@ -1 +1 @@ -"961" +"963"