diff -Nru last-align-961/ChangeLog.txt last-align-963/ChangeLog.txt
--- last-align-961/ChangeLog.txt 2018-12-09 23:26:51.000000000 +0000
+++ last-align-963/ChangeLog.txt 2018-12-26 07:28:54.000000000 +0000
@@ -1,8 +1,19 @@
+2018-12-26 Martin C. Frith
+
+ * src/tantan.cc:
+ Make tantan repeat-finding faster
+ [44199bff9cef] [tip]
+
+ * doc/Makefile, doc/last-parallel.txt, doc/last-split.txt, doc/last-
+ tutorial.txt:
+ Update the documents a bit
+ [6727ae8b7a12]
+
2018-12-10 Martin C. Frith
* doc/last-dotplot.txt, scripts/last-dotplot:
Add last-dotplot --maxseqs option
- [ed0fb9b1eb40] [tip]
+ [ed0fb9b1eb40]
* src/split/last-split.cc:
Make last-split -n keep E-values
diff -Nru last-align-961/debian/changelog last-align-963/debian/changelog
--- last-align-961/debian/changelog 2018-12-17 10:13:03.000000000 +0000
+++ last-align-963/debian/changelog 2019-01-17 10:21:30.000000000 +0000
@@ -1,3 +1,20 @@
+last-align (963-2) unstable; urgency=medium
+
+ * Example files are not compressed any more - adapt autopkgtest
+ Closes: #919556
+
+ -- Andreas Tille Thu, 17 Jan 2019 11:21:30 +0100
+
+last-align (963-1) unstable; urgency=medium
+
+ * New upstream version
+ * debhelper 12
+ * Standards-Version: 4.3.0
+ * debian/tests/control: Replace needs-recommends by explicitly
+ adding recommended package python-pil to test depends
+
+ -- Andreas Tille Fri, 11 Jan 2019 22:49:18 +0100
+
last-align (961-1) unstable; urgency=medium
[ Jelmer Vernooij ]
diff -Nru last-align-961/debian/compat last-align-963/debian/compat
--- last-align-961/debian/compat 2018-12-17 10:13:03.000000000 +0000
+++ last-align-963/debian/compat 2019-01-17 10:21:30.000000000 +0000
@@ -1 +1 @@
-11
+12
diff -Nru last-align-961/debian/control last-align-963/debian/control
--- last-align-961/debian/control 2018-12-17 10:13:03.000000000 +0000
+++ last-align-963/debian/control 2019-01-17 10:21:30.000000000 +0000
@@ -4,11 +4,11 @@
Andreas Tille
Section: science
Priority: optional
-Build-Depends: debhelper (>= 11~),
+Build-Depends: debhelper (>= 12~),
help2man,
python-pil,
zlib1g-dev
-Standards-Version: 4.2.1
+Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/last-align
Vcs-Git: https://salsa.debian.org/med-team/last-align.git
Homepage: http://last.cbrc.jp/
diff -Nru last-align-961/debian/tests/control last-align-963/debian/tests/control
--- last-align-961/debian/tests/control 2018-12-17 10:13:03.000000000 +0000
+++ last-align-963/debian/tests/control 2019-01-17 10:21:30.000000000 +0000
@@ -1,3 +1,3 @@
Tests: run-unit-test
-Depends: @
-Restrictions: allow-stderr, needs-recommends
+Depends: @, python-pil
+Restrictions: allow-stderr
diff -Nru last-align-961/debian/tests/run-unit-test last-align-963/debian/tests/run-unit-test
--- last-align-961/debian/tests/run-unit-test 2018-12-17 10:13:03.000000000 +0000
+++ last-align-963/debian/tests/run-unit-test 2019-01-17 10:21:30.000000000 +0000
@@ -9,7 +9,7 @@
cd $ADTTMP
-cp -a /usr/share/doc/${pkg}/examples/*.fa.gz .
+cp -a /usr/share/doc/${pkg}/examples/*.fa* .
gunzip -r *
# first example from tutorial
diff -Nru last-align-961/doc/last-parallel.html last-align-963/doc/last-parallel.html
--- last-align-961/doc/last-parallel.html 2018-04-20 14:43:38.000000000 +0000
+++ last-align-963/doc/last-parallel.html 2018-12-26 07:28:59.000000000 +0000
@@ -364,11 +364,11 @@
Instead of this:
-lastal -Q1 -D100 db q.fastq | last-split > out.maf
+lastal -Q1 db q.fastq | last-split > out.maf
try this:
-parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
Instead of this:
diff -Nru last-align-961/doc/last-parallel.txt last-align-963/doc/last-parallel.txt
--- last-align-961/doc/last-parallel.txt 2017-09-22 10:01:15.000000000 +0000
+++ last-align-963/doc/last-parallel.txt 2018-12-26 07:28:37.000000000 +0000
@@ -53,11 +53,11 @@
Instead of this::
- lastal -Q1 -D100 db q.fastq | last-split > out.maf
+ lastal -Q1 db q.fastq | last-split > out.maf
try this::
- parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+ parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
Instead of this::
diff -Nru last-align-961/doc/last-split.html last-align-963/doc/last-split.html
--- last-align-961/doc/last-split.html 2018-08-01 11:00:29.000000000 +0000
+++ last-align-963/doc/last-split.html 2018-12-26 07:29:01.000000000 +0000
@@ -334,7 +334,7 @@
can do the alignment like this:
@@ -397,7 +397,7 @@
Going faster by parallelization
For example, split alignment of DNA reads to a genome:
-parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
This requires GNU parallel to be installed
(http://www.gnu.org/software/parallel/).
@@ -558,7 +558,7 @@
splice signals or favouring cis-splices:
lastdb -uNEAR -R01 db genome.fasta
-lastal -Q1 -D100 db q.fastq | last-split -c0 > out.maf
+lastal -Q1 db q.fastq | last-split -c0 > out.maf
diff -Nru last-align-961/doc/last-split.txt last-align-963/doc/last-split.txt
--- last-align-961/doc/last-split.txt 2018-08-01 11:00:06.000000000 +0000
+++ last-align-963/doc/last-split.txt 2018-12-26 07:28:37.000000000 +0000
@@ -21,7 +21,7 @@
can do the alignment like this::
lastdb -uNEAR -R01 db genome.fasta
- lastal -Q1 -D100 db q.fastq | last-split > out.maf
+ lastal -Q1 db q.fastq | last-split > out.maf
Spliced alignment of RNA reads to a genome
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -78,7 +78,7 @@
For example, split alignment of DNA reads to a genome::
- parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+ parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
This requires GNU parallel to be installed
(http://www.gnu.org/software/parallel/).
@@ -173,7 +173,7 @@
splice signals or favouring cis-splices::
lastdb -uNEAR -R01 db genome.fasta
- lastal -Q1 -D100 db q.fastq | last-split -c0 > out.maf
+ lastal -Q1 db q.fastq | last-split -c0 > out.maf
Options
-------
diff -Nru last-align-961/doc/last-tutorial.html last-align-963/doc/last-tutorial.html
--- last-align-961/doc/last-tutorial.html 2018-04-20 14:43:41.000000000 +0000
+++ last-align-963/doc/last-tutorial.html 2018-12-26 07:29:03.000000000 +0000
@@ -379,20 +379,28 @@
lastal -F15 protdb dnas.fa
-
-
Example 4: Find short protein alignments
+
+
Example 4: Find high-similarity, and short, protein alignments
LAST uses a scoring scheme to find
-similarities. Some scoring schemes are good for long-and-weak
-similarities, others for short-and-strong similarities. If we seek
-very short similarities, weak ones are hopeless (statistically
-insignificant), so we had better focus on strong ones. The PAM30
-scoring scheme may work well:
+similarities. Some scoring schemes are tuned for weak similarities,
+others for strong similarities. The PAM30 scoring scheme finds strong
+protein similarities:
lastdb -p -cR01 invdb invertebrate.fa
lastal -pPAM30 invdb vertebrate.fa
-
(How short is "very short"? It depends on the amount of sequence data
-we are searching, but perhaps roughly less than 40 amino acids.)
+
This has two advantages:
+
+It omits weak alignments, or alignment parts (occasionally a strong
+similarity is flanked by a weak similarity).
+
+It can find short similarities. If we seek very short similarities,
+weak ones are hopeless (statistically insignificant), so we had
+better focus on strong ones. (How short is "very short"? It
+depends on the amount of sequence data we are searching, but perhaps
+roughly less than 40 amino acids.)
+
+
Example 5: Align human DNA sequences to the human genome
@@ -446,13 +454,14 @@
DNA sequences are not always perfectly accurate, and they are
sometimes provided in fastq format, which indicates the reliability of
each base. LAST can use this information to improve alignment
-accuracy. (It assumes the reliabilities reflect substitution errors,
-not insertion/deletion errors: if that is not true, it may be better
-to use fasta format.) Option -Q1 indicates fastq-sanger format:
+accuracy. Option -Q1 indicates fastq-sanger format:
lastdb -uNEAR -R01 humandb human/chr*.fa
-lastal -Q1 -D100 humandb queries.fastq | last-split > myalns.maf
+lastal -Q1 humandb queries.fastq | last-split > myalns.maf
+
Assumption: LAST assumes the reliabilities reflect substitution
+errors, not insertion/deletion errors. If that is not true, you can
+tell it to ignore the reliability data with -Q0.
Fastq format confusion
@@ -490,7 +499,19 @@
Example 9: Compare the human and chimp genomes
-
See here.
+
See here. But
+that recipe is extremely slow-and-accurate. You can tune it to compare huge, high-similarity genomes with
+moderate run time and memory use:
+
+Omit the sensitivity-boosting lastal -m option.
+
+Add -W99 (or so) to the lastdb options.
+
+If the "reference" genome (the one given to lastdb) is > 4 GB, it's
+probably more efficient to use lastdb8 and lastal8, instead of
+lastdb and lastal.
+
+
Example 10: Ambiguity of alignment columns
diff -Nru last-align-961/doc/last-tutorial.txt last-align-963/doc/last-tutorial.txt
--- last-align-961/doc/last-tutorial.txt 2017-06-02 10:27:21.000000000 +0000
+++ last-align-963/doc/last-tutorial.txt 2018-12-26 07:28:37.000000000 +0000
@@ -70,21 +70,27 @@
lastdb -p -cR01 protdb proteins.fa
lastal -F15 protdb dnas.fa
-Example 4: Find short protein alignments
-----------------------------------------
+Example 4: Find high-similarity, and short, protein alignments
+--------------------------------------------------------------
LAST uses a `scoring scheme `_ to find
-similarities. Some scoring schemes are good for long-and-weak
-similarities, others for short-and-strong similarities. If we seek
-very short similarities, weak ones are hopeless (statistically
-insignificant), so we had better focus on strong ones. The PAM30
-scoring scheme may work well::
+similarities. Some scoring schemes are tuned for weak similarities,
+others for strong similarities. The PAM30 scoring scheme finds strong
+protein similarities::
lastdb -p -cR01 invdb invertebrate.fa
lastal -pPAM30 invdb vertebrate.fa
-(How short is "very short"? It depends on the amount of sequence data
-we are searching, but perhaps roughly less than 40 amino acids.)
+This has two advantages:
+
+* It omits weak alignments, or alignment parts (occasionally a strong
+ similarity is flanked by a weak similarity).
+
+* It can find short similarities. If we seek very short similarities,
+ weak ones are hopeless (statistically insignificant), so we had
+ better focus on strong ones. (How short is "very short"? It
+ depends on the amount of sequence data we are searching, but perhaps
+ roughly less than 40 amino acids.)
Example 5: Align human DNA sequences to the human genome
--------------------------------------------------------
@@ -138,12 +144,14 @@
DNA sequences are not always perfectly accurate, and they are
sometimes provided in fastq format, which indicates the reliability of
each base. LAST can use this information to improve alignment
-accuracy. (It assumes the reliabilities reflect substitution errors,
-not insertion/deletion errors: if that is not true, it may be better
-to use fasta format.) Option -Q1 indicates fastq-sanger format::
+accuracy. Option -Q1 indicates fastq-sanger format::
lastdb -uNEAR -R01 humandb human/chr*.fa
- lastal -Q1 -D100 humandb queries.fastq | last-split > myalns.maf
+ lastal -Q1 humandb queries.fastq | last-split > myalns.maf
+
+**Assumption:** LAST assumes the reliabilities reflect substitution
+errors, not insertion/deletion errors. If that is not true, you can
+tell it to ignore the reliability data with -Q0.
Fastq format confusion
----------------------
@@ -183,7 +191,18 @@
Example 9: Compare the human and chimp genomes
----------------------------------------------
-See `here `_.
+See `here `_. But
+that recipe is *extremely* slow-and-accurate. You can `tune
+`_ it to compare huge, high-similarity genomes with
+moderate run time and memory use:
+
+* Omit the sensitivity-boosting lastal -m option.
+
+* Add -W99 (or so) to the lastdb options.
+
+* If the "reference" genome (the one given to lastdb) is > 4 GB, it's
+ probably more efficient to use lastdb8 and lastal8, instead of
+ lastdb and lastal.
Example 10: Ambiguity of alignment columns
------------------------------------------
diff -Nru last-align-961/doc/Makefile last-align-963/doc/Makefile
--- last-align-961/doc/Makefile 2018-04-20 14:43:12.000000000 +0000
+++ last-align-963/doc/Makefile 2018-12-26 07:28:37.000000000 +0000
@@ -15,6 +15,7 @@
# Ugh! Is there a better way?
RST_CSS = `locate html4css1.css | tail -n1`
+#RST_CSS = html4css1.css
RSTFLAGS = --initial-header-level=2 --no-compact-lists \
--no-compact-field-lists --option-limit=0 --no-doc-info
diff -Nru last-align-961/src/tantan.cc last-align-963/src/tantan.cc
--- last-align-961/src/tantan.cc 2015-01-30 10:22:29.000000000 +0000
+++ last-align-963/src/tantan.cc 2018-12-26 07:28:37.000000000 +0000
@@ -20,10 +20,10 @@
}
double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) {
- if (probMult < 1 || probMult > 1)
+ if (probMult < 1 || probMult > 1) {
return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset));
- else
- return 1.0 / maxRepeatOffset;
+ }
+ return 1.0 / maxRepeatOffset;
}
void checkForwardAndBackwardTotals(double fTot, double bTot) {
@@ -64,6 +64,7 @@
double b2fLast; // background state to last foreground state
double backgroundProb;
+ std::vector b2fProbs; // background state to each foreground state
std::vector foregroundProbs;
std::vector insertionProbs;
@@ -99,7 +100,7 @@
//f2g = firstGapProb;
//g2f = 1 - otherGapProb;
oneGapProb = firstGapProb * (1 - otherGapProb);
- endGapProb = firstGapProb * 1;
+ endGapProb = firstGapProb * (maxRepeatOffset > 1);
f2f0 = 1 - repeatEndProb;
f2f1 = 1 - repeatEndProb - firstGapProb;
f2f2 = 1 - repeatEndProb - firstGapProb * 2;
@@ -110,9 +111,16 @@
b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset);
b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset);
+ b2fProbs.resize(maxRepeatOffset);
foregroundProbs.resize(maxRepeatOffset);
insertionProbs.resize(maxRepeatOffset - 1);
+ double p = b2fFirst;
+ for (int i = 0; i < maxRepeatOffset; ++i) {
+ b2fProbs[i] = p;
+ p *= b2fDecay;
+ }
+
scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize);
}
@@ -125,8 +133,7 @@
double forwardTotal() {
double fromForeground = std::accumulate(foregroundProbs.begin(),
foregroundProbs.end(), 0.0);
- fromForeground *= f2b;
- double total = backgroundProb * b2b + fromForeground;
+ double total = backgroundProb * b2b + fromForeground * f2b;
assert(total > 0);
return total;
}
@@ -148,36 +155,31 @@
double f = *foregroundPtr;
double fromForeground = f;
- if (insertionProbs.empty()) {
- *foregroundPtr = fromBackground + f * f2f0;
- } else {
- double *insertionPtr = &insertionProbs.back();
- double i = *insertionPtr;
- *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb;
- double d = f;
- --foregroundPtr;
- fromBackground *= b2fGrowth;
-
- while (foregroundPtr > &foregroundProbs.front()) {
- f = *foregroundPtr;
- fromForeground += f;
- i = *(insertionPtr - 1);
- *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb;
- *insertionPtr = f + i * g2g;
- d = f + d * g2g;
- --foregroundPtr;
- --insertionPtr;
- fromBackground *= b2fGrowth;
- }
+ double *insertionPtr = &insertionProbs.back();
+ double i = *insertionPtr;
+ *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb;
+ double d = f;
+ --foregroundPtr;
+ fromBackground *= b2fGrowth;
+ while (foregroundPtr > &foregroundProbs.front()) {
f = *foregroundPtr;
fromForeground += f;
- *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb;
- *insertionPtr = f;
+ i = *(insertionPtr - 1);
+ *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb;
+ *insertionPtr = f + i * g2g;
+ d = f + d * g2g;
+ --foregroundPtr;
+ --insertionPtr;
+ fromBackground *= b2fGrowth;
}
- fromForeground *= f2b;
- backgroundProb = backgroundProb * b2b + fromForeground;
+ f = *foregroundPtr;
+ fromForeground += f;
+ *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb;
+ *insertionPtr = f;
+
+ backgroundProb = backgroundProb * b2b + fromForeground * f2b;
}
void calcBackwardTransitionProbsWithGaps() {
@@ -186,57 +188,48 @@
double f = *foregroundPtr;
double toForeground = f;
- if (insertionProbs.empty()) {
- *foregroundPtr = toBackground + f2f0 * f;
- } else {
- double *insertionPtr = &insertionProbs.front();
- double i = *insertionPtr;
- *foregroundPtr = toBackground + f2f1 * f + i;
- double d = endGapProb * f;
- ++foregroundPtr;
- toForeground *= b2fGrowth;
-
- while (foregroundPtr < &foregroundProbs.back()) {
- f = *foregroundPtr;
- toForeground += f;
- i = *(insertionPtr + 1);
- *foregroundPtr = toBackground + f2f2 * f + (i + d);
- double oneGapProb_f = oneGapProb * f;
- *insertionPtr = oneGapProb_f + g2g * i;
- d = oneGapProb_f + g2g * d;
- ++foregroundPtr;
- ++insertionPtr;
- toForeground *= b2fGrowth;
- }
+ double *insertionPtr = &insertionProbs.front();
+ double i = *insertionPtr;
+ *foregroundPtr = toBackground + f2f1 * f + i;
+ double d = endGapProb * f;
+ ++foregroundPtr;
+ toForeground *= b2fGrowth;
+ while (foregroundPtr < &foregroundProbs.back()) {
f = *foregroundPtr;
toForeground += f;
- *foregroundPtr = toBackground + f2f1 * f + d;
- *insertionPtr = endGapProb * f;
+ i = *(insertionPtr + 1);
+ *foregroundPtr = toBackground + f2f2 * f + (i + d);
+ double oneGapProb_f = oneGapProb * f;
+ *insertionPtr = oneGapProb_f + g2g * i;
+ d = oneGapProb_f + g2g * d;
+ ++foregroundPtr;
+ ++insertionPtr;
+ toForeground *= b2fGrowth;
}
- toForeground *= b2fLast;
- backgroundProb = b2b * backgroundProb + toForeground;
+ f = *foregroundPtr;
+ toForeground += f;
+ *foregroundPtr = toBackground + f2f1 * f + d;
+ *insertionPtr = endGapProb * f;
+
+ backgroundProb = b2b * backgroundProb + b2fLast * toForeground;
}
void calcForwardTransitionProbs() {
if (endGapProb > 0) return calcForwardTransitionProbsWithGaps();
- double fromBackground = backgroundProb * b2fLast;
+ double b = backgroundProb;
double fromForeground = 0;
- double *foregroundPtr = END(foregroundProbs);
double *foregroundBeg = BEG(foregroundProbs);
- while (foregroundPtr > foregroundBeg) {
- --foregroundPtr;
- double f = *foregroundPtr;
+ for (int i = 0; i < maxRepeatOffset; ++i) {
+ double f = foregroundBeg[i];
fromForeground += f;
- *foregroundPtr = fromBackground + f * f2f0;
- fromBackground *= b2fGrowth;
+ foregroundBeg[i] = b * b2fProbs[i] + f * f2f0;
}
- fromForeground *= f2b;
- backgroundProb = backgroundProb * b2b + fromForeground;
+ backgroundProb = b * b2b + fromForeground * f2b;
}
void calcBackwardTransitionProbs() {
@@ -244,18 +237,14 @@
double toBackground = f2b * backgroundProb;
double toForeground = 0;
- double *foregroundPtr = BEG(foregroundProbs);
- double *foregroundEnd = END(foregroundProbs);
+ double *foregroundBeg = BEG(foregroundProbs);
- while (foregroundPtr < foregroundEnd) {
- toForeground *= b2fGrowth;
- double f = *foregroundPtr;
- toForeground += f;
- *foregroundPtr = toBackground + f2f0 * f;
- ++foregroundPtr;
+ for (int i = 0; i < maxRepeatOffset; ++i) {
+ double f = foregroundBeg[i];
+ toForeground += b2fProbs[i] * f;
+ foregroundBeg[i] = toBackground + f2f0 * f;
}
- toForeground *= b2fLast;
backgroundProb = b2b * backgroundProb + toForeground;
}
@@ -281,12 +270,17 @@
}
}
- void calcEmissionProbs() {
- const double *lrRow = likelihoodRatioMatrix[*seqPtr];
+ bool isNearSeqBeg() {
+ return seqPtr - seqBeg < maxRepeatOffset;
+ }
- bool isNearSeqBeg = (seqPtr - seqBeg < maxRepeatOffset);
- const uchar *seqStop = isNearSeqBeg ? seqBeg : seqPtr - maxRepeatOffset;
+ const uchar *seqFurthestBack() {
+ return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset;
+ }
+ void calcEmissionProbs() {
+ const double *lrRow = likelihoodRatioMatrix[*seqPtr];
+ const uchar *seqStop = seqFurthestBack();
double *foregroundPtr = BEG(foregroundProbs);
const uchar *offsetPtr = seqPtr;
diff -Nru last-align-961/src/version.hh last-align-963/src/version.hh
--- last-align-961/src/version.hh 2018-12-09 23:26:30.000000000 +0000
+++ last-align-963/src/version.hh 2018-12-26 07:28:37.000000000 +0000
@@ -1 +1 @@
-"961"
+"963"