Binary files /tmp/tmpUT00p8/oECFLFUvmt/r-bioc-dada2-1.16.0+dfsg/build/vignette.rds and /tmp/tmpUT00p8/NapwzYeCID/r-bioc-dada2-1.18.0+dfsg/build/vignette.rds differ diff -Nru r-bioc-dada2-1.16.0+dfsg/debian/changelog r-bioc-dada2-1.18.0+dfsg/debian/changelog --- r-bioc-dada2-1.16.0+dfsg/debian/changelog 2020-09-30 13:51:38.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/debian/changelog 2020-11-03 12:19:13.000000000 +0000 @@ -1,3 +1,10 @@ +r-bioc-dada2 (1.18.0+dfsg-1) unstable; urgency=medium + + * Team upload. + * d/copyright: pval.c was removed from source + + -- Andreas Tille Tue, 03 Nov 2020 13:19:13 +0100 + r-bioc-dada2 (1.16.0+dfsg-2) unstable; urgency=medium * Team upload. diff -Nru r-bioc-dada2-1.16.0+dfsg/debian/control r-bioc-dada2-1.18.0+dfsg/debian/control --- r-bioc-dada2-1.16.0+dfsg/debian/control 2020-09-30 13:51:38.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/debian/control 2020-11-03 12:19:13.000000000 +0000 @@ -12,7 +12,7 @@ r-cran-ggplot2 (>= 2.1.0), r-cran-reshape2 (>= 1.4.1), r-bioc-shortread (>= 1.32.0), - r-cran-rcppparallel (>= 5.0.2+dfsg-3), + r-cran-rcppparallel, r-bioc-iranges (>= 2.6.0), r-bioc-xvector (>= 0.16.0), r-bioc-biocgenerics (>= 0.22.0) @@ -28,8 +28,7 @@ ${shlibs:Depends}, ${misc:Depends} Recommends: ${R:Recommends} -Suggests: ${R:Suggests}, - libjs-jquery +Suggests: ${R:Suggests} Description: sample inference from amplicon sequencing data The dada2 package contributes to software workflows to interpret sequencing data from microbiota - the relative abundance of diff -Nru r-bioc-dada2-1.16.0+dfsg/debian/copyright r-bioc-dada2-1.18.0+dfsg/debian/copyright --- r-bioc-dada2-1.16.0+dfsg/debian/copyright 2020-09-30 13:51:38.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/debian/copyright 2020-11-03 12:19:13.000000000 +0000 @@ -11,15 +11,6 @@ Susan Holmes License: LGPL-3 -Files: src/pval.c -Copyright: 2019 Benjamin Callahan - 2005-6 Morten Welinder - 2005-10 The R Foundation - 2006-2015 The R Core Team - 1998 Ross Ihaka - 2000 The R Core Team -License: GPL-2.0+ - Files: debian/* Copyright: 2019 Steffen Moeller License: LGPL-3 diff -Nru r-bioc-dada2-1.16.0+dfsg/DESCRIPTION r-bioc-dada2-1.18.0+dfsg/DESCRIPTION --- r-bioc-dada2-1.16.0+dfsg/DESCRIPTION 2020-04-28 02:15:51.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/DESCRIPTION 2020-10-27 23:34:15.000000000 +0000 @@ -9,8 +9,8 @@ removing substitution and chimera errors. Taxonomic classification is available via a native implementation of the RDP naive Bayesian classifier, and species-level assignment to 16S rRNA gene fragments by exact matching. -Version: 1.16.0 -Date: 2020-04-07 +Version: 1.18.0 +Date: 2020-08-07 Maintainer: Benjamin Callahan Author: Benjamin Callahan , Paul McMurdie, Susan Holmes License: LGPL-3 @@ -33,11 +33,11 @@ 'dada.R' 'errorModels.R' 'filter.R' 'misc.R' 'multiSample.R' 'paired.R' 'plot-methods.R' 'sequenceIO.R' 'show-methods.R' 'taxonomy.R' -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 git_url: https://git.bioconductor.org/packages/dada2 -git_branch: RELEASE_3_11 -git_last_commit: 8bedfb4 -git_last_commit_date: 2020-04-27 -Date/Publication: 2020-04-27 +git_branch: RELEASE_3_12 +git_last_commit: a20a676 +git_last_commit_date: 2020-10-27 +Date/Publication: 2020-10-27 NeedsCompilation: yes -Packaged: 2020-04-28 02:15:51 UTC; biocbuild +Packaged: 2020-10-27 23:34:15 UTC; biocbuild diff -Nru r-bioc-dada2-1.16.0+dfsg/man/addSpecies.Rd r-bioc-dada2-1.18.0+dfsg/man/addSpecies.Rd --- r-bioc-dada2-1.16.0+dfsg/man/addSpecies.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/addSpecies.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,14 @@ \alias{addSpecies} \title{Add species-level annotation to a taxonomic table.} \usage{ -addSpecies(taxtab, refFasta, allowMultiple = FALSE, tryRC = FALSE, - n = 2000, verbose = FALSE) +addSpecies( + taxtab, + refFasta, + allowMultiple = FALSE, + tryRC = FALSE, + n = 2000, + verbose = FALSE +) } \arguments{ \item{taxtab}{(Required). A taxonomic table, the output of \code{\link{assignTaxonomy}}.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/assignSpecies.Rd r-bioc-dada2-1.18.0+dfsg/man/assignSpecies.Rd --- r-bioc-dada2-1.16.0+dfsg/man/assignSpecies.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/assignSpecies.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,14 @@ \alias{assignSpecies} \title{Taxonomic assignment to the species level by exact matching.} \usage{ -assignSpecies(seqs, refFasta, allowMultiple = FALSE, tryRC = FALSE, - n = 2000, verbose = FALSE) +assignSpecies( + seqs, + refFasta, + allowMultiple = FALSE, + tryRC = FALSE, + n = 2000, + verbose = FALSE +) } \arguments{ \item{seqs}{(Required). A character vector of the sequences to be assigned, or an object diff -Nru r-bioc-dada2-1.16.0+dfsg/man/assignTaxonomy.Rd r-bioc-dada2-1.18.0+dfsg/man/assignTaxonomy.Rd --- r-bioc-dada2-1.16.0+dfsg/man/assignTaxonomy.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/assignTaxonomy.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,10 +4,16 @@ \alias{assignTaxonomy} \title{Classifies sequences against reference training dataset.} \usage{ -assignTaxonomy(seqs, refFasta, minBoot = 50, tryRC = FALSE, - outputBootstraps = FALSE, taxLevels = c("Kingdom", "Phylum", "Class", - "Order", "Family", "Genus", "Species"), multithread = FALSE, - verbose = FALSE) +assignTaxonomy( + seqs, + refFasta, + minBoot = 50, + tryRC = FALSE, + outputBootstraps = FALSE, + taxLevels = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"), + multithread = FALSE, + verbose = FALSE +) } \arguments{ \item{seqs}{(Required). A character vector of the sequences to be assigned, or an object diff -Nru r-bioc-dada2-1.16.0+dfsg/man/c-dada-method.Rd r-bioc-dada2-1.18.0+dfsg/man/c-dada-method.Rd --- r-bioc-dada2-1.16.0+dfsg/man/c-dada-method.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/c-dada-method.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/show-methods.R -\docType{methods} \name{c,dada-method} \alias{c,dada-method} \title{Change concatenation of dada-class objects to list construction.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/c-derep-method.Rd r-bioc-dada2-1.18.0+dfsg/man/c-derep-method.Rd --- r-bioc-dada2-1.16.0+dfsg/man/c-derep-method.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/c-derep-method.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/show-methods.R -\docType{methods} \name{c,derep-method} \alias{c,derep-method} \title{Change concatenation of derep-class objects to list construction.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/collapseNoMismatch.Rd r-bioc-dada2-1.18.0+dfsg/man/collapseNoMismatch.Rd --- r-bioc-dada2-1.16.0+dfsg/man/collapseNoMismatch.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/collapseNoMismatch.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,15 @@ \alias{collapseNoMismatch} \title{Combine together sequences that are identical up to shifts and/or length.} \usage{ -collapseNoMismatch(seqtab, minOverlap = 20, orderBy = "abundance", - identicalOnly = FALSE, vec = TRUE, band = -1, verbose = FALSE) +collapseNoMismatch( + seqtab, + minOverlap = 20, + orderBy = "abundance", + identicalOnly = FALSE, + vec = TRUE, + band = -1, + verbose = FALSE +) } \arguments{ \item{seqtab}{(Required). A sample by sequence matrix, the return of \code{\link{makeSequenceTable}}.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/dada.Rd r-bioc-dada2-1.18.0+dfsg/man/dada.Rd --- r-bioc-dada2-1.16.0+dfsg/man/dada.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/dada.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,9 +4,17 @@ \alias{dada} \title{High resolution sample inference from amplicon data.} \usage{ -dada(derep, err, errorEstimationFunction = loessErrfun, - selfConsist = FALSE, pool = FALSE, priors = character(0), - multithread = FALSE, verbose = TRUE, ...) +dada( + derep, + err, + errorEstimationFunction = loessErrfun, + selfConsist = FALSE, + pool = FALSE, + priors = character(0), + multithread = FALSE, + verbose = TRUE, + ... +) } \arguments{ \item{derep}{(Required). \code{character} or \code{\link{derep-class}}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/derepFastq.Rd r-bioc-dada2-1.18.0+dfsg/man/derepFastq.Rd --- r-bioc-dada2-1.16.0+dfsg/man/derepFastq.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/derepFastq.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -42,7 +42,7 @@ # Test that chunk-size, `n`, does not affect the result. testFastq = system.file("extdata", "sam1F.fastq.gz", package="dada2") derep1 = derepFastq(testFastq, verbose = TRUE) -derep1.35 = derepFastq(testFastq, 35, TRUE) +derep1.35 = derepFastq(testFastq, n = 35, verbose = TRUE) all.equal(getUniques(derep1), getUniques(derep1.35)[names(getUniques(derep1))]) } diff -Nru r-bioc-dada2-1.16.0+dfsg/man/errBalancedF.Rd r-bioc-dada2-1.18.0+dfsg/man/errBalancedF.Rd --- r-bioc-dada2-1.16.0+dfsg/man/errBalancedF.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/errBalancedF.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -3,9 +3,11 @@ \name{errBalancedF} \alias{errBalancedF} \title{An empirical error matrix.} -\format{A numerical matrix with 16 rows and 41 columns. +\format{ +A numerical matrix with 16 rows and 41 columns. Rows correspond to the 16 transition (eg. A2A, A2C, ...) - Columns correspond to consensus quality scores 0 to 40.} + Columns correspond to consensus quality scores 0 to 40. +} \description{ A dataset containing the error matrix estimated by DADA2 from the forward reads of the Illumina Miseq 2x250 sequenced Balanced mock community (see manuscript). diff -Nru r-bioc-dada2-1.16.0+dfsg/man/errBalancedR.Rd r-bioc-dada2-1.18.0+dfsg/man/errBalancedR.Rd --- r-bioc-dada2-1.16.0+dfsg/man/errBalancedR.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/errBalancedR.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -3,9 +3,11 @@ \name{errBalancedR} \alias{errBalancedR} \title{An empirical error matrix.} -\format{A numerical matrix with 16 rows and 41 columns. +\format{ +A numerical matrix with 16 rows and 41 columns. Rows correspond to the 16 transition (eg. A2A, A2C, ...) - Columns correspond to consensus quality scores 0 to 40.} + Columns correspond to consensus quality scores 0 to 40. +} \description{ A dataset containing the error matrix estimated by DADA2 from the reverse reads of the Illumina Miseq 2x250 sequenced Balanced mock community (see manuscript). diff -Nru r-bioc-dada2-1.16.0+dfsg/man/fastqFilter.Rd r-bioc-dada2-1.18.0+dfsg/man/fastqFilter.Rd --- r-bioc-dada2-1.16.0+dfsg/man/fastqFilter.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/fastqFilter.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,11 +4,28 @@ \alias{fastqFilter} \title{Filter and trim a fastq file.} \usage{ -fastqFilter(fn, fout, truncQ = 2, truncLen = 0, maxLen = Inf, - minLen = 20, trimLeft = 0, trimRight = 0, maxN = 0, minQ = 0, - maxEE = Inf, rm.phix = TRUE, rm.lowcomplex = 0, - orient.fwd = NULL, n = 1e+06, OMP = TRUE, qualityType = "Auto", - compress = TRUE, verbose = FALSE, ...) +fastqFilter( + fn, + fout, + truncQ = 2, + truncLen = 0, + maxLen = Inf, + minLen = 20, + trimLeft = 0, + trimRight = 0, + maxN = 0, + minQ = 0, + maxEE = Inf, + rm.phix = TRUE, + rm.lowcomplex = 0, + orient.fwd = NULL, + n = 1e+06, + OMP = TRUE, + qualityType = "Auto", + compress = TRUE, + verbose = FALSE, + ... +) } \arguments{ \item{fn}{(Required). The path to the input fastq file.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/fastqPairedFilter.Rd r-bioc-dada2-1.18.0+dfsg/man/fastqPairedFilter.Rd --- r-bioc-dada2-1.16.0+dfsg/man/fastqPairedFilter.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/fastqPairedFilter.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,13 +4,31 @@ \alias{fastqPairedFilter} \title{Filters and trims paired forward and reverse fastq files.} \usage{ -fastqPairedFilter(fn, fout, maxN = c(0, 0), truncQ = c(2, 2), - truncLen = c(0, 0), maxLen = c(Inf, Inf), minLen = c(20, 20), - trimLeft = c(0, 0), trimRight = c(0, 0), minQ = c(0, 0), - maxEE = c(Inf, Inf), rm.phix = c(TRUE, TRUE), rm.lowcomplex = c(0, - 0), matchIDs = FALSE, orient.fwd = NULL, id.sep = "\\\\s", - id.field = NULL, n = 1e+06, OMP = TRUE, qualityType = "Auto", - compress = TRUE, verbose = FALSE, ...) +fastqPairedFilter( + fn, + fout, + maxN = c(0, 0), + truncQ = c(2, 2), + truncLen = c(0, 0), + maxLen = c(Inf, Inf), + minLen = c(20, 20), + trimLeft = c(0, 0), + trimRight = c(0, 0), + minQ = c(0, 0), + maxEE = c(Inf, Inf), + rm.phix = c(TRUE, TRUE), + rm.lowcomplex = c(0, 0), + matchIDs = FALSE, + orient.fwd = NULL, + id.sep = "\\\\s", + id.field = NULL, + n = 1e+06, + OMP = TRUE, + qualityType = "Auto", + compress = TRUE, + verbose = FALSE, + ... +) } \arguments{ \item{fn}{(Required). A \code{character(2)} naming the paths to the (forward,reverse) fastq files.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/filterAndTrim.Rd r-bioc-dada2-1.18.0+dfsg/man/filterAndTrim.Rd --- r-bioc-dada2-1.16.0+dfsg/man/filterAndTrim.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/filterAndTrim.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,13 +4,33 @@ \alias{filterAndTrim} \title{Filter and trim fastq file(s).} \usage{ -filterAndTrim(fwd, filt, rev = NULL, filt.rev = NULL, - compress = TRUE, truncQ = 2, truncLen = 0, trimLeft = 0, - trimRight = 0, maxLen = Inf, minLen = 20, maxN = 0, minQ = 0, - maxEE = Inf, rm.phix = TRUE, rm.lowcomplex = 0, - orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", - id.field = NULL, multithread = FALSE, n = 1e+05, OMP = TRUE, - qualityType = "Auto", verbose = FALSE) +filterAndTrim( + fwd, + filt, + rev = NULL, + filt.rev = NULL, + compress = TRUE, + truncQ = 2, + truncLen = 0, + trimLeft = 0, + trimRight = 0, + maxLen = Inf, + minLen = 20, + maxN = 0, + minQ = 0, + maxEE = Inf, + rm.phix = TRUE, + rm.lowcomplex = 0, + orient.fwd = NULL, + matchIDs = FALSE, + id.sep = "\\\\s", + id.field = NULL, + multithread = FALSE, + n = 1e+05, + OMP = TRUE, + qualityType = "Auto", + verbose = FALSE +) } \arguments{ \item{fwd}{(Required). \code{character}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/isBimeraDenovo.Rd r-bioc-dada2-1.18.0+dfsg/man/isBimeraDenovo.Rd --- r-bioc-dada2-1.16.0+dfsg/man/isBimeraDenovo.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/isBimeraDenovo.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,10 +4,16 @@ \alias{isBimeraDenovo} \title{Identify bimeras from collections of unique sequences.} \usage{ -isBimeraDenovo(unqs, minFoldParentOverAbundance = 2, - minParentAbundance = 8, allowOneOff = FALSE, - minOneOffParentDistance = 4, maxShift = 16, multithread = FALSE, - verbose = FALSE) +isBimeraDenovo( + unqs, + minFoldParentOverAbundance = 2, + minParentAbundance = 8, + allowOneOff = FALSE, + minOneOffParentDistance = 4, + maxShift = 16, + multithread = FALSE, + verbose = FALSE +) } \arguments{ \item{unqs}{(Required). A \code{\link{uniques-vector}} or any object that can be coerced diff -Nru r-bioc-dada2-1.16.0+dfsg/man/isBimeraDenovoTable.Rd r-bioc-dada2-1.18.0+dfsg/man/isBimeraDenovoTable.Rd --- r-bioc-dada2-1.16.0+dfsg/man/isBimeraDenovoTable.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/isBimeraDenovoTable.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,11 +4,18 @@ \alias{isBimeraDenovoTable} \title{Identify bimeras in a sequence table.} \usage{ -isBimeraDenovoTable(seqtab, minSampleFraction = 0.9, - ignoreNNegatives = 1, minFoldParentOverAbundance = 1.5, - minParentAbundance = 2, allowOneOff = FALSE, - minOneOffParentDistance = 4, maxShift = 16, multithread = FALSE, - verbose = FALSE) +isBimeraDenovoTable( + seqtab, + minSampleFraction = 0.9, + ignoreNNegatives = 1, + minFoldParentOverAbundance = 1.5, + minParentAbundance = 2, + allowOneOff = FALSE, + minOneOffParentDistance = 4, + maxShift = 16, + multithread = FALSE, + verbose = FALSE +) } \arguments{ \item{seqtab}{(Required). A sequence table. That is, an integer matrix with colnames diff -Nru r-bioc-dada2-1.16.0+dfsg/man/isBimera.Rd r-bioc-dada2-1.18.0+dfsg/man/isBimera.Rd --- r-bioc-dada2-1.16.0+dfsg/man/isBimera.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/isBimera.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,13 @@ \alias{isBimera} \title{Determine if input sequence is a bimera of putative parent sequences.} \usage{ -isBimera(sq, parents, allowOneOff = FALSE, minOneOffParentDistance = 4, - maxShift = 16) +isBimera( + sq, + parents, + allowOneOff = FALSE, + minOneOffParentDistance = 4, + maxShift = 16 +) } \arguments{ \item{sq}{(Required). A \code{character(1)}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/isPhiX.Rd r-bioc-dada2-1.18.0+dfsg/man/isPhiX.Rd --- r-bioc-dada2-1.16.0+dfsg/man/isPhiX.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/isPhiX.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,7 @@ \alias{isPhiX} \title{Determine if input sequence(s) match the phiX genome.} \usage{ -isPhiX(seqs, wordSize = 16, minMatches = 2, nonOverlapping = TRUE, - ...) +isPhiX(seqs, wordSize = 16, minMatches = 2, nonOverlapping = TRUE, ...) } \arguments{ \item{seqs}{(Required). A \code{character} vector of A/C/G/T sequences.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/isShiftDenovo.Rd r-bioc-dada2-1.18.0+dfsg/man/isShiftDenovo.Rd --- r-bioc-dada2-1.16.0+dfsg/man/isShiftDenovo.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/isShiftDenovo.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -5,8 +5,7 @@ \title{Identify sequences that are identical to a more abundant sequence up to an overall shift.} \usage{ -isShiftDenovo(unqs, minOverlap = 20, flagSubseqs = FALSE, - verbose = FALSE) +isShiftDenovo(unqs, minOverlap = 20, flagSubseqs = FALSE, verbose = FALSE) } \arguments{ \item{unqs}{(Required). A \code{\link{uniques-vector}} or any object that can be coerced diff -Nru r-bioc-dada2-1.16.0+dfsg/man/learnErrors.Rd r-bioc-dada2-1.18.0+dfsg/man/learnErrors.Rd --- r-bioc-dada2-1.16.0+dfsg/man/learnErrors.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/learnErrors.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,10 +4,19 @@ \alias{learnErrors} \title{Learns the error rates from an input list, or vector, of file names or a list of \code{\link{derep-class}} objects.} \usage{ -learnErrors(fls, nbases = 1e+08, nreads = NULL, - errorEstimationFunction = loessErrfun, multithread = FALSE, - randomize = FALSE, MAX_CONSIST = 10, OMEGA_C = 0, - qualityType = "Auto", verbose = FALSE, ...) +learnErrors( + fls, + nbases = 1e+08, + nreads = NULL, + errorEstimationFunction = loessErrfun, + multithread = FALSE, + randomize = FALSE, + MAX_CONSIST = 10, + OMEGA_C = 0, + qualityType = "Auto", + verbose = FALSE, + ... +) } \arguments{ \item{fls}{(Required). \code{character}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/makeTaxonomyFasta_SilvaNR.Rd r-bioc-dada2-1.18.0+dfsg/man/makeTaxonomyFasta_SilvaNR.Rd --- r-bioc-dada2-1.16.0+dfsg/man/makeTaxonomyFasta_SilvaNR.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/makeTaxonomyFasta_SilvaNR.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/taxonomy.R +\name{makeTaxonomyFasta_SilvaNR} +\alias{makeTaxonomyFasta_SilvaNR} +\title{This function creates the dada2 assignTaxonomy training fasta for the official Silva NR99 +release files. If `include.species`=TRUE, a 7th taxonomic level (species) will be added based on the +Genus species binomial in the Silva taxonomy string, if present and valid.} +\usage{ +makeTaxonomyFasta_SilvaNR( + fin, + ftax, + fout, + include.species = FALSE, + compress = TRUE +) +} +\description{ +## Silva release v138 +path <- "~/tax/Silva/v138" +dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"), + file.path(path, "tax_slv_ssu_138.txt"), + "~/Desktop/silva_nr99_v138_train_set.fa.gz") +dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"), + file.path(path, "tax_slv_ssu_138.txt"), + include.species=TRUE, "~/Desktop/silva_nr99_v138_wSpecies_train_set.fa.gz") +} +\keyword{internal} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/makeTaxonomyFasta_Silva.Rd r-bioc-dada2-1.18.0+dfsg/man/makeTaxonomyFasta_Silva.Rd --- r-bioc-dada2-1.16.0+dfsg/man/makeTaxonomyFasta_Silva.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/makeTaxonomyFasta_Silva.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -2,19 +2,21 @@ % Please edit documentation in R/taxonomy.R \name{makeTaxonomyFasta_Silva} \alias{makeTaxonomyFasta_Silva} -\title{This function creates the dada2 assignTaxonomy training fasta for the Silva .align file -generated by the Mothur project.} +\title{DEPRECATED in favor of `makeTaxonomyFasta_SilvaNR``} \usage{ makeTaxonomyFasta_Silva(fin, ftax, fout, compress = TRUE) } \description{ +This function creates the dada2 assignTaxonomy training fasta for the Silva .align file +generated by the Mothur project. +} +\details{ ## Silva release v128 path <- "~/Desktop/Silva/Silva.nr_v128" dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v128.align"), file.path(path, "silva.nr_v128.tax"), "~/tax/silva_nr_v128_train_set.fa.gz") -} -\details{ + ## Silva release v132 path <- "~/Desktop/Silva/Silva.nr_v132" dada2:::makeTaxonomyFasta_Silva(file.path(path, "silva.nr_v132.align"), diff -Nru r-bioc-dada2-1.16.0+dfsg/man/mergePairs.Rd r-bioc-dada2-1.18.0+dfsg/man/mergePairs.Rd --- r-bioc-dada2-1.16.0+dfsg/man/mergePairs.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/mergePairs.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,10 +4,20 @@ \alias{mergePairs} \title{Merge denoised forward and reverse reads.} \usage{ -mergePairs(dadaF, derepF, dadaR, derepR, minOverlap = 12, - maxMismatch = 0, returnRejects = FALSE, - propagateCol = character(0), justConcatenate = FALSE, - trimOverhang = FALSE, verbose = FALSE, ...) +mergePairs( + dadaF, + derepF, + dadaR, + derepR, + minOverlap = 12, + maxMismatch = 0, + returnRejects = FALSE, + propagateCol = character(0), + justConcatenate = FALSE, + trimOverhang = FALSE, + verbose = FALSE, + ... +) } \arguments{ \item{dadaF}{(Required). A \code{\link{dada-class}} object, or a list of such objects. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/mergeSequenceTables.Rd r-bioc-dada2-1.18.0+dfsg/man/mergeSequenceTables.Rd --- r-bioc-dada2-1.16.0+dfsg/man/mergeSequenceTables.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/mergeSequenceTables.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,15 @@ \alias{mergeSequenceTables} \title{Merge two or more sample-by-sequence observation matrices.} \usage{ -mergeSequenceTables(table1 = NULL, table2 = NULL, ..., tables = NULL, - repeats = "error", orderBy = "abundance", tryRC = FALSE) +mergeSequenceTables( + table1 = NULL, + table2 = NULL, + ..., + tables = NULL, + repeats = "error", + orderBy = "abundance", + tryRC = FALSE +) } \arguments{ \item{table1}{(Optional, default=NULL). Named integer matrix. Rownames correspond to samples @@ -47,7 +54,7 @@ \examples{ \dontrun{ - mergetab <- mergeSequenceTables(seqtab1, seqtab2, seqtab3) # unnamed arguments are assumed to be individual sequence tables + mergetab <- mergeSequenceTables(seqtab1, seqtab2, seqtab3) # unnamed arguments assumed to be sequence tables input_tables <- list(seqtab1, seqtab2, seqtab3) mergetab <- mergeSequenceTables(tables=input_tables) # list of sequence tables files <- c(file1, file2, file3) diff -Nru r-bioc-dada2-1.16.0+dfsg/man/names-set-dada-ANY-method.Rd r-bioc-dada2-1.18.0+dfsg/man/names-set-dada-ANY-method.Rd --- r-bioc-dada2-1.16.0+dfsg/man/names-set-dada-ANY-method.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/names-set-dada-ANY-method.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/show-methods.R -\docType{methods} \name{names<-,dada,ANY-method} \alias{names<-,dada,ANY-method} \title{Deactivate renaming of dada-class objects.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/names-set-derep-ANY-method.Rd r-bioc-dada2-1.18.0+dfsg/man/names-set-derep-ANY-method.Rd --- r-bioc-dada2-1.16.0+dfsg/man/names-set-derep-ANY-method.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/names-set-derep-ANY-method.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/show-methods.R -\docType{methods} \name{names<-,derep,ANY-method} \alias{names<-,derep,ANY-method} \title{Deactivate renaming of derep-class objects.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/nwalign.Rd r-bioc-dada2-1.18.0+dfsg/man/nwalign.Rd --- r-bioc-dada2-1.16.0+dfsg/man/nwalign.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/nwalign.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,9 +4,17 @@ \alias{nwalign} \title{Needleman-Wunsch alignment.} \usage{ -nwalign(s1, s2, match = getDadaOpt("MATCH"), - mismatch = getDadaOpt("MISMATCH"), gap = getDadaOpt("GAP_PENALTY"), - homo_gap = NULL, band = -1, endsfree = TRUE, vec = FALSE) +nwalign( + s1, + s2, + match = getDadaOpt("MATCH"), + mismatch = getDadaOpt("MISMATCH"), + gap = getDadaOpt("GAP_PENALTY"), + homo_gap = NULL, + band = -1, + endsfree = TRUE, + vec = FALSE +) } \arguments{ \item{s1}{(Required). \code{character(1)}. The first sequence to align. A/C/G/T only.} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/plotComplexity.Rd r-bioc-dada2-1.18.0+dfsg/man/plotComplexity.Rd --- r-bioc-dada2-1.16.0+dfsg/man/plotComplexity.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/plotComplexity.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,16 @@ \alias{plotComplexity} \title{Plot sequence complexity profile of a fastq file.} \usage{ -plotComplexity(fl, kmerSize = 2, window = NULL, by = 5, n = 1e+05, - bins = 100, aggregate = FALSE, ...) +plotComplexity( + fl, + kmerSize = 2, + window = NULL, + by = 5, + n = 1e+05, + bins = 100, + aggregate = FALSE, + ... +) } \arguments{ \item{fl}{(Required). \code{character}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/plotErrors.Rd r-bioc-dada2-1.18.0+dfsg/man/plotErrors.Rd --- r-bioc-dada2-1.16.0+dfsg/man/plotErrors.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/plotErrors.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,9 +4,15 @@ \alias{plotErrors} \title{Plot observed and estimated error rates.} \usage{ -plotErrors(dq, nti = c("A", "C", "G", "T"), ntj = c("A", "C", "G", - "T"), obs = TRUE, err_out = TRUE, err_in = FALSE, - nominalQ = FALSE) +plotErrors( + dq, + nti = c("A", "C", "G", "T"), + ntj = c("A", "C", "G", "T"), + obs = TRUE, + err_out = TRUE, + err_in = FALSE, + nominalQ = FALSE +) } \arguments{ \item{dq}{(Required). An object from which error rates can be extracted. Valid inputs are diff -Nru r-bioc-dada2-1.16.0+dfsg/man/qtables2.Rd r-bioc-dada2-1.18.0+dfsg/man/qtables2.Rd --- r-bioc-dada2-1.16.0+dfsg/man/qtables2.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/qtables2.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,7 +4,7 @@ \alias{qtables2} \title{Internal tables function} \usage{ -qtables2(x, qeff = FALSE) +qtables2(x, qeff = FALSE, handle.zerolen = TRUE) } \arguments{ \item{x}{ShortReadQ. @@ -12,6 +12,9 @@ \item{qeff}{\code{logical(1)}. Calculate average quality by first transforming to expected error rate.} + +\item{handle.zerolen}{\code{logical(1)}. +Default TRUE. If TRUE, gracefully excludes zero-length sequences.} } \value{ List. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/removePrimers.Rd r-bioc-dada2-1.18.0+dfsg/man/removePrimers.Rd --- r-bioc-dada2-1.16.0+dfsg/man/removePrimers.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/removePrimers.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,9 +4,19 @@ \alias{removePrimers} \title{Removes primers and orients reads in a consistent direction.} \usage{ -removePrimers(fn, fout, primer.fwd, primer.rev = NULL, - max.mismatch = 2, allow.indels = FALSE, trim.fwd = TRUE, - trim.rev = TRUE, orient = TRUE, compress = TRUE, verbose = FALSE) +removePrimers( + fn, + fout, + primer.fwd, + primer.rev = NULL, + max.mismatch = 2, + allow.indels = FALSE, + trim.fwd = TRUE, + trim.rev = TRUE, + orient = TRUE, + compress = TRUE, + verbose = FALSE +) } \arguments{ \item{fn}{(Required). \code{character}. diff -Nru r-bioc-dada2-1.16.0+dfsg/man/show-methods.Rd r-bioc-dada2-1.18.0+dfsg/man/show-methods.Rd --- r-bioc-dada2-1.16.0+dfsg/man/show-methods.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/show-methods.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/show-methods.R -\docType{methods} \name{show,derep-method} \alias{show,derep-method} \alias{show,dada-method} diff -Nru r-bioc-dada2-1.16.0+dfsg/man/tperr1.Rd r-bioc-dada2-1.18.0+dfsg/man/tperr1.Rd --- r-bioc-dada2-1.16.0+dfsg/man/tperr1.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/tperr1.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -3,9 +3,11 @@ \name{tperr1} \alias{tperr1} \title{An empirical error matrix.} -\format{A numerical matrix with 16 rows and 41 columns. +\format{ +A numerical matrix with 16 rows and 41 columns. Rows correspond to the 16 transition (eg. A2A, A2C, ...) - Columns correspond to consensus quality scores 0 to 40.} + Columns correspond to consensus quality scores 0 to 40. +} \description{ A dataset containing the error matrix estimated by fitting a piecewise linear model to the errors observed in the mock community featured in Schirmer 2015 (metaID 35). diff -Nru r-bioc-dada2-1.16.0+dfsg/man/uniquesToFasta.Rd r-bioc-dada2-1.18.0+dfsg/man/uniquesToFasta.Rd --- r-bioc-dada2-1.16.0+dfsg/man/uniquesToFasta.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/uniquesToFasta.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -4,8 +4,7 @@ \alias{uniquesToFasta} \title{Write a uniques vector to a FASTA file} \usage{ -uniquesToFasta(unqs, fout, ids = NULL, mode = "w", width = 20000, - ...) +uniquesToFasta(unqs, fout, ids = NULL, mode = "w", width = 20000, ...) } \arguments{ \item{unqs}{(Required). diff -Nru r-bioc-dada2-1.16.0+dfsg/man/writeFasta.Rd r-bioc-dada2-1.18.0+dfsg/man/writeFasta.Rd --- r-bioc-dada2-1.16.0+dfsg/man/writeFasta.Rd 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/man/writeFasta.Rd 2020-10-27 17:56:28.000000000 +0000 @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/sequenceIO.R -\docType{methods} \name{writeFasta,character-method} \alias{writeFasta,character-method} \title{Writes a named character vector of DNA sequences to a fasta file. Values are the sequences, and names are used for the id lines.} \usage{ -\S4method{writeFasta}{character}(object, file, mode = "w", - width = 20000L, ...) +\S4method{writeFasta}{character}(object, file, mode = "w", width = 20000L, ...) } \arguments{ \item{object}{(Required). A named \code{character} vector.} diff -Nru r-bioc-dada2-1.16.0+dfsg/NAMESPACE r-bioc-dada2-1.18.0+dfsg/NAMESPACE --- r-bioc-dada2-1.16.0+dfsg/NAMESPACE 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/NAMESPACE 2020-10-27 17:56:28.000000000 +0000 @@ -46,6 +46,7 @@ importFrom(Biostrings,DNAStringSet) importFrom(Biostrings,PDict) importFrom(Biostrings,end) +importFrom(Biostrings,matchPattern) importFrom(Biostrings,oligonucleotideFrequency) importFrom(Biostrings,quality) importFrom(Biostrings,readDNAStringSet) diff -Nru r-bioc-dada2-1.16.0+dfsg/NEWS r-bioc-dada2-1.18.0+dfsg/NEWS --- r-bioc-dada2-1.16.0+dfsg/NEWS 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/NEWS 2020-10-27 17:56:28.000000000 +0000 @@ -1,3 +1,29 @@ +CHANGES IN VERSION 1.16.0 +----------------------- + +SIGNIFICANT USER-VISIBLE CHANGES + + o The `dada-class$clustering` data.frame includes a new column `$birth_from` which encodes the cluster (ASV) from which the associated cluster (ASV) was divided (or "born"). + + o Short sequences are now given `NA` assignments at all levels, rather than stopping the `assignTaxonomy` function entirely. + +BUG FIXES + + o All uses of the `class(x) == "y"` idiom have been replaced by `is(x, "y")` to conform to current R best practices and Bioconductor build requirements. + + o A rare error in how the `dada-class$clustering$birth_pval` was calculated has been fixed. + + o Read lengths are now visualized correctly by `plotQualityProfile(..., aggregate=TRUE)`. + + +CHANGES IN VERSION 1.14.1 +----------------------- + +BUG FIXES + + o A segfault bug in `assignTaxonomy` has been fixed. This segfault occurred when multithreading was enabled and multiple exact matches to the query sequence existed in the reference database being queried against. + + CHANGES IN VERSION 1.14.0 ----------------------- @@ -5,7 +31,7 @@ o Directories containing fastq files (possibly compressed) can now be provided to core dada2 functions instead of a character vector of the fastq filenames. This functionality is supported by `filterAndTrim`, `learnErrors`, `dada`, `mergePairs` and `derepFastq`. Note, this feature requires fastqs in the provided directory to have standard file extensions: `.fastq`, `.fastq.gz` or `.fastq.bz2`. - o The new `DETECT_SINGLETONS` option removes the removes the “conditional” in the calculation of probabilties used in the core dada algorithm, which effectively discounts the first read of any novel sequence. In practice, setting `DETECT_SINGLETONS = TRUE` allows singletons to be detected (of course) and also increases sensitivity to other low abundance sequences slightly, i.e. those present in just 2/3/4 reads. Note, we do not generally recommend this option as it will result in a large increase in false positives in typical datasets. Instead we recommend `pool = "pseudo"` or `pool=TRUE` for typical datasets to increase sensitivity to rare sequences with less impact on specificity. + o The new `DETECT_SINGLETONS` option removes the “conditional” in the calculation of probabilties used in the core dada algorithm, which effectively discounts the first read of any novel sequence. In practice, setting `DETECT_SINGLETONS = TRUE` allows singletons to be detected (of course) and also increases sensitivity to other low abundance sequences slightly, i.e. those present in just 2/3/4 reads. Note, we do not generally recommend this option as it will result in a large increase in false positives in typical datasets. Instead we recommend `pool = "pseudo"` or `pool=TRUE` for typical datasets to increase sensitivity to rare sequences with less impact on specificity. SIGNIFICANT USER-VISIBLE CHANGES diff -Nru r-bioc-dada2-1.16.0+dfsg/R/filter.R r-bioc-dada2-1.18.0+dfsg/R/filter.R --- r-bioc-dada2-1.16.0+dfsg/R/filter.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/filter.R 2020-10-27 17:56:28.000000000 +0000 @@ -87,6 +87,7 @@ if(length(fn) != length(fout)) stop("Every input file must have a corresponding output file.") if(allow.indels) message("Primer matching with indels allowed is currently significantly (~4x) slower.") require.fwd <- TRUE; require.rev <- TRUE ### + first.multi.msg <- TRUE odirs <- unique(dirname(fout)) for(odir in odirs) { if(!dir.exists(odir)) { @@ -113,9 +114,11 @@ colnames(rval) <- c("reads.in", "reads.out") rownames(rval) <- basename(fn) for(i in seq_along(fn)) { - # Read in file + # Read in file and init filtering stats fq <- readFastq(fn[[i]]) inseqs <- length(fq) + outseqs <- 0 + rval[i,c("reads.in", "reads.out")] <- c(inseqs, outseqs) # Match patterns if(allow.indels) { # Use slower matchPattern because it supports indels match.fwd <- lapply(sread(fq), function(x) matchPattern(primer.fwd, x, max.mismatch=max.mismatch, with.indels=allow.indels, fixed=fixed.fwd)) @@ -151,10 +154,13 @@ if(has.rev) hits.rev <- sapply(match.rev, length) if(!require.fwd) stop("Currently, only require.fwd=TRUE is supported.") if(has.rev && !require.rev) stop("Currently, only require.rev=TRUE is supported when a reverse primer sequence is provided.") - if(require.fwd && sum(hits.fwd) == 0) stop("No sequences matched the provided forward primer sequence.") - if(has.rev && require.rev && sum(hits.rev) == 0) stop("Reverse primer sequences were provided, but no sequences matched the provided reverse primer sequence.") + if(require.fwd && sum(hits.fwd) == 0) { filt.print(inseqs, outseqs); next } + if(has.rev && require.rev && sum(hits.rev) == 0) { filt.print(inseqs, outseqs); next } if(any(hits.fwd>1) || (has.rev && any(hits.rev>1))) { - if(verbose) message("Multiple matches to the primer(s) in some sequences. Using the longest possible match.") + if(verbose && first.multi.msg) { + message("Multiple matches to the primer(s) in some sequences. Using the longest possible match.") + first.multi.msg <- FALSE + } match.fwd[hits.fwd>1] <- sapply(match.fwd[hits.fwd>1], `[`, 1) if(has.rev) match.rev[hits.rev>1] <- sapply(match.rev[hits.rev>1], function(x) rev(x)[1]) } @@ -162,7 +168,10 @@ hits.fwd.rc <- sapply(match.fwd.rc, length) if(has.rev) hits.rev.rc <- sapply(match.rev.rc, length) if(any(hits.fwd.rc>1) || (has.rev && any(hits.rev.rc>1))) { - if(verbose) message("Multiple matches to the primer(s) in some reverse-complement sequences. Using the longest possible match.") + if(verbose && first.multi.msg) { + message("Multiple matches to the primer(s) in some sequences. Using the longest possible match.") + first.multi.msg <- FALSE + } match.fwd.rc[hits.fwd.rc>1] <- sapply(match.fwd.rc[hits.fwd.rc>1], `[`, 1) if(has.rev) match.rev.rc[hits.rev.rc>1] <- sapply(match.rev.rc[hits.rev.rc>1], function(x) rev(x)[1]) } @@ -213,11 +222,7 @@ writeFastq(fq, fout[[i]], "w", compress=compress) outseqs <- length(fq) rval[i,c("reads.in", "reads.out")] <- c(inseqs, outseqs) - if(verbose) { - outperc <- round(outseqs * 100 / inseqs, 1) - outperc <- paste(" (", outperc, "%)", sep="") - message("Read in ", inseqs, ", output ", outseqs, outperc, " filtered sequences.", sep="") - } + if(verbose) filt.print(inseqs, outseqs) } if(all(rval[,"reads.out"]==0)) { warning("No reads passed the primer detection.") @@ -227,6 +232,12 @@ return(invisible(rval)) } +filt.print <- function(inseqs, outseqs) { + outperc <- round(outseqs * 100 / inseqs, 1) + outperc <- paste(" (", outperc, "%)", sep="") + message("Read in ", inseqs, ", output ", outseqs, outperc, " filtered sequences.", sep="") +} + #' Filter and trim fastq file(s). #' #' Filters and trims an input fastq file(s) (can be compressed) diff -Nru r-bioc-dada2-1.16.0+dfsg/R/paired.R r-bioc-dada2-1.18.0+dfsg/R/paired.R --- r-bioc-dada2-1.16.0+dfsg/R/paired.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/paired.R 2020-10-27 17:56:28.000000000 +0000 @@ -113,8 +113,9 @@ mapR <- getDerep(derepR[[i]])$map if(!(is.integer(mapF) && is.integer(mapR))) stop("Incorrect format of $map in derep-class arguments.") # if(any(is.na(rF)) || any(is.na(rR))) stop("Non-corresponding maps and dada-outputs.") - if(!(length(mapF) == length(mapR) && max(mapF) == length(dadaF[[i]]$map) && - max(mapR) == length(dadaR[[i]]$map))) { + if(!(length(mapF) == length(mapR) && + max(mapF, na.rm=TRUE) == length(dadaF[[i]]$map) && + max(mapR, na.rm=TRUE) == length(dadaR[[i]]$map))) { stop("Non-corresponding derep-class and dada-class objects.") } rF <- dadaF[[i]]$map[mapF] diff -Nru r-bioc-dada2-1.16.0+dfsg/R/plot-methods.R r-bioc-dada2-1.18.0+dfsg/R/plot-methods.R --- r-bioc-dada2-1.16.0+dfsg/R/plot-methods.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/plot-methods.R 2020-10-27 17:56:28.000000000 +0000 @@ -179,7 +179,7 @@ FIRST <- FALSE } else { plotdf <- rbind(plotdf, cbind(df, file=basename(f))) } statdf <- rbind(statdf, data.frame(Cycle=as.integer(rownames(means)), Mean=means, - Q25=as.vector(q25s), Q50=as.vector(q50s), Q75=as.vector(q75s), Cum=10*as.vector(cums)/rc, file=basename(f))) + Q25=as.vector(q25s), Q50=as.vector(q50s), Q75=as.vector(q75s), Cum=10*as.vector(cums)/min(rc, n), file=basename(f))) anndf <- rbind(anndf, data.frame(minScore=min(df$Score), label=basename(f), rclabel=rclabel, rc=rc, file=basename(f))) } anndf$minScore <- min(anndf$minScore) diff -Nru r-bioc-dada2-1.16.0+dfsg/R/RcppExports.R r-bioc-dada2-1.18.0+dfsg/R/RcppExports.R --- r-bioc-dada2-1.16.0+dfsg/R/RcppExports.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/RcppExports.R 2020-10-27 17:56:28.000000000 +0000 @@ -61,10 +61,6 @@ .Call('_dada2_C_nwvec', PACKAGE = 'dada2', s1, s2, match, mismatch, gap_p, band, endsfree) } -C_assign_taxonomy <- function(seqs, rcs, refs, ref_to_genus, genusmat, try_rc, verbose) { - .Call('_dada2_C_assign_taxonomy', PACKAGE = 'dada2', seqs, rcs, refs, ref_to_genus, genusmat, try_rc, verbose) -} - C_assign_taxonomy2 <- function(seqs, rcs, refs, ref_to_genus, genusmat, try_rc, verbose) { .Call('_dada2_C_assign_taxonomy2', PACKAGE = 'dada2', seqs, rcs, refs, ref_to_genus, genusmat, try_rc, verbose) } diff -Nru r-bioc-dada2-1.16.0+dfsg/R/sequenceIO.R r-bioc-dada2-1.18.0+dfsg/R/sequenceIO.R --- r-bioc-dada2-1.16.0+dfsg/R/sequenceIO.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/sequenceIO.R 2020-10-27 17:56:28.000000000 +0000 @@ -39,10 +39,10 @@ #' # Test that chunk-size, `n`, does not affect the result. #' testFastq = system.file("extdata", "sam1F.fastq.gz", package="dada2") #' derep1 = derepFastq(testFastq, verbose = TRUE) -#' derep1.35 = derepFastq(testFastq, 35, TRUE) +#' derep1.35 = derepFastq(testFastq, n = 35, verbose = TRUE) #' all.equal(getUniques(derep1), getUniques(derep1.35)[names(getUniques(derep1))]) #' -derepFastq <- function(fls, n = 1e6, verbose = FALSE, qualityType = "Auto"){ +derepFastq <- function(fls, n = 1e6, verbose = FALSE, qualityType = "Auto") { if(!is.character(fls)) { stop("File paths must be provided in character format.") } if(length(fls)==1 && dir.exists(fls)) { fls <- parseFastqDirectory(fls) } if(!all(file.exists(fls))) { stop("Not all provided files exist.") } @@ -56,7 +56,7 @@ f <- FastqStreamer(fl, n = n) suppressWarnings(fq <- yield(f, qualityType = qualityType)) - out <- qtables2(fq, FALSE) ###ITS + out <- qtables2(fq) ###ITS derepCounts <- out$uniques derepQuals <- out$cum_quals @@ -68,7 +68,7 @@ if(verbose){ message(".", appendLF = FALSE) } - out <- qtables2(fq, FALSE) + out <- qtables2(fq) # Augment quality matrices with NAs as needed to match ncol if(ncol(out$cum_quals) > ncol(derepQuals)) { derepQuals <- cbind(derepQuals, matrix(NA, nrow=nrow(derepQuals), ncol=(ncol(out$cum_quals)-ncol(derepQuals)))) @@ -134,6 +134,9 @@ #' @param qeff \code{logical(1)}. #' Calculate average quality by first transforming to expected error rate. #' +#' @param handle.zerolen \code{logical(1)}. +#' Default TRUE. If TRUE, gracefully excludes zero-length sequences. +#' #' @return List. #' Matches format of derep-class object. #' @@ -144,7 +147,15 @@ #' #' @keywords internal #' -qtables2 <- function(x, qeff = FALSE) { +qtables2 <- function(x, qeff = FALSE, handle.zerolen=TRUE) { + nread <- length(x) + npos <- sum(width(x) > 0) + if(npos == 0) { stop("Only zero-length sequences detected during dereplication.") } + if(handle.zerolen && npos < nread) { # Some zero length sequences in input + warning("Zero-length sequences detected during dereplication. They will be ignored.") + is.pos <- width(x) > 0 + x <- x[is.pos] + } # ranks are lexical rank srt <- srsort(x) # map from rank to sequence/quality/id rnk <- srrank(x) # map from read_i to rank (integer vec of ranks, ties take top rank) @@ -158,6 +169,11 @@ rnk2unqi <- rep(seq(length(uniques)), tab[tab>0]) # map from rank to uniques index map <- rnk2unqi[rnk] # map from read index to unique index + if(handle.zerolen && npos < nread) { # Fix mapping to include NAs for the zero-length input reads + foo <- map + map <- rep(as.integer(NA), nread) + map[is.pos] <- foo + } # do matrices qmat <- as(quality(srt), "matrix") # map from read_i to quality if(qeff) qmat <- 10^(-qmat/10) # Convert to nominal error probability first diff -Nru r-bioc-dada2-1.16.0+dfsg/R/taxonomy.R r-bioc-dada2-1.18.0+dfsg/R/taxonomy.R --- r-bioc-dada2-1.16.0+dfsg/R/taxonomy.R 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/R/taxonomy.R 2020-10-27 17:56:28.000000000 +0000 @@ -465,6 +465,136 @@ width=20000L, compress=compress) } +#' This function creates the dada2 assignTaxonomy training fasta for the official Silva NR99 +#' release files. If `include.species`=TRUE, a 7th taxonomic level (species) will be added based on the +#' Genus species binomial in the Silva taxonomy string, if present and valid. +#' +#' ## Silva release v138 +#' path <- "~/tax/Silva/v138" +#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"), +#' file.path(path, "tax_slv_ssu_138.txt"), +#' "~/Desktop/silva_nr99_v138_train_set.fa.gz") +#' dada2:::makeTaxonomyFasta_SilvaNR(file.path(path, "SILVA_138_SSURef_NR99_tax_silva.fasta.gz"), +#' file.path(path, "tax_slv_ssu_138.txt"), +#' include.species=TRUE, "~/Desktop/silva_nr99_v138_wSpecies_train_set.fa.gz") +#' +#' @importFrom ShortRead readFasta +#' @importFrom ShortRead writeFasta +#' @importFrom ShortRead sread +#' @importFrom Biostrings BStringSet +#' @importFrom utils read.table +#' @keywords internal +makeTaxonomyFasta_SilvaNR <- function(fin, ftax, fout, include.species=FALSE, compress=TRUE) { + xset <- DNAStringSet(readRNAStringSet(fin, format="fasta")) + # taxl: The taxonmic strings or (l)ines associated with each entry. Named by the sequence ID/accession. + taxl <- names(xset) + names(taxl) <- sapply(strsplit(names(xset), "\\s"), `[`, 1) + if(any(duplicated(names(taxl)))) stop("Duplicated sequence IDs detected.") + names(xset) <- names(taxl) + taxl <- gsub("^[A-Za-z0-9.]+\\s", "", taxl) + # Fix Silva- or release-specific errors + taxl <- gsub(";YM;", ";", taxl) # YM bacterial suborder included in 138 Release in error (confirmed by Silva) + ## taxl <- gsub(";Rahnella1", ";Rahnella", taxl) # Rahnella1 genus seems like an error, shares same species w/ Rahnella, + ## But! also in official tax file. Maybe check with Silva on this one. + # taxa: A list of the ordered taxonomic levels corresponding to each reference sequence. Named by the sequence ID/accession. + taxa <- strsplit(taxl, ";") + # Read in the defined Silva taxonomic levels, e.g. Bacteria;Desulfobacterota;Desulfobulbia;Desulfobulbales;Desulfurivibrionaceae; + silva.taxa <- read.table(ftax, sep="\t", col.names=c("Taxon", "V2", "Level", "V4", "V5"), stringsAsFactors=FALSE) + silva.taxa <- silva.taxa[,c("Taxon", "Level")] + # Subset down to Bacteria and Archaea + kingdom <- sapply(strsplit(taxl, ";"), `[`, 1) + taxl.ba <- taxl[kingdom %in% c("Bacteria", "Archaea")] + taxa.ba <- taxa[names(taxl.ba)] + # Create 6-column matrix with Silva taxonomic assignment for each sequence at each level from Kingdom to Genus (NA if no assignment) + taxa.ba.mat <- matrix(sapply(taxa.ba, function(flds) { + c(flds[1], flds[2], flds[3], flds[4], flds[5], flds[6]) + }), ncol=6, byrow=TRUE) + rownames(taxa.ba.mat) <- names(taxl.ba) + # Create 6-column matrix with full Silva taxonomic string at each level for each sequence, from Kingdom to Genus + # Strings will include NA levels if no assignment at that level, e.g. Bacteria;Firmicutes;NA;NA + taxa.ba.mat.string <- matrix("UNDEF", nrow=nrow(taxa.ba.mat), ncol=ncol(taxa.ba.mat)) + rownames(taxa.ba.mat.string) <- names(taxl.ba) + taxa.ba.mat.string[,1] <- paste0(taxa.ba.mat[,1],";") + for(col in seq(2,6)) { + taxa.ba.mat.string[,col] <- paste0(taxa.ba.mat.string[,col-1], taxa.ba.mat[,col],";") + } + if(any(taxa.ba.mat.string == "UNDEF")) stop("Taxon string matrix was not fully initialized.") + # Define the set of valid taxonomic assignment by their appearance in the list of valid Silva taxonomic levels + taxa.ba.mat.is_valid <- matrix(taxa.ba.mat.string %in% silva.taxa$Taxon, ncol=6) + # Update taxa.ba.mat matrix by replacing invalid entries with NAs + taxa.ba.mat[!taxa.ba.mat.is_valid] <- NA + # Also replace "uncultured" taxonomic ranks with NAs (note, uncultured only shows up as the terminal "assigned" rank) + taxa.ba.mat[taxa.ba.mat %in% c("Uncultured", "uncultured")] <- NA + + ######### ADD SPECIES PART HERE ############## + if(include.species) { + # Add the 7th column, which will be the species column + taxa.ba.mat <- cbind(taxa.ba.mat, + matrix(sapply(taxa.ba, `[`, 7), ncol=1, byrow=TRUE)) + # Get validated genus from the matrix + genus <- taxa.ba.mat[,6] + genus <- gsub("Candidatus ", "", genus) + genus <- gsub("\\[", "", genus) + genus <- gsub("\\]", "", genus) + # Get the "binomial" string from the 7th field in the Silva taxonomic annotation + # The "binomial" field is not curated like the other Silva taxonomic levels, and can have varying info + # We assume that the first two words are the Genus species binomial, when there is a valid one in the field + # NOTE: the binomial is actually not always in the 7th field, so this isn't strictly correct. + # the binomial is in the "last" field, which may be <7 when not all the levels down to genus are assigned. + # But we are throwing away everything that doesn't match the genus anyway, so that case + # doesn't need to be handled correctly here. + binom <- taxa.ba.mat[,7] + binom <- gsub("Candidatus ", "", binom) + binom <- gsub("\\[", "", binom) + binom <- gsub("\\]", "", binom) + # Pull out the first two fields, and turn binom into a two column matrix (Genus, species) + binom <- cbind(sapply(strsplit(binom, "\\s"), `[`, 1), + sapply(strsplit(binom, "\\s"), `[`, 2)) + # Identify binomials that match the curated genus + gen.match <- mapply(dada2:::matchGenera, genus, binom[,1], split.glyph="-") + # Identify some other types of invalid species names + is.NA <- apply(binom, 1, function(x) any(is.na(x))) + is.sp <- grepl("sp\\.", binom[,2]) # "sp." is not a valid species name, just a generic + is.endo <- binom[,1] %in% "endosymbiont" | binom[,2] %in% "endosymbiont" + is.uncult <- grepl("[Uu]ncultured", binom[,1]) | grepl("[Uu]ncultured", binom[,2]) + is.unident <- grepl("[Uu]nidentified", binom[,1]) | grepl("[Uu]nidentified", binom[,2]) + # Define the "valid" species, and set invalid species to NA in the taxonomic matrix + valid.spec <- gen.match & !is.NA & !is.sp & !is.endo & !is.uncult & !is.unident + binom[!valid.spec,2] <- NA + taxa.ba.mat[,7] <- binom[,2] + } + # Organize a small number of Eukaryota sequences for outgroup purposes, keeping only the Eukaryota Kingdom taxonomic assignment + set.seed(100); N_EUK <- 100 + euk.keep <- sample(names(taxl)[kingdom %in% "Eukaryota"], N_EUK) + taxa.euk.mat <- matrix("", nrow=N_EUK, ncol=ncol(taxa.ba.mat)) + rownames(taxa.euk.mat) <- euk.keep + taxa.euk.mat[,1] <- "Eukaryota" + taxa.euk.mat[,2:ncol(taxa.euk.mat)] <- NA + + # Now need to make the final training fasta in DADA2 format. + taxa.mat.final <- rbind(taxa.ba.mat, taxa.euk.mat) + taxa.string.final <- apply(taxa.mat.final, 1, function(x) { + tst <- paste(x, collapse=";") + tst <- paste0(tst, ";") + tst <- gsub("NA;", "", tst) + tst + }) + + if(any(is.na(names(taxa.string.final)))) stop("NA names in the final set of taxon strings.") + if(!all(names(taxa.string.final) %in% names(xset))) stop("Some names of the final set of taxon strings don't match sequence names.") + xset.out <- xset[names(taxa.string.final)] + + ## Add some verbose output describing what happened. + cat(length(xset.out), "reference sequences were output.\n") + print(table(taxa.mat.final[,1], useNA="ifany")) + if(include.species) cat(sum(!is.na(taxa.mat.final[,7])), "entries include species names.\n") + + writeFasta(ShortRead(unname(xset.out), BStringSet(taxa.string.final)), fout, + width=20000L, compress=compress) +} + +#' DEPRECATED in favor of `makeTaxonomyFasta_SilvaNR`` +#' #' This function creates the dada2 assignTaxonomy training fasta for the Silva .align file #' generated by the Mothur project. #' diff -Nru r-bioc-dada2-1.16.0+dfsg/src/RcppExports.cpp r-bioc-dada2-1.18.0+dfsg/src/RcppExports.cpp --- r-bioc-dada2-1.16.0+dfsg/src/RcppExports.cpp 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/src/RcppExports.cpp 2020-10-27 17:56:28.000000000 +0000 @@ -234,23 +234,6 @@ return rcpp_result_gen; END_RCPP } -// C_assign_taxonomy -Rcpp::List C_assign_taxonomy(std::vector seqs, std::vector rcs, std::vector refs, std::vector ref_to_genus, Rcpp::IntegerMatrix genusmat, bool try_rc, bool verbose); -RcppExport SEXP _dada2_C_assign_taxonomy(SEXP seqsSEXP, SEXP rcsSEXP, SEXP refsSEXP, SEXP ref_to_genusSEXP, SEXP genusmatSEXP, SEXP try_rcSEXP, SEXP verboseSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type seqs(seqsSEXP); - Rcpp::traits::input_parameter< std::vector >::type rcs(rcsSEXP); - Rcpp::traits::input_parameter< std::vector >::type refs(refsSEXP); - Rcpp::traits::input_parameter< std::vector >::type ref_to_genus(ref_to_genusSEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type genusmat(genusmatSEXP); - Rcpp::traits::input_parameter< bool >::type try_rc(try_rcSEXP); - Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - rcpp_result_gen = Rcpp::wrap(C_assign_taxonomy(seqs, rcs, refs, ref_to_genus, genusmat, try_rc, verbose)); - return rcpp_result_gen; -END_RCPP -} // C_assign_taxonomy2 Rcpp::List C_assign_taxonomy2(std::vector seqs, std::vector rcs, std::vector refs, std::vector ref_to_genus, Rcpp::IntegerMatrix genusmat, bool try_rc, bool verbose); RcppExport SEXP _dada2_C_assign_taxonomy2(SEXP seqsSEXP, SEXP rcsSEXP, SEXP refsSEXP, SEXP ref_to_genusSEXP, SEXP genusmatSEXP, SEXP try_rcSEXP, SEXP verboseSEXP) { @@ -298,7 +281,6 @@ {"_dada2_C_matchRef", (DL_FUNC) &_dada2_C_matchRef, 4}, {"_dada2_C_matrixEE", (DL_FUNC) &_dada2_C_matrixEE, 1}, {"_dada2_C_nwvec", (DL_FUNC) &_dada2_C_nwvec, 7}, - {"_dada2_C_assign_taxonomy", (DL_FUNC) &_dada2_C_assign_taxonomy, 7}, {"_dada2_C_assign_taxonomy2", (DL_FUNC) &_dada2_C_assign_taxonomy2, 7}, {"_dada2_RcppExport_registerCCallable", (DL_FUNC) &_dada2_RcppExport_registerCCallable, 0}, {NULL, NULL, 0} diff -Nru r-bioc-dada2-1.16.0+dfsg/src/Rmain.cpp r-bioc-dada2-1.18.0+dfsg/src/Rmain.cpp --- r-bioc-dada2-1.16.0+dfsg/src/Rmain.cpp 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/src/Rmain.cpp 2020-10-27 17:56:28.000000000 +0000 @@ -192,7 +192,7 @@ // initialize with source and destination FinalSubsParallel(B *b, Sub **subs, Sub **birth_subs, int match, int mismatch, int gap, int homo_gap, - bool use_kmers, int band_size, bool vectorized_alignment, int SSE, bool gapless) + int band_size, bool use_kmers, bool vectorized_alignment, int SSE, bool gapless) : b(b), subs(subs), birth_subs(birth_subs), match(match), mismatch(mismatch), gap(gap), homo_gap(homo_gap), band_size(band_size), use_kmers(use_kmers), vectorized_alignment(vectorized_alignment), SSE(SSE), gapless(gapless) {} @@ -203,7 +203,7 @@ for(std::size_t i=begin;ibi[i]->nraw;r++) { raw = b->bi[i]->raw[r]; - subs[raw->index] = sub_new(b->bi[i]->center, raw, match, mismatch, gap, homo_gap, use_kmers, 1.0, band_size, vectorized_alignment, SSE, gapless); + subs[raw->index] = sub_new(b->bi[i]->center, raw, match, mismatch, gap, homo_gap, false, 1.0, band_size, vectorized_alignment, SSE, gapless); } // Make birth sub for that cluster if(i==0) { birth_subs[i] = NULL; } @@ -222,7 +222,7 @@ // Make subs for members of that cluster for(r=0;rbi[i]->nraw;r++) { raw = bb->bi[i]->raw[r]; - subs[raw->index] = sub_new(bb->bi[i]->center, raw, match, mismatch, gap, homo_gap, use_kmers, 1.0, band_size, vectorized_alignment, SSE, gapless); + subs[raw->index] = sub_new(bb->bi[i]->center, raw, match, mismatch, gap, homo_gap, false, 1.0, band_size, vectorized_alignment, SSE, gapless); } // Make birth sub for that cluster if(i==0) { birth_subs[i] = NULL; } diff -Nru r-bioc-dada2-1.16.0+dfsg/src/taxonomy.cpp r-bioc-dada2-1.18.0+dfsg/src/taxonomy.cpp --- r-bioc-dada2-1.16.0+dfsg/src/taxonomy.cpp 2020-04-27 20:45:33.000000000 +0000 +++ r-bioc-dada2-1.18.0+dfsg/src/taxonomy.cpp 2020-10-27 17:56:28.000000000 +0000 @@ -2,6 +2,8 @@ #include #include #include +#include +#define NBOOT 100 using namespace Rcpp; @@ -29,6 +31,7 @@ return(kmer); } +// Sets to 1 (TRUE) the value of kvec corresponding to each valid kmer index in the provided sequence void tax_kvec(const char *seq, unsigned int k, unsigned char *kvec) { unsigned int i; unsigned int len = strlen(seq); @@ -36,6 +39,7 @@ int kmer = 0; size_t n_kmers = (1 << (2*k)); // 4^k kmers for(i=0;i=0) kmer indices in the provided sequence to karray. Returns number written. unsigned int tax_karray(const char *seq, unsigned int k, int *karray) { unsigned int i, j; int kmer; @@ -61,41 +66,33 @@ j++; } } + std::sort(karray, karray+j); return(j); } -int get_best_genus(int *karray, double *out_logp, unsigned int arraylen, unsigned int n_kmers, unsigned int *genus_kmers, unsigned int ngenus, double *kmer_prior, double *genus_num_plus1) { +int get_best_genus(int *karray, float *out_logp, unsigned int arraylen, unsigned int n_kmers, unsigned int ngenus, float *lgk_probability) { unsigned int pos; - unsigned int *genus_kv; + float *lgk_v; int kmer, g, max_g = -1; - unsigned int log_step = 50; ///! Need log10(ngenus+1) * log_step < 300 (max double ~ 10^308) - double p, logp, max_logp = 1.0; // Init value to be replaced on first iteration + float logp, max_logp = -FLT_MAX; // Init value to be replaced on first iteration double rv; // Dummy random variable unsigned int nmax=0; // Number of times the current max logp has been seen std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() std::uniform_real_distribution<> cunif(0.0, 1.0); - + for(g=0;g sum of logs + // This is the rate limiting step of the entire assignTaxonomy (on query sets of non-trival size) for(pos=0;pos 0 || logp>max_logp) { // Store if new max max_logp = logp; max_g = g; @@ -103,7 +100,6 @@ } else if (max_logp == logp) { // With uniform prob, store if equal to current max nmax++; rv = (double) cunif(gen); - ///! rv = (double) Rcpp::runif(1)[0]; if(rv < 1.0/nmax) { max_g = g; } @@ -113,172 +109,13 @@ return max_g; } -//------------------------------------------------------------------ -// Assigns taxonomy to sequence based on provided ref seqs and corresponding taxonomies. -// Single-threaded only. -// DEPRECATED DEPREECATED DEPRECATED DEPRECATED DEPREECATED DEPRECATED DEPRECATED -// -// [[Rcpp::export]] -Rcpp::List C_assign_taxonomy(std::vector seqs, std::vector rcs, std::vector refs, std::vector ref_to_genus, Rcpp::IntegerMatrix genusmat, bool try_rc, bool verbose) { - size_t i, j, g; - int kmer; - unsigned int k=8; - size_t n_kmers = (1 << (2*k)); - size_t nseq = seqs.size(); - if(nseq == 0) Rcpp::stop("No seqs provided to classify."); - size_t nref = refs.size(); - if(nref != ref_to_genus.size()) Rcpp::stop("Length mismatch between number of references and map to genus."); - size_t ngenus = genusmat.nrow(); - - // Rprintf("Validated and 0-index ref_to_genus map.\n"); - for(i=0;i 0-index - if(ref_to_genus[i]<0 || ref_to_genus[i] >= ngenus) { - Rcpp::stop("Invalid map from references to genus."); - } - } - - // Rprintf("Count seqs in each genus (M_g).\n"); - double *genus_num_plus1 = (double *) calloc(ngenus, sizeof(double)); - if(genus_num_plus1 == NULL) Rcpp::stop("Memory allocation failed."); - for(i=0;i max_arraylen) { max_arraylen = seqlen-k+1; } - } - - // Rprintf("Allocate kmer array to be used by the seqs."); - int *karray = (int *) malloc(max_arraylen * sizeof(int)); - if(karray == NULL) Rcpp::stop("Memory allocation failed."); - int *karray_rc = (int *) malloc(max_arraylen * sizeof(int)); - if(karray_rc == NULL) Rcpp::stop("Memory allocation failed."); - - Rcpp::IntegerVector rval(nseq); - Rcpp::NumericVector unifs; - Rcpp::IntegerMatrix rboot(nseq, genusmat.ncol()); - Rcpp::IntegerMatrix rboot_tax(nseq, 100); - - int max_g, max_g_rc, boot_g; - unsigned int boot, booti, boot_match, arraylen, arraylen_rc; - double logp, logp_rc; - - // Rprintf("Allocate bootstrap array to be used by the seqs.\n"); - int *bootarray = (int *) malloc((max_arraylen/8) * sizeof(int)); - if(bootarray == NULL) Rcpp::stop("Memory allocation failed."); - - // Rprintf("Classify the sequences.\n"); - bool first_warning = true; - for(j=0;j logp) { // rev-comp is better, replace with it - max_g = max_g_rc; - memcpy(karray, karray_rc, arraylen * sizeof(int)); - } - } - - rval(j) = max_g+1; // 1-index for return - - // Generate random indices to be used for subsampling - unifs = Rcpp::runif(100*(arraylen/8)); - booti = 0; - boot_match = 0; - for(boot=0;boot<100;boot++) { - for(i=0;i<(arraylen/8);i++,booti++) { - bootarray[i] = karray[(int) (arraylen*unifs[booti])]; - } - boot_g = get_best_genus(bootarray, &logp, (arraylen/8), n_kmers, genus_kmers, ngenus, kmer_prior, genus_num_plus1); - rboot_tax(j,boot) = boot_g+1; // 1-index for return - for(i=0;i<(genusmat.ncol());i++) { - if(genusmat(boot_g,i) == genusmat(max_g,i)) { - rboot(j,i)++; - } else { - break; - } - } - if(boot_g == max_g) { boot_match++; } - } - } - Rcpp::checkUserInterrupt(); - } - - free(genus_num_plus1); - free(genus_kmers); - free(kmer_prior); - free(ref_kv); - free(karray); - free(karray_rc); - - return(Rcpp::List::create(_["tax"]=rval, _["boot"]=rboot, _["boot_tax"]=rboot_tax)); -} struct AssignParallel : public RcppParallel::Worker { // source data std::vector seqs; std::vector rcs; - double *genus_num_plus1; - unsigned int *genus_kmers; - double *kmer_prior; + float *lgk_probability; int *C_genusmat; double *C_unifs; int *C_rboot; @@ -295,10 +132,10 @@ bool try_rc; // initialize with source and destination - AssignParallel(std::vector seqs, std::vector rcs, double *genus_num_plus1, unsigned int *genus_kmers, - double *kmer_prior, int *C_genusmat, double *C_unifs, int *C_rboot, int *C_rboot_tax, int *C_rval, + AssignParallel(std::vector seqs, std::vector rcs, float *lgk_probability, + int *C_genusmat, double *C_unifs, int *C_rboot, int *C_rboot_tax, int *C_rval, unsigned int k, size_t n_kmers, size_t ngenus, size_t nlevel, unsigned int max_arraylen, bool try_rc) - : seqs(seqs), rcs(rcs), genus_num_plus1(genus_num_plus1), genus_kmers(genus_kmers), kmer_prior(kmer_prior), + : seqs(seqs), rcs(rcs), lgk_probability(lgk_probability), C_genusmat(C_genusmat), C_unifs(C_unifs), C_rboot(C_rboot), C_rboot_tax(C_rboot_tax), C_rval(C_rval), k(k), n_kmers(n_kmers), ngenus(ngenus), nlevel(nlevel), max_arraylen(max_arraylen), try_rc(try_rc) {} @@ -311,7 +148,7 @@ int karray_rc[9999]; int bootarray[9999/8]; double *unifs; - double logp, logp_rc; + float logp, logp_rc; for(std::size_t j=begin;j logp) { // rev-comp is better, replace with it max_g = max_g_rc; memcpy(karray, karray_rc, arraylen * sizeof(int)); @@ -344,12 +181,12 @@ unifs = &C_unifs[j*max_arraylen]; booti = 0; boot_match = 0; - for(boot=0;boot<100;boot++) { + for(boot=0;boot max_arraylen) { max_arraylen = seqlen-k+1; } } // Rprintf("Generate random numbers for bootstrapping."); Rcpp::NumericVector unifs; - unifs = Rcpp::runif(nseq*100*(max_arraylen/8)); + unifs = Rcpp::runif(nseq*NBOOT*(max_arraylen/8)); double *C_unifs = (double *) malloc(unifs.size() * sizeof(double)); //E for(i=0;i