Binary files /tmp/tmpehYfan/wluw9abOvJ/r-bioc-bsgenome-1.56.0/build/vignette.rds and /tmp/tmpehYfan/OPzkFgudhS/r-bioc-bsgenome-1.58.0/build/vignette.rds differ diff -Nru r-bioc-bsgenome-1.56.0/debian/changelog r-bioc-bsgenome-1.58.0/debian/changelog --- r-bioc-bsgenome-1.56.0/debian/changelog 2020-05-20 14:57:08.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/debian/changelog 2020-11-03 12:55:17.000000000 +0000 @@ -1,3 +1,13 @@ +r-bioc-bsgenome (1.58.0-1) unstable; urgency=medium + + * New upstream version + * debhelper-compat 13 (routine-update) + * No tab in license text (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, + Repository-Browse. + + -- Andreas Tille Tue, 03 Nov 2020 13:55:17 +0100 + r-bioc-bsgenome (1.56.0-1) unstable; urgency=medium * Team upload. diff -Nru r-bioc-bsgenome-1.56.0/debian/control r-bioc-bsgenome-1.58.0/debian/control --- r-bioc-bsgenome-1.56.0/debian/control 2020-05-20 14:57:08.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/debian/control 2020-11-03 12:55:17.000000000 +0000 @@ -4,18 +4,18 @@ Section: gnu-r Testsuite: autopkgtest-pkg-r Priority: optional -Build-Depends: debhelper-compat (= 12), +Build-Depends: debhelper-compat (= 13), dh-r, r-base-dev, r-bioc-biocgenerics (>= 0.13.8), r-bioc-s4vectors (>= 0.17.28), r-bioc-iranges (>= 2.13.16), - r-bioc-genomeinfodb (>= 1.23.9), + r-bioc-genomeinfodb (>= 1.25.6), r-bioc-genomicranges (>= 1.31.10), r-bioc-biostrings (>= 2.47.6), r-bioc-rtracklayer (>= 1.39.7), r-cran-matrixstats, - r-bioc-xvector, + r-bioc-xvector (>= 0.29.3), r-bioc-rsamtools Standards-Version: 4.5.0 Vcs-Browser: https://salsa.debian.org/r-pkg-team/r-bioc-bsgenome diff -Nru r-bioc-bsgenome-1.56.0/debian/copyright r-bioc-bsgenome-1.58.0/debian/copyright --- r-bioc-bsgenome-1.56.0/debian/copyright 2020-05-20 14:57:08.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/debian/copyright 2020-11-03 12:55:17.000000000 +0000 @@ -13,9 +13,9 @@ License: Artistic-2.0 License: Artistic-2.0 - The "Artistic License" + The "Artistic License" . - Preamble + Preamble . 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that diff -Nru r-bioc-bsgenome-1.56.0/debian/patches/remove_paragraphs_bound_to_fail_from_vignette.patch r-bioc-bsgenome-1.58.0/debian/patches/remove_paragraphs_bound_to_fail_from_vignette.patch --- r-bioc-bsgenome-1.56.0/debian/patches/remove_paragraphs_bound_to_fail_from_vignette.patch 2020-05-20 14:57:08.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/debian/patches/remove_paragraphs_bound_to_fail_from_vignette.patch 2020-11-03 12:55:17.000000000 +0000 @@ -11,7 +11,7 @@ --- a/vignettes/GenomeSearching.Rnw +++ b/vignettes/GenomeSearching.Rnw -@@ -107,135 +107,7 @@ The BSgenome data package for the ce2 ge +@@ -107,132 +107,7 @@ The BSgenome data package for the ce2 ge available in Bioconductor but they could be added if there is demand for them. @@ -31,17 +31,15 @@ -@ - -When displayed, some basic information about the origin of the --genome is shown (organism, provider, provider version, etc...) +-genome is shown (organism, genome, provider, etc...) -followed by the index of {\it single} sequences and eventually -an additional index of {\it multiple} sequences. -Methods (adequately called {\it accessor methods}) are defined -for individual access to this information: -<>= --organism(genome) --provider(genome) --providerVersion(genome) +-metadata(genome) -seqnames(genome) --mseqnames(genome) +-seqinfo(genome) -@ - -See the man page for the \Rclass{BSgenome} class (\Rfunction{?BSgenome}) @@ -143,12 +141,11 @@ -Note that the length of \Robject{m2[p2 == m2]} should be -equal to \Robject{countPattern(p2, chrI, max.mismatch=0)}. - -- +% DELETED - % --------------------------------------------------------------------------- -@@ -297,158 +169,7 @@ More precisely, here is the analysis we + % --------------------------------------------------------------------------- +@@ -295,158 +170,7 @@ More precisely, here is the analysis we \end{itemize} @@ -183,7 +180,7 @@ - genome <- BSgenome.Celegans.UCSC.ce2 - seqnames <- seqnames(genome) - seqnames_in1string <- paste(seqnames, collapse=", ") -- cat("Target:", providerVersion(genome), +- cat("Target:", metadata(genome)$genome, - "chromosomes", seqnames_in1string, "\n") - append <- FALSE - for (seqname in seqnames) { @@ -308,7 +305,7 @@ % --------------------------------------------------------------------------- -@@ -466,25 +187,8 @@ that actually implements the fast search +@@ -464,25 +188,8 @@ that actually implements the fast search So if you need to reuse the same pattern a high number of times, it's a good idea to convert it {\it before} to pass it to the \Rmethod{matchPattern} or \Rmethod{countPattern} method. @@ -336,7 +333,7 @@ % The above example is Raphael's use case discussed on BioC on Feb 2006. % In Biostrings 1, the equivalent would be: -@@ -528,14 +232,7 @@ repeats with period less than or equal t +@@ -526,14 +233,7 @@ repeats with period less than or equal t For a given package, all the sequences will always have the same number of masks. @@ -352,7 +349,7 @@ The built-in masks are named consistenly across all the BSgenome data packages available in Bioconductor: -@@ -578,128 +275,7 @@ The {\it masked width} is the total numb +@@ -576,128 +276,7 @@ The {\it masked width} is the total numb that are masked and the {\it masked ratio} is the {\it masked width} divided by the length of the sequence. @@ -482,7 +479,7 @@ % --------------------------------------------------------------------------- -@@ -766,16 +342,13 @@ runAnalysis2 <- function(dict0, outfile= +@@ -764,16 +343,13 @@ runAnalysis2 <- function(dict0, outfile= Remember that \Rfunction{matchPDict} only works if all the patterns in the input dictionary have the same length so for this 2nd analysis, we will truncate the patterns in \Robject{ce2dict0} to 15 nucleotides: @@ -504,7 +501,7 @@ % --------------------------------------------------------------------------- -@@ -788,3 +361,4 @@ sessionInfo() +@@ -786,3 +362,4 @@ sessionInfo() \end{document} diff -Nru r-bioc-bsgenome-1.56.0/debian/upstream/metadata r-bioc-bsgenome-1.58.0/debian/upstream/metadata --- r-bioc-bsgenome-1.56.0/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/debian/upstream/metadata 2020-11-03 12:55:17.000000000 +0000 @@ -0,0 +1,5 @@ +--- +Bug-Database: https://github.com/Bioconductor/BSgenome/issues +Bug-Submit: https://github.com/Bioconductor/BSgenome/issues/new +Repository: https://github.com/Bioconductor/BSgenome.git +Repository-Browse: https://github.com/Bioconductor/BSgenome diff -Nru r-bioc-bsgenome-1.56.0/DESCRIPTION r-bioc-bsgenome-1.58.0/DESCRIPTION --- r-bioc-bsgenome-1.56.0/DESCRIPTION 2020-04-27 23:50:24.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/DESCRIPTION 2020-10-27 22:53:54.000000000 +0000 @@ -3,19 +3,22 @@ genomes and their SNPs Description: Infrastructure shared by all the Biostrings-based genome data packages. -Version: 1.56.0 -Encoding: UTF-8 -Author: Hervé Pagès -Maintainer: H. Pagès biocViews: Genetics, Infrastructure, DataRepresentation, SequenceMatching, Annotation, SNP +URL: https://bioconductor.org/packages/BSgenome +BugReports: https://github.com/Bioconductor/BSgenome/issues +Version: 1.58.0 +License: Artistic-2.0 +Encoding: UTF-8 +Author: Hervé Pagès +Maintainer: H. Pagès Depends: R (>= 2.8.0), methods, BiocGenerics (>= 0.13.8), S4Vectors (>= - 0.17.28), IRanges (>= 2.13.16), GenomeInfoDb (>= 1.23.9), + 0.17.28), IRanges (>= 2.13.16), GenomeInfoDb (>= 1.25.6), GenomicRanges (>= 1.31.10), Biostrings (>= 2.47.6), rtracklayer (>= 1.39.7) Imports: methods, utils, stats, matrixStats, BiocGenerics, S4Vectors, - IRanges, XVector, GenomeInfoDb, GenomicRanges, Biostrings, - Rsamtools, rtracklayer + IRanges, XVector (>= 0.29.3), GenomeInfoDb, GenomicRanges, + Biostrings, Rsamtools, rtracklayer Suggests: BiocManager, Biobase, BSgenome.Celegans.UCSC.ce2, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Hsapiens.UCSC.hg38.masked, @@ -26,7 +29,6 @@ TxDb.Mmusculus.UCSC.mm10.knownGene, SNPlocs.Hsapiens.dbSNP144.GRCh38, XtraSNPlocs.Hsapiens.dbSNP144.GRCh38, hgu95av2probe, RUnit -License: Artistic-2.0 LazyLoad: yes Collate: utils.R OnDiskLongTable_old-class.R OnDiskLongTable-class.R OnDiskNamedSequences-class.R SNPlocs-class.R @@ -36,9 +38,9 @@ extractAt-methods.R bsapply.R BSgenomeViews-class.R BSgenome-utils.R export-methods.R BSgenomeForge.R git_url: https://git.bioconductor.org/packages/BSgenome -git_branch: RELEASE_3_11 -git_last_commit: 8364a0f -git_last_commit_date: 2020-04-27 -Date/Publication: 2020-04-27 +git_branch: RELEASE_3_12 +git_last_commit: 3a4926e +git_last_commit_date: 2020-10-27 +Date/Publication: 2020-10-27 NeedsCompilation: no -Packaged: 2020-04-27 23:50:24 UTC; biocbuild +Packaged: 2020-10-27 22:53:54 UTC; biocbuild Binary files /tmp/tmpehYfan/wluw9abOvJ/r-bioc-bsgenome-1.56.0/inst/doc/BSgenomeForge.pdf and /tmp/tmpehYfan/OPzkFgudhS/r-bioc-bsgenome-1.58.0/inst/doc/BSgenomeForge.pdf differ diff -Nru r-bioc-bsgenome-1.56.0/inst/doc/BSgenomeForge.R r-bioc-bsgenome-1.58.0/inst/doc/BSgenomeForge.R --- r-bioc-bsgenome-1.56.0/inst/doc/BSgenomeForge.R 2020-04-27 23:49:28.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/doc/BSgenomeForge.R 2020-10-27 22:52:59.000000000 +0000 @@ -1,7 +1,7 @@ ### R code from vignette source 'BSgenomeForge.Rnw' ################################################### -### code chunk number 1: BSgenomeForge.Rnw:193-196 +### code chunk number 1: BSgenomeForge.Rnw:189-192 ################################################### library(Biostrings) file <- system.file("extdata", "ce2chrM.fa.gz", package="BSgenome") @@ -9,7 +9,7 @@ ################################################### -### code chunk number 2: BSgenomeForge.Rnw:429-440 +### code chunk number 2: BSgenomeForge.Rnw:422-433 ################################################### library(BSgenome) seed_files <- system.file("extdata", "GentlemanLab", package="BSgenome") @@ -25,14 +25,14 @@ ################################################### -### code chunk number 3: BSgenomeForge.Rnw:453-455 (eval = FALSE) +### code chunk number 3: BSgenomeForge.Rnw:446-448 (eval = FALSE) ################################################### ## library(BSgenome) ## forgeBSgenomeDataPkg("path/to/my/seed") ################################################### -### code chunk number 4: BSgenomeForge.Rnw:678-683 +### code chunk number 4: BSgenomeForge.Rnw:671-676 ################################################### library(BSgenome) seed_files <- system.file("extdata", "GentlemanLab", package="BSgenome") @@ -42,14 +42,14 @@ ################################################### -### code chunk number 5: BSgenomeForge.Rnw:698-700 (eval = FALSE) +### code chunk number 5: BSgenomeForge.Rnw:691-693 (eval = FALSE) ################################################### ## library(BSgenome) ## forgeMaskedBSgenomeDataPkg("path/to/my/seed") ################################################### -### code chunk number 6: BSgenomeForge.Rnw:742-743 +### code chunk number 6: BSgenomeForge.Rnw:735-736 ################################################### sessionInfo() diff -Nru r-bioc-bsgenome-1.56.0/inst/doc/BSgenomeForge.Rnw r-bioc-bsgenome-1.58.0/inst/doc/BSgenomeForge.Rnw --- r-bioc-bsgenome-1.56.0/inst/doc/BSgenomeForge.Rnw 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/doc/BSgenomeForge.Rnw 2020-10-27 17:17:33.000000000 +0000 @@ -75,13 +75,6 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -IMPORTANT NOTE: Starting with Bioconductor 2.14, the -\Rfunction{forgeBSgenomeDataPkg} function doesn't handle masks anymore -i.e. now it produces a \term{BSgenome data package} that contains only -the bare sequences. A new function \Rfunction{forgeMaskedBSgenomeDataPkg} -has been added to forge a \term{BSgenome data package} with masked -sequences. This vignette has been updated to reflect these changes. - This vignette describes the process of forging a \term{BSgenome data package}. It is intended for Bioconductor users who want to make a new \term{BSgenome data package}, not for regular users of these packages. @@ -128,9 +121,12 @@ \Rpackage{Biostrings} or \Rpackage{BSgenome} packages. Questions, comments or bug reports about this vignette or about the -BSgenomeForge functions are welcome. Please address them to the author -(\code{hpages@fredhutch.org}) or post them on the Bioconductor support -site \footnote{\url{https://support.bioconductor.org/}}. +BSgenomeForge functions are welcome. Please use the Bioconductor support +site \footnote{\url{https://support.bioconductor.org/}} if you have +questions or need help about forging a \term{BSgenome data package}, +or about anything related to the \Rpackage{BSgenome} package. For bug +reports or feature requests, please open an issue on GitHub at +\url{https://github.com/Bioconductor/BSgenome/issues}. In the next section (``How to forge a BSgenome data package with bare sequences''), we describe how to forge a \term{BSgenome data package} @@ -318,27 +314,24 @@ the \textit{List of UCSC genome releases} table \footnote{\url{http://genome.ucsc.edu/FAQ/FAQreleases\#release1}}. -\item \code{provider}: The provider of the \term{sequence data files} e.g. - \code{UCSC}, \code{NCBI}, \code{BDGP}, \code{FlyBase}, etc... - Should preferably match part 3 of the package name (field \code{Package}). - -\item \code{provider\_version}: The provider-side version of the genome. +\item \code{genome}: The name of the genome. Typically the name of an NCBI + assembly (e.g. \code{GRCh38.p12}, \code{WBcel235}, \code{TAIR10.1}, + \code{ARS-UCD1.2}, etc...) or UCSC genome (e.g. \code{hg38}, + \code{bosTau9}, \code{galGal6}, \code{ce11}, etc...). Should preferably match part 4 of the package name (field \code{Package}). For the packages built by the Bioconductor project from a UCSC genome, this field corresponds to the \code{UCSC VERSION} field of the \textit{List of UCSC genome releases} table. +\item \code{provider}: The provider of the \term{sequence data files} e.g. + \code{UCSC}, \code{NCBI}, \code{BDGP}, \code{FlyBase}, etc... + Should preferably match part 3 of the package name (field \code{Package}). + \item \code{release\_date}: When this assembly of the genome was released. For the packages built by the Bioconductor project from a UCSC genome, this field corresponds to the \code{RELEASE DATE} field of the \textit{List of UCSC genome releases} table. -\item \code{release\_name}: The release name or build number of this assembly - of the genome. - For the packages built by the Bioconductor project from a UCSC genome, - this field corresponds to the \code{RELEASE NAME} field of the - \textit{List of UCSC genome releases} table. - \item \code{source\_url}: The permanent URL where the \term{sequence data files} used to forge the \term{target package} can be found. Binary files /tmp/tmpehYfan/wluw9abOvJ/r-bioc-bsgenome-1.56.0/inst/doc/GenomeSearching.pdf and /tmp/tmpehYfan/OPzkFgudhS/r-bioc-bsgenome-1.58.0/inst/doc/GenomeSearching.pdf differ diff -Nru r-bioc-bsgenome-1.56.0/inst/doc/GenomeSearching.R r-bioc-bsgenome-1.58.0/inst/doc/GenomeSearching.R --- r-bioc-bsgenome-1.56.0/inst/doc/GenomeSearching.R 2020-04-27 23:50:23.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/doc/GenomeSearching.R 2020-10-27 22:53:54.000000000 +0000 @@ -18,11 +18,9 @@ ################################################### ### code chunk number 3: b3 ################################################### -organism(genome) -provider(genome) -providerVersion(genome) +metadata(genome) seqnames(genome) -mseqnames(genome) +seqinfo(genome) ################################################### @@ -123,7 +121,7 @@ genome <- BSgenome.Celegans.UCSC.ce2 seqnames <- seqnames(genome) seqnames_in1string <- paste(seqnames, collapse=", ") - cat("Target:", providerVersion(genome), + cat("Target:", metadata(genome)$genome, "chromosomes", seqnames_in1string, "\n") append <- FALSE for (seqname in seqnames) { @@ -351,7 +349,7 @@ runOneStrandAnalysis <- function(dict0, bsgenome, seqnames, strand, outfile="", append=FALSE) { - cat("\nTarget: strand", strand, "of", providerVersion(bsgenome), + cat("\nTarget: strand", strand, "of", metadata(bsgenome)$genome, "chromosomes", paste(seqnames, collapse=", "), "\n") if (strand == "-") dict0 <- reverseComplement(dict0) diff -Nru r-bioc-bsgenome-1.56.0/inst/doc/GenomeSearching.Rnw r-bioc-bsgenome-1.58.0/inst/doc/GenomeSearching.Rnw --- r-bioc-bsgenome-1.56.0/inst/doc/GenomeSearching.Rnw 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/doc/GenomeSearching.Rnw 2020-10-27 17:17:33.000000000 +0000 @@ -123,17 +123,15 @@ @ When displayed, some basic information about the origin of the -genome is shown (organism, provider, provider version, etc...) +genome is shown (organism, genome, provider, etc...) followed by the index of {\it single} sequences and eventually an additional index of {\it multiple} sequences. Methods (adequately called {\it accessor methods}) are defined for individual access to this information: <>= -organism(genome) -provider(genome) -providerVersion(genome) +metadata(genome) seqnames(genome) -mseqnames(genome) +seqinfo(genome) @ See the man page for the \Rclass{BSgenome} class (\Rfunction{?BSgenome}) @@ -328,7 +326,7 @@ genome <- BSgenome.Celegans.UCSC.ce2 seqnames <- seqnames(genome) seqnames_in1string <- paste(seqnames, collapse=", ") - cat("Target:", providerVersion(genome), + cat("Target:", metadata(genome)$genome, "chromosomes", seqnames_in1string, "\n") append <- FALSE for (seqname in seqnames) { @@ -737,7 +735,7 @@ runOneStrandAnalysis <- function(dict0, bsgenome, seqnames, strand, outfile="", append=FALSE) { - cat("\nTarget: strand", strand, "of", providerVersion(bsgenome), + cat("\nTarget: strand", strand, "of", metadata(bsgenome)$genome, "chromosomes", paste(seqnames, collapse=", "), "\n") if (strand == "-") dict0 <- reverseComplement(dict0) diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-seed r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-seed --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-seed 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-seed 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,16 @@ +Package: BSgenome.Athaliana.TAIR.TAIR10.1 +Title: Full genome sequences for Arabidopsis thaliana (TAIR10.1) +Description: Full genome sequences for Arabidopsis thaliana as provided by TAIR (TAIR10.1, RefSeq assembly accession: GCF_000001735.4) and stored in Biostrings objects. +Version: 1.5.0 +organism: Arabidopsis thaliana +common_name: Thale cress +genome: TAIR10.1 +provider: TAIR +release_date: 2018/03/15 +source_url: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001735.4 +organism_biocview: Arabidopsis_thaliana +BSgenomeObjname: Athaliana +SrcDataFiles: GCF_000001735.4_TAIR10.1_genomic.fna.gz from https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/ +PkgExamples: genome[["1"]] +seqs_srcdir: /home/hpages/BSgenomeForge/srcdata/BSgenome.Athaliana.TAIR.TAIR10.1/seqs +ondisk_seq_format: rds diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-tools/splitbigfasta.R r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-tools/splitbigfasta.R --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-tools/splitbigfasta.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Athaliana.TAIR.TAIR10.1-tools/splitbigfasta.R 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,19 @@ +### +library(Biostrings) +library(GenomeInfoDb) + +### Download GCF_000001735.4_TAIR10.1_genomic.fna.gz from +### https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/ +TAIR10.1 <- readDNAStringSet("GCF_000001735.4_TAIR10.1_genomic.fna.gz") +current_RefSeqAccn <- unlist(heads(strsplit(names(TAIR10.1), " ", fixed=TRUE), n=1L)) +chrominfo <- getChromInfoFromNCBI("TAIR10.1") +expected_RefSeqAccn <- chrominfo[ , "RefSeqAccn"] +stopifnot(identical(expected_RefSeqAccn, current_RefSeqAccn)) +names(TAIR10.1) <- chrominfo[ , "SequenceName"] + +for (i in seq_along(TAIR10.1)) { + filename <- paste0(names(TAIR10.1)[[i]], ".fa") + cat("writing ", filename, "\n", sep="") + writeXStringSet(TAIR10.1[i], file=filename, width=50L) +} + diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-seed r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-seed --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-seed 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-seed 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,17 @@ +Package: BSgenome.Gmellonella.NCBI.ASM364042v2 +Title: Full genome sequences for Galleria mellonella (ASM364042v2) +Description: Full genome sequences for Galleria mellonella as provided by NCBI (assembly ASM364042v2, assembly accession GCF_003640425.2) and stored in Biostrings objects. +Version: 1.5.0 +organism: Galleria mellonella +common_name: Greater wax moth +genome: ASM364042v2 +provider: NCBI +release_date: 2019/11/06 +source_url: https://www.ncbi.nlm.nih.gov/assembly/GCF_003640425.2 +organism_biocview: Galleria_mellonella +BSgenomeObjname: Gmellonella +circ_seqs: "NC_028532.1" +SrcDataFiles: GCF_003640425.2_ASM364042v2_genomic.fna.gz from https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/640/425/GCF_003640425.2_ASM364042v2/ +PkgExamples: genome[["NC_028532.1"]] +seqs_srcdir: /home/hpages/BSgenomeForge/srcdata/BSgenome.Gmellonella.NCBI.ASM364042v2/seqs +seqfile_name: ASM364042v2.sorted.2bit diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-tools/fasta_to_sorted_2bit.R r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-tools/fasta_to_sorted_2bit.R --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-tools/fasta_to_sorted_2bit.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Gmellonella.NCBI.ASM364042v2-tools/fasta_to_sorted_2bit.R 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,26 @@ +### +library(Biostrings) + +### Download GCF_003640425.2_ASM364042v2_genomic.fna.gz from +### https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/640/425/GCF_003640425.2_ASM364042v2/ +dna <- readDNAStringSet("GCF_003640425.2_ASM364042v2_genomic.fna.gz") + +### Check seqnames. +current_RefSeqAccn <- unlist(heads(strsplit(names(dna), " ", fixed=TRUE), n=1L)) +library(GenomeInfoDb) +chrominfo <- getChromInfoFromNCBI("GCF_003640425.2") +expected_RefSeqAccn <- chrominfo[ , "RefSeqAccn"] +stopifnot(setequal(expected_RefSeqAccn, current_RefSeqAccn)) + +### Reorder sequences. +dna <- dna[match(expected_RefSeqAccn, current_RefSeqAccn)] + +### Rename sequences. An alternative would be to rename them to +### chrominfo[ , "SequenceName"] but these names are VERY ugly (e.g. +### "ScRZk8e_1;HRSCAF=1"). +names(dna) <- expected_RefSeqAccn + +### Export as 2bit. +library(rtracklayer) +export(dna, "ASM364042v2.sorted.2bit") + diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan1-seed r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan1-seed --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan1-seed 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan1-seed 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,17 @@ +Package: BSgenome.Ppaniscus.UCSC.panPan1 +Title: Full genome sequences for Pan paniscus (UCSC version panPan1) +Description: Full genome sequences for Pan paniscus (Bonobo) as provided by UCSC (panPan1, May 2012) and stored in Biostrings objects. +Version: 1.4.3 +organism: Pan paniscus +common_name: Bonobo +provider: UCSC +provider_version: panPan1 +release_date: May 2012 +release_name: Max-Planck Institute panpan1 +source_url: https://hgdownload.cse.ucsc.edu/goldenPath/panPan1/bigZips/ +organism_biocview: Pan_paniscus +BSgenomeObjname: Ppaniscus +SrcDataFiles: panPan1.2bit from https://hgdownload.cse.ucsc.edu/goldenPath/panPan1/bigZips/ +PkgExamples: genome$AJFE01000001 # same as genome[["AJFE01000001"]] +seqs_srcdir: /home/hpages/BSgenomeForge/srcdata/BSgenome.Ppaniscus.UCSC.panPan1/seqs +seqfile_name: panPan1.2bit diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan2-seed r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan2-seed --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan2-seed 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/BSgenome.Ppaniscus.UCSC.panPan2-seed 2020-10-27 17:17:33.000000000 +0000 @@ -0,0 +1,17 @@ +Package: BSgenome.Ppaniscus.UCSC.panPan2 +Title: Full genome sequences for Pan paniscus (UCSC version panPan2) +Description: Full genome sequences for Pan paniscus (Bonobo) as provided by UCSC (panPan2, Dec. 2015) and stored in Biostrings objects. +Version: 1.4.3 +organism: Pan paniscus +common_name: Bonobo +provider: UCSC +provider_version: panPan2 +release_date: Dec. 2015 +release_name: Max-Planck Institute for Evolutionary Anthropology panpan1.1 +source_url: https://hgdownload.cse.ucsc.edu/goldenPath/panPan2/bigZips/ +organism_biocview: Pan_paniscus +BSgenomeObjname: Ppaniscus +SrcDataFiles: panPan2.2bit from https://hgdownload.cse.ucsc.edu/goldenPath/panPan2/bigZips/ +PkgExamples: genome$chr1 # same as genome[["chr1"]] +seqs_srcdir: /home/hpages/BSgenomeForge/srcdata/BSgenome.Ppaniscus.UCSC.panPan2/seqs +seqfile_name: panPan2.2bit diff -Nru r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/ForgeAll.R r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/ForgeAll.R --- r-bioc-bsgenome-1.56.0/inst/extdata/GentlemanLab/ForgeAll.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/extdata/GentlemanLab/ForgeAll.R 2020-10-27 17:17:33.000000000 +0000 @@ -12,6 +12,7 @@ "BSgenome.Amellifera.UCSC.apiMel2", "BSgenome.Athaliana.TAIR.04232008", "BSgenome.Athaliana.TAIR.TAIR9", + "BSgenome.Athaliana.TAIR.TAIR10.1", "BSgenome.Btaurus.UCSC.bosTau3", "BSgenome.Btaurus.UCSC.bosTau4", "BSgenome.Btaurus.UCSC.bosTau6", @@ -35,6 +36,7 @@ "BSgenome.Dvirilis.Ensembl.dvircaf1", "BSgenome.Ecoli.NCBI.20080805", "BSgenome.Gaculeatus.UCSC.gasAcu1", + "BSgenome.Gmellonella.NCBI.ASM364042v2", "BSgenome.Ggallus.UCSC.galGal3", "BSgenome.Ggallus.UCSC.galGal4", "BSgenome.Ggallus.UCSC.galGal5", @@ -54,6 +56,8 @@ "BSgenome.Mmusculus.UCSC.mm8", "BSgenome.Mmusculus.UCSC.mm9", "BSgenome.Mmusculus.UCSC.mm10", + "BSgenome.Ppaniscus.UCSC.panPan1", + "BSgenome.Ppaniscus.UCSC.panPan2", "BSgenome.Ptroglodytes.UCSC.panTro2", "BSgenome.Ptroglodytes.UCSC.panTro3", "BSgenome.Ptroglodytes.UCSC.panTro5", diff -Nru r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/DESCRIPTION r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/DESCRIPTION --- r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/DESCRIPTION 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/DESCRIPTION 2020-10-27 17:17:33.000000000 +0000 @@ -4,15 +4,14 @@ Version: @PKGVERSION@ Author: @AUTHOR@ Maintainer: @MAINTAINER@ -Depends: BSgenome (>= @BSGENOMEVERSION@) +Depends: R (>= 3.5.0), BSgenome (>= @BSGENOMEVERSION@) Imports: BSgenome Suggests: @SUGGESTS@ License: @LICENSE@ organism: @ORGANISM@ common_name: @COMMONNAME@ +genome: @GENOME@ provider: @PROVIDER@ -provider_version: @PROVIDERVERSION@ release_date: @RELEASEDATE@ -release_name: @RELEASENAME@ source_url: @SOURCEURL@ biocViews: AnnotationData, Genetics, BSgenome, @ORGANISMBIOCVIEW@ diff -Nru r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/NAMESPACE r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/NAMESPACE --- r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/NAMESPACE 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/NAMESPACE 2020-10-27 17:17:33.000000000 +0000 @@ -1,9 +1,9 @@ import(BSgenome) -### Don't export @PKGNAME@ or @BSGENOMEOBJNAME@ (the new and old names of the -### BSgenome object defined in this package): the object is created and its 2 -### names are dynamically exported at load time (refer to R/zzz.R for the -### details). +### Don't export @PKGNAME@ or @BSGENOMEOBJNAME@ (the new and +### old names of the BSgenome object defined in this package): the object +### is created and its 2 names are dynamically exported at load time (refer +### to R/zzz.R for the details). #export(@PKGNAME@) #export(@BSGENOMEOBJNAME@) diff -Nru r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/R/zzz.R r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/R/zzz.R --- r-bioc-bsgenome-1.56.0/inst/pkgtemplates/BSgenome_datapkg/R/zzz.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/pkgtemplates/BSgenome_datapkg/R/zzz.R 2020-10-27 17:17:33.000000000 +0000 @@ -21,10 +21,9 @@ bsgenome <- BSgenome( organism="@ORGANISM@", common_name="@COMMONNAME@", + genome="@GENOME@", provider="@PROVIDER@", - provider_version="@PROVIDERVERSION@", release_date="@RELEASEDATE@", - release_name="@RELEASENAME@", source_url="@SOURCEURL@", seqnames=.seqnames, circ_seqs=.circ_seqs, diff -Nru r-bioc-bsgenome-1.56.0/inst/pkgtemplates/MaskedBSgenome_datapkg/DESCRIPTION r-bioc-bsgenome-1.58.0/inst/pkgtemplates/MaskedBSgenome_datapkg/DESCRIPTION --- r-bioc-bsgenome-1.56.0/inst/pkgtemplates/MaskedBSgenome_datapkg/DESCRIPTION 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/inst/pkgtemplates/MaskedBSgenome_datapkg/DESCRIPTION 2020-10-27 17:17:33.000000000 +0000 @@ -10,9 +10,8 @@ License: @LICENSE@ organism: @ORGANISM@ common_name: @COMMONNAME@ +genome: @GENOME@ provider: @PROVIDER@ -provider_version: @PROVIDERVERSION@ release_date: @RELEASEDATE@ -release_name: @RELEASENAME@ source_url: @SOURCEURL@ biocViews: AnnotationData, Genetics, BSgenome, @ORGANISMBIOCVIEW@ diff -Nru r-bioc-bsgenome-1.56.0/man/available.genomes.Rd r-bioc-bsgenome-1.58.0/man/available.genomes.Rd --- r-bioc-bsgenome-1.56.0/man/available.genomes.Rd 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/man/available.genomes.Rd 2020-10-27 17:17:33.000000000 +0000 @@ -38,9 +38,11 @@ } \item{genome}{ A \link{BSgenome} object, or the full name of an installed BSgenome - data package, or a short string specifying a genome assembly (a.k.a. - provider version) that refers unambiguously to an installed BSgenome - data package. + data package, or a short string specifying the name of an NCBI assembly + (e.g. \code{"GRCh38"}, \code{"TAIR10.1"}, etc...) or UCSC genome (e.g. + \code{"hg38"}, \code{"bosTau9"}, \code{"galGal6"}, \code{"ce11"}, etc...). + The supplied short string must refer unambiguously to an installed + BSgenome data package. } \item{masked}{ \code{TRUE} or \code{FALSE}. Whether to search for the \emph{masked} @@ -75,11 +77,10 @@ genome sequences. We formally refer to it as the \emph{provider} of the genome. E.g. \code{UCSC}, \code{NCBI}, \code{TAIR}, etc... - \item The 4th part is the release string or number used by this - organisation for this particular genome assembly. We formally - refer to it as the \emph{provider version} of the genome. - E.g. \code{hg38}, \code{GRCh38}, \code{hg19}, \code{mm10}, - \code{susScr11}, etc... + \item The 4th part is a short string specifying the name of an NCBI + assembly (e.g. \code{GRCh38}, \code{TAIR10.1}, etc...) or UCSC + genome (e.g. \code{hg38}, \code{mm10}, \code{susScr11}, + \code{bosTau9}, \code{galGal6}, \code{ce11}, etc...). \item If the package contains masked sequences, its name has the \code{.masked} suffix added to it, which is typically the 5th part. @@ -98,7 +99,7 @@ \code{installed.genomes}). If \code{splitNameParts=TRUE}, the list of packages is returned in a data frame with one row per package and the following columns: \code{pkgname} (character), \code{organism} (factor), - \code{provider} (factor), \code{provider_version} (character), and + \code{provider} (factor), \code{genome} (character), and \code{masked} (logical). For \code{getBSgenome}: the \link{BSgenome} object containing the sequences @@ -153,15 +154,15 @@ genome <- getBSgenome("BSgenome.Celegans.UCSC.ce2") genome -## Specify a genome assembly (a.k.a. provider version): +## Specify a UCSC genome: genome <- getBSgenome("hg38") class(genome) # BSgenome object -providerVersion(genome) +seqinfo(genome) genome$chrM genome <- getBSgenome("hg38", masked=TRUE) class(genome) # MaskedBSgenome object -providerVersion(genome) +seqinfo(genome) genome$chr22 } diff -Nru r-bioc-bsgenome-1.56.0/man/BSgenome-class.Rd r-bioc-bsgenome-1.58.0/man/BSgenome-class.Rd --- r-bioc-bsgenome-1.56.0/man/BSgenome-class.Rd 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/man/BSgenome-class.Rd 2020-10-27 17:17:33.000000000 +0000 @@ -7,14 +7,22 @@ \alias{BSgenome} % Accessors: -\alias{length,BSgenome-method} +\alias{organism,BSgenome-method} +\alias{commonName,BSgenome-method} +\alias{provider,BSgenome-method} +\alias{providerVersion,BSgenome-method} +\alias{releaseDate,BSgenome-method} +\alias{releaseName,BSgenome-method} \alias{sourceUrl} \alias{sourceUrl,BSgenome-method} +\alias{length,BSgenome-method} \alias{mseqnames} \alias{mseqnames,BSgenome-method} \alias{names,BSgenome-method} \alias{masknames} \alias{masknames,BSgenome-method} +\alias{masknames,MaskedBSgenome-method} +\alias{seqnames,BSgenome-method} \alias{seqinfo,BSgenome-method} \alias{seqinfo<-,BSgenome-method} \alias{seqnames<-,BSgenome-method} @@ -24,6 +32,8 @@ % Other methods: \alias{as.list,BSgenome-method} +\alias{coerce,BSgenome,GenomeDescription-method} +\alias{bsgenomeName,BSgenome-method} \alias{show,BSgenome-method} \alias{[[,BSgenome-method} \alias{$,BSgenome-method} @@ -42,16 +52,31 @@ \section{Accessor methods}{ In the code snippets below, \code{x} is a BSgenome object. - Note that, because the BSgenome class contains the - \link[GenomeInfoDb]{GenomeDescription} class, then all the accessor methods - for \link[GenomeInfoDb]{GenomeDescription} objects can also be used on - \code{x}. \describe{ - \item{\code{sourceUrl(x)}}{ - Returns the source URL i.e. the permanent URL to the place where the - FASTA files used to produce the sequences contained in \code{x} can - be found (and downloaded). + \item{\code{metadata(x)}}{ + Returns a named list containing metadata associated with the BSgenome + object. The components of the list are: + \itemize{ + \item \code{organism}: The scientific name of the organism that this + BSgenome object is for. E.g. \code{"Homo sapiens"}, + \code{"Mus musculus"}, \code{"Caenorhabditis elegans"}, etc... + \item \code{common_name}: The common name of the organism that this + BSgenome object is for. E.g. \code{"Human"}, \code{"Mouse"}, + \code{"Worm"}, etc... + \item \code{genome}: The name of the genome. Typically the name of + an NCBI assembly (e.g. \code{"GRCh38.p12"}, \code{"WBcel235"}, + \code{"TAIR10.1"}, \code{"ARS-UCD1.2"}, etc...) or UCSC genome + (e.g. \code{"hg38"}, \code{"bosTau9"}, \code{"galGal6"}, + \code{"ce11"}, etc...). + \item \code{provider}: The provider of this genome. E.g. + \code{"UCSC"}, \code{"BDGP"}, \code{"FlyBase"}, etc... + \item \code{release_date}: The release date of this genome e.g. + \code{"Mar. 2006"}. + \item \code{source_url}: The permanent URL to the place where the + FASTA files used to produce the sequences contained in + \code{x} can be found (and downloaded). + } } \item{\code{seqnames(x)}, \code{seqnames(x) <- value}}{ Gets or sets the names of the single sequences contained in \code{x}. @@ -166,6 +191,8 @@ ## into memory: library(BSgenome.Celegans.UCSC.ce2) +metadata(Celegans) + ## Number of sequences in this genome: length(Celegans) diff -Nru r-bioc-bsgenome-1.56.0/man/BSgenomeForge.Rd r-bioc-bsgenome-1.58.0/man/BSgenomeForge.Rd --- r-bioc-bsgenome-1.56.0/man/BSgenomeForge.Rd 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/man/BSgenomeForge.Rd 2020-10-27 17:17:33.000000000 +0000 @@ -10,7 +10,8 @@ \alias{forgeBSgenomeDataPkg,BSgenomeDataPkgSeed-method} \alias{forgeBSgenomeDataPkg,list-method} \alias{forgeBSgenomeDataPkg,character-method} -\alias{forgeSeqlengthsFile} +\alias{forgeSeqlengthsRdsFile} +\alias{forgeSeqlengthsRdaFile} \alias{forgeSeqFiles} \alias{forgeMasksFiles} @@ -28,14 +29,19 @@ ## Low-level BSgenomeForge functions: -forgeSeqlengthsFile(seqnames, prefix="", suffix=".fa", - seqs_srcdir=".", seqs_destdir=".", verbose=TRUE) +forgeSeqlengthsRdsFile(seqnames, prefix="", suffix=".fa", + seqs_srcdir=".", seqs_destdir=".", + genome=NA_character_, verbose=TRUE) + +forgeSeqlengthsRdaFile(seqnames, prefix="", suffix=".fa", + seqs_srcdir=".", seqs_destdir=".", + genome=NA_character_, verbose=TRUE) -forgeSeqFiles(provider, provider_version, +forgeSeqFiles(provider, genome, seqnames, mseqnames=NULL, seqfile_name=NA, prefix="", suffix=".fa", seqs_srcdir=".", seqs_destdir=".", - ondisk_seq_format=c("2bit", "rda", "fa.rz", "fa"), + ondisk_seq_format=c("2bit", "rds", "rda", "fa.rz", "fa"), verbose=TRUE) forgeMasksFiles(seqnames, nmask_per_seq, @@ -74,11 +80,11 @@ The provider of the \emph{sequence data files} e.g. \code{"UCSC"}, \code{"NCBI"}, \code{"BDGP"}, \code{"FlyBase"}, etc... } - \item{provider_version}{ - The provider-side version of the genome e.g. \code{"hg38"} - or \code{"bosTau9"} if the genome is provided by UCSC, - or \code{"GRCh38.p12"} or \code{"ARS-UCD1.2"} if it's provided - by NCBI. + \item{genome}{ + The name of the genome. Typically the name of an NCBI assembly (e.g. + \code{"GRCh38.p12"}, \code{"WBcel235"}, \code{"TAIR10.1"}, + \code{"ARS-UCD1.2"}, etc...) or UCSC genome (e.g. \code{"hg38"}, + \code{"bosTau9"}, \code{"galGal6"}, \code{"ce11"}, etc...). } \item{seqnames, mseqnames}{ A character vector containing the names of the single (for \code{seqnames}) @@ -98,17 +104,21 @@ serialized objects should be saved. These directories must already exist. - \code{forgeSeqlengthsFile} will produce a single .rda file. + Both \code{forgeSeqlengthsRdsFile} and \code{forgeSeqlengthsRdaFile} + will produce a single \code{.rds} or \code{.rda} file. Both \code{forgeSeqFiles} and \code{forgeMasksFiles} will produce one - .rda file per sequence. + file per sequence (all files being either \code{.rds} or \code{.rda} + files). } \item{ondisk_seq_format}{ Specifies how the single sequences should be stored in the forged package. - Can be \code{"2bit"}, \code{"rda"}, \code{"fa.rz"}, or \code{"fa"}. + Can be \code{"2bit"}, \code{"rds"}, \code{"rda"}, \code{"fa.rz"}, + or \code{"fa"}. If \code{"2bit"} (the default), then all the single sequences are stored in a single twoBit file. - If \code{"rda"}, then each single sequence is stored in a separated - serialized \link[Biostrings]{XString} object (one per single sequence). + If \code{"rds"} or \code{"rda"}, then each single sequence is stored in + a separated serialized \link[Biostrings]{XString} derivative (one per + single sequence). If \code{"fa.rz"} or \code{"fa"}, then all the single sequences are stored in a single FASTA file (compressed in the RAZip format if \code{"fa.rz"}). } @@ -139,31 +149,31 @@ seqs_srcdir <- system.file("extdata", package="BSgenome") seqnames <- c("chrX", "chrM") -## Forge .rda sequence files: +## Forge .2bit sequence files: forgeSeqFiles("UCSC", "ce2", seqnames, prefix="ce2", suffix=".fa.gz", seqs_srcdir=seqs_srcdir, - seqs_destdir=tempdir(), ondisk_seq_format="rda") + seqs_destdir=tempdir(), ondisk_seq_format="2bit") -## Forge .2bit sequence files: +## Forge .rds sequence files: forgeSeqFiles("UCSC", "ce2", seqnames, prefix="ce2", suffix=".fa.gz", seqs_srcdir=seqs_srcdir, - seqs_destdir=tempdir(), ondisk_seq_format="2bit") + seqs_destdir=tempdir(), ondisk_seq_format="rds") ## Sanity checks: library(BSgenome.Celegans.UCSC.ce2) genome <- BSgenome.Celegans.UCSC.ce2 -load(file.path(tempdir(), "chrX.rda")) -stopifnot(genome$chrX == chrX) -load(file.path(tempdir(), "chrM.rda")) -stopifnot(genome$chrM == chrM) - ce2_sequences <- import(file.path(tempdir(), "single_sequences.2bit")) ce2_sequences0 <- DNAStringSet(list(chrX=genome$chrX, chrM=genome$chrM)) -stopifnot(identical(names(ce2_sequences0), names(ce2_sequences)) && +stopifnot(identical(names(ce2_sequences0), names(ce2_sequences)), all(ce2_sequences0 == ce2_sequences)) + +chrX <- readRDS(file.path(tempdir(), "chrX.rds")) +stopifnot(genome$chrX == chrX) +chrM <- readRDS(file.path(tempdir(), "chrM.rds")) +stopifnot(genome$chrM == chrM) } \keyword{manip} diff -Nru r-bioc-bsgenome-1.56.0/man/getSeq-methods.Rd r-bioc-bsgenome-1.58.0/man/getSeq-methods.Rd --- r-bioc-bsgenome-1.56.0/man/getSeq-methods.Rd 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/man/getSeq-methods.Rd 2020-10-27 17:17:33.000000000 +0000 @@ -68,7 +68,6 @@ \code{TRUE} or \code{FALSE}. Should the extracted sequences be returned in a standard character vector? } - \item{...}{Additional arguments. (Currently ignored.)} } \details{ diff -Nru r-bioc-bsgenome-1.56.0/NAMESPACE r-bioc-bsgenome-1.58.0/NAMESPACE --- r-bioc-bsgenome-1.56.0/NAMESPACE 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/NAMESPACE 2020-10-27 17:17:33.000000000 +0000 @@ -110,6 +110,9 @@ getRowsFromOnDiskLongTable, getRowsByIdFromOnDiskLongTable, + ## ODLT_SNPlocs-class.R: + inferRefAndAltAlleles, + ## OldFashionSNPlocs-class.R: newSNPlocs, @@ -136,7 +139,8 @@ writeBSgenomeToTwobit, ## BSgenomeForge.R: - forgeSeqlengthsFile, + forgeSeqlengthsRdsFile, + forgeSeqlengthsRdaFile, forgeSeqFiles, forgeMasksFiles ) @@ -157,7 +161,6 @@ referenceGenome, compatibleGenomes, snpcount, snplocs, snpsBySeqname, snpsByOverlaps, snpsById, - inferRefAndAltAlleles, ## OldFashionSNPlocs-class.R: snpid2loc, snpid2alleles, snpid2grange, diff -Nru r-bioc-bsgenome-1.56.0/R/available.genomes.R r-bioc-bsgenome-1.58.0/R/available.genomes.R --- r-bioc-bsgenome-1.56.0/R/available.genomes.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/R/available.genomes.R 2020-10-27 17:17:33.000000000 +0000 @@ -20,7 +20,7 @@ ans <- data.frame(pkgname=character(0), organism=factor(), provider=factor(), - provider_version=character(0), + genome=character(0), masked=logical(0), stringsAsFactors=FALSE) return(ans) @@ -54,7 +54,7 @@ data.frame(pkgname=pkgs, organism=factor(uparts[idx4 - 2L]), provider=factor(uparts[idx4 - 1L]), - provider_version=uparts[idx4], + genome=uparts[idx4], masked=is_masked, stringsAsFactors=FALSE) } @@ -99,7 +99,7 @@ .stopOnMoreThanOneAvailablePkg <- function(genome, masked, is.source=FALSE) stop("Looks like there is more than one available ", ifelse(masked, "masked ", ""), "BSgenome data package\n", - " that matches genome assembly (a.k.a. provider version): ", + " that matches genome: ", genome, "\n", " You first need to choose one (use 'available.genomes(", ifelse(is.source, "type=\"source\"", ""), ")' to get\n the list). ", @@ -108,18 +108,18 @@ " install(\"\"", ifelse(is.source, ", type=\"source\"", ""), ")") -.getInstalledPkgnameFromProviderVersion <- function(genome, masked=FALSE) +.getInstalledPkgnameFromGenome <- function(genome, masked=FALSE) { ## 1) Search installed packages. inst_pkgs <- installed.genomes(splitNameParts=TRUE) inst_pkgs <- inst_pkgs[inst_pkgs[ , "masked"] == masked, , drop=FALSE] - idx <- which(genome == inst_pkgs[ , "provider_version"]) + idx <- which(genome == inst_pkgs[ , "genome"]) if (length(idx) == 1L) return(inst_pkgs[idx , "pkgname"]) if (length(idx) >= 2L) stop("Looks like you have more than one installed ", ifelse(masked, "masked ", ""), "BSgenome data package\n", - " that matches genome assembly (a.k.a. provider version): ", + " that matches genome: ", genome, "\n", " Please disambiguate by specifying the full name of the ", "package you want\n to use (use 'installed.genomes()' to ", @@ -128,7 +128,7 @@ ## 2) Search available packages. av_pkgs <- available.genomes(splitNameParts=TRUE) av_pkgs <- av_pkgs[av_pkgs[ , "masked"] == masked, ] - idx <- which(genome == av_pkgs[ , "provider_version"]) + idx <- which(genome == av_pkgs[ , "genome"]) if (length(idx) == 1L) { genome <- av_pkgs[idx , "pkgname"] .stopOnAvailablePkg(genome) @@ -140,7 +140,7 @@ if (getOption("pkgType") != "source") { av_srcpkgs <- available.genomes(splitNameParts=TRUE, type="source") av_srcpkgs <- av_srcpkgs[av_srcpkgs[ , "masked"] == masked, ] - idx <- which(genome == av_srcpkgs[ , "provider_version"]) + idx <- which(genome == av_srcpkgs[ , "genome"]) if (length(idx) == 1L) { genome <- av_srcpkgs[idx , "pkgname"] .stopOnAvailablePkg(genome, is.source=TRUE) @@ -152,7 +152,7 @@ ## All searches have failed. stop("Couldn't find a ", ifelse(masked, "masked ", ""), "BSgenome data package ", - "that matches genome assembly\n (a.k.a. provider version): ", + "that matches genome assembly:\n ", genome, "\n\n", " Please use 'available.genomes()' ", "(or 'available.genomes(type=\"source\")')\n", @@ -210,11 +210,10 @@ if (!isSingleString(genome)) stop("'genome' must be a BSgenome object, or the full name of an ", "installed\n BSgenome data package, or a short string ", - "specifying a genome assembly (a.k.a.\n provider version) ", - "that refers unambiguously to an installed BSgenome data\n", - " package") + "specifying a genome assembly that\n refers unambiguously", + "to an installed BSgenome data package") if (!.has_BSgenome_prefix(genome)) { - genome <- .getInstalledPkgnameFromProviderVersion(genome, masked=masked) + genome <- .getInstalledPkgnameFromGenome(genome, masked=masked) } else if (masked) { warning("'masked' is ignored when 'genome' is supplied as ", "a full package name") diff -Nru r-bioc-bsgenome-1.56.0/R/BSgenome-class.R r-bioc-bsgenome-1.58.0/R/BSgenome-class.R --- r-bioc-bsgenome-1.56.0/R/BSgenome-class.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/R/BSgenome-class.R 2020-10-27 17:17:33.000000000 +0000 @@ -4,7 +4,16 @@ setClass("BSgenome", - contains="GenomeDescription", + ## Expected metadata data are: + ## - organism + ## - common_name + ## - genome + ## - provider + ## - release_date + ## - source_url: permanent URL to the place where the 2bit and/or FASTA + ## files used to produce the "single" and "multiple" sequences can be + ## found (and downloaded). + contains="Annotated", representation( ## Name of the BSgenome data package where the BSgenome object is ## defined. @@ -14,15 +23,11 @@ single_sequences="OnDiskNamedSequences", multiple_sequences="RdaCollection", - ## Permanent URL to the place where the FASTA files that were used - ## to produce the "single" and "multiple" sequences above can be - ## found (and downloaded). - source_url="character", + ## Original seqinfo. + seqinfo="Seqinfo", ## Named vector representing the translation table from the original - ## seqnames (as stored in self@seqinfo@seqnames, the 'seqinfo' slot - ## being inherited from the GenomeDescription class) to the user - ## seqnames. + ## seqnames (as stored in self@seqinfo@seqnames) to the user seqnames. user_seqnames="character", ## For SNPs injection. @@ -30,13 +35,7 @@ ## Used for caching the single and multiple on-disk sequences. .seqs_cache="environment", - .link_counts="environment", - - ## TODO: Remove the 2 slots below once all the BSgenome data packages - ## with masks have been split in 2 packages (naked BSgenome + - ## MaskedBSgenome). - nmask_per_seq="integer", - masks="RdaCollection" + .link_counts="environment" ) ) @@ -137,13 +136,42 @@ ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### Accessors. +### Accessors ### -setMethod("length", "BSgenome", function(x) length(names(x))) - +### Metadata. +setMethod("organism", "BSgenome", + function(object) metadata(object)$organism +) +setMethod("commonName", "BSgenome", + function(object) metadata(object)$common_name +) +setMethod("provider", "BSgenome", function(x) metadata(x)$provider) +setMethod("providerVersion", "BSgenome", + function(x) + { + msg <- c("Using providerVersion() on a ", class(x), " object ", + "is deprecated. Please use 'metadata(x)$genome' instead.") + .Deprecated(msg=c(" ", wmsg(msg))) + metadata(x)$genome + } +) +setMethod("releaseDate", "BSgenome", function(x) metadata(x)$release_date) +setMethod("releaseName", "BSgenome", + function(x) + { + msg <- c("Starting with Bioconductor 3.12, BSgenome objects no ", + "longer have a \"release name\". As a consequence of ", + "this change calling releaseName() on a BSgenome object ", + "now always returns NA and is deprecated.") + .Deprecated(msg=c(" ", wmsg(msg))) + NA_character_ + } +) setGeneric("sourceUrl", function(x) standardGeneric("sourceUrl")) -setMethod("sourceUrl", "BSgenome", function(x) x@source_url) +setMethod("sourceUrl", "BSgenome", function(x) metadata(x)$source_url) + +setMethod("length", "BSgenome", function(x) length(names(x))) setGeneric("mseqnames", function(x) standardGeneric("mseqnames")) setMethod("mseqnames", "BSgenome", @@ -159,13 +187,14 @@ setMethod("names", "BSgenome", function(x) c(seqnames(x), mseqnames(x))) setGeneric("masknames", function(x) standardGeneric("masknames")) -setMethod("masknames", "BSgenome", + +setMethod("masknames", "BSgenome", function(x) NULL) + +setMethod("masknames", "MaskedBSgenome", function(x) { - if (x@nmask_per_seq == 0L) - return(NULL) - ## TODO: Put this kind of checking in a validity method for BSgenome - ## objects (that's what validity methods are for). + ## TODO: Put this kind of checking in a validity method for + ## MaskedBSgenome objects (that's what validity methods are for). if (x@nmask_per_seq > length(BUILTIN_MASKNAMES)) stop("internal anomaly: x@nmask_per_seq > ", length(BUILTIN_MASKNAMES)) @@ -178,62 +207,69 @@ ### seqinfo() accessor and related. ### +setMethod("seqnames", "BSgenome", function(x) unname(x@user_seqnames)) + setMethod("seqinfo", "BSgenome", function(x) { ans <- x@seqinfo - seqlevels(ans) <- x@user_seqnames + seqlevels(ans) <- seqnames(x) ans } ) -### This is a restricted "seqinfo<-" method for BSgenome objects that -### only supports replacement of the sequence names, i.e., except for their -### sequence names, Seqinfo objects 'value' and 'seqinfo(x)' must be identical. -setReplaceMethod("seqinfo", "BSgenome", +### We implement a restricted seqinfo() setter for BSgenome object 'x' that +### supports altering **only** the seqlevels and/or genome of 'seqinfo(x)'. +### It does NOT allow subsetting 'seqinfo(x)' (by dropping/reordering some +### of its seqlevels), or altering its seqlengths or circularity flags! +### In other words, except for their seqnames() and genome(), Seqinfo +### objects 'new_seqinfo' and 'old_seqinfo' must be identical. This is all +### we need to make the seqlevelsStyle() setter work on a BSgenome object. +.check_new2old_and_new_seqinfo <- + function(new2old, new_seqinfo, old_seqinfo, context="") +{ + if (length(new_seqinfo) != length(old_seqinfo)) + stop(wmsg("the supplied 'seqinfo' must have the same ", + "length as the current 'seqinfo'", context)) + if (!(is.null(new2old) || identical(new2old, seq_along(new_seqinfo)))) + stop(wmsg("'new2old' can only be set to NULL or ", + "'seq_along(seqinfo(x))'", context)) + seqnames(old_seqinfo) <- seqnames(new_seqinfo) + genome(old_seqinfo) <- genome(new_seqinfo) + if (!identical(new_seqinfo, old_seqinfo)) + stop(wmsg("seqlengths() and isCircular() of the supplied 'seqinfo' ", + "must be identical to seqlengths() and isCircular() of ", + "the current 'seqinfo'", context)) +} + +.set_BSgenome_seqinfo <- function(x, new2old=NULL, - pruning.mode=c("error", "coarse", "fine", "tidy"), - value) - { - if (!is(value, "Seqinfo")) - stop("the supplied 'seqinfo' must be a Seqinfo object") - IN_THIS_CONTEXT <- paste0("when replacing the 'seqinfo' ", - "of a BSgenome object") - pruning.mode <- match.arg(pruning.mode) - if (pruning.mode != "error") - stop("'pruning.mode' not supported ", IN_THIS_CONTEXT) - x_seqinfo <- seqinfo(x) - if (is.null(new2old)) { - ## Support no-op seqinfo(x) <- seqinfo(x). - if (!identical(value, x_seqinfo)) - stop("'new2old' must be specified ", IN_THIS_CONTEXT) - return(x) - } - if (length(value) != length(x_seqinfo)) - stop("the supplied 'seqinfo' must have the same length ", - "as the current 'seqinfo' ", IN_THIS_CONTEXT) - if (!identical(new2old, seq_along(value))) - stop("'new2old' must be NULL or equal to 'seq_along(value)' ", - IN_THIS_CONTEXT) - new_seqnames <- seqnames(value) - seqnames(x_seqinfo) <- new_seqnames - if (!identical(value, x_seqinfo)) - stop("the supplied and current 'seqinfo' can differ only ", - "in their sequence names ", IN_THIS_CONTEXT) - if (any(new_seqnames %in% mseqnames(x))) - stop("the supplied 'seqnames' cannot match any of the ", - "multiple sequence names (as returned by 'mseqnames(x)')") - x@user_seqnames[] <- new_seqnames # using [] to preserve the names - x - } -) + pruning.mode=c("error", "coarse", "fine", "tidy"), + value) +{ + if (!is(value, "Seqinfo")) + stop(wmsg("the supplied 'seqinfo' must be a Seqinfo object")) + context <- paste0(" when replacing the 'seqinfo' of a ", + classNameForDisplay(x), " object") + pruning.mode <- match.arg(pruning.mode) + if (pruning.mode != "error") + stop(wmsg("'pruning.mode' is not supported", context)) + .check_new2old_and_new_seqinfo(new2old, value, seqinfo(x), context) + new_seqnames <- seqnames(value) + if (any(new_seqnames %in% mseqnames(x))) + stop(wmsg("the supplied 'seqnames' cannot match any of the ", + "multiple sequence names (as returned by 'mseqnames(x)')")) + x@user_seqnames[] <- new_seqnames # using [] to preserve the names + genome(x@seqinfo) <- unname(genome(value)) + x +} + +setReplaceMethod("seqinfo", "BSgenome", .set_BSgenome_seqinfo) setReplaceMethod("seqnames", "BSgenome", function(x, value) { - x_seqinfo <- seqinfo(x) - seqnames(x_seqinfo) <- value - seqinfo(x, new2old=seq_along(x_seqinfo)) <- x_seqinfo + seqnames(seqinfo(x)) <- value x } ) @@ -261,9 +297,8 @@ ### ### 'seqnames' only used for sanity check. -.makeBSgenomeSeqinfo <- function(single_sequences, - circ_seqs, provider_version, - seqnames) +.make_BSgenome_seqinfo <- function(single_sequences, circ_seqs, + genome, seqnames) { seqlengths <- seqlengths(single_sequences) if (length(seqnames) == 0L) { @@ -282,73 +317,53 @@ Seqinfo(seqnames=seqnames, seqlengths=seqlengths, isCircular=is_circ, - genome=provider_version) + genome=genome) } ### NOTES: -### - In BioC 2.14, the 'seqs_pkgname' and 'masks_pkgname' BSgenome slots -### were replaced with the 'pkgname' slot but the 2 corresponding -### arguments were kept for backward compatibility with existing -### BSgenome packages. +### - In BioC 2.14, the 'seqs_pkgname' BSgenome slot was renamed 'pkgname' +### but the corresponding argument was not renamed for backward +### compatibility with existing BSgenome packages. ### - In BioC 3.1, the 'species' argument was replaced with the 'common_name' ### argument but the former was kept for backward compatibility with ### existing BSgenome packages. -BSgenome <- function(organism, common_name, provider, provider_version, +### - In BioC 3.12, the 'provider_version' argument was replaced with the +### 'genome' argument, and the 'release_name' became ignored and was only +### kept for backward compatibility. +BSgenome <- function(organism, common_name, genome, + provider, provider_version, release_date, release_name, source_url, seqnames, circ_seqs=NA, mseqnames, seqs_pkgname, seqs_dirpath, - nmask_per_seq=0, masks_pkgname=NA, masks_dirpath=NA, species=NA_character_) { - twobit_filename <- "single_sequences.2bit" - twobit_filepath <- file.path(seqs_dirpath, twobit_filename) - if (file.exists(twobit_filepath)) { - single_sequences <- TwobitNamedSequences(twobit_filepath) - } else { - fa_filename <- "single_sequences.fa" - fa_filepath <- file.path(seqs_dirpath, fa_filename) - farz_filename <- paste0(fa_filename, ".rz") - farz_filepath <- file.path(seqs_dirpath, farz_filename) - if (file.exists(farz_filepath)) { - single_sequences <- FastaNamedSequences(farz_filepath) - } else if (file.exists(fa_filepath)) { - single_sequences <- FastaNamedSequences(fa_filepath) - } else { - single_sequences <- RdaNamedSequences(seqs_dirpath, seqnames) - } - } - seqinfo <- .makeBSgenomeSeqinfo(single_sequences, - circ_seqs, provider_version, - seqnames) + single_sequences <- OnDiskNamedSequences(seqs_dirpath, seqnames=seqnames) + if (missing(genome)) + genome <- provider_version + seqinfo <- .make_BSgenome_seqinfo(single_sequences, circ_seqs, + genome, seqnames) seqnames <- seqnames(seqinfo) if (missing(common_name)) common_name <- species - genome_description <- GenomeDescription(organism, common_name, - provider, provider_version, - release_date, release_name, - seqinfo) + metadata <- list(organism=organism, + common_name=common_name, + genome=genome, + provider=provider, + release_date=release_date, + source_url=source_url) if (is.null(mseqnames)) mseqnames <- character(0) multiple_sequences <- RdaCollection(seqs_dirpath, mseqnames) names(user_seqnames) <- user_seqnames <- seqnames - nmask_per_seq <- as.integer(nmask_per_seq) - masks_dirpath <- as.character(masks_dirpath) - if (nmask_per_seq == 0L || length(seqnames) == 0L) { - masks <- RdaCollection(masks_dirpath, character(0)) - } else { - masks <- RdaCollection(masks_dirpath, paste0(seqnames, ".masks")) - } - new("BSgenome", genome_description, - pkgname=seqs_pkgname, - single_sequences=single_sequences, - multiple_sequences=multiple_sequences, - source_url=source_url, - user_seqnames=user_seqnames, - .seqs_cache=new.env(parent=emptyenv()), - .link_counts=new.env(parent=emptyenv()), - nmask_per_seq=nmask_per_seq, - masks=masks + new("BSgenome", metadata=metadata, + pkgname=seqs_pkgname, + single_sequences=single_sequences, + multiple_sequences=multiple_sequences, + seqinfo=seqinfo, + user_seqnames=user_seqnames, + .seqs_cache=new.env(parent=emptyenv()), + .link_counts=new.env(parent=emptyenv()) ) } @@ -376,11 +391,46 @@ function(seqname) x[[seqname]]) ) +setAs("BSgenome", "GenomeDescription", + function(from) + { + metadata <- metadata(from) + GenomeDescription(organism=metadata$organism, + common_name=metadata$common_name, + provider=metadata$provider, + provider_version=metadata$genome, + release_date=metadata$release_date, + release_name=NA_character_, + seqinfo=seqinfo(from)) + } +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### bsgenomeName() +### + +setMethod("bsgenomeName", "BSgenome", + function(x) bsgenomeName(as(x, "GenomeDescription")) +) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### The "show" method ### +.print_BSgenome_metadata <- function(metadata, prefix="") +{ + common_name <- metadata$common_name + if (!is.na(common_name)) + cat(common_name, "genome:\n") + cat(prefix, "organism: ", metadata$organism, + " (", common_name, ")\n", sep="") + cat(prefix, "genome: ", metadata$genome, "\n", sep="") + cat(prefix, "provider: ", metadata$provider, "\n", sep="") + cat(prefix, "release date: ", metadata$release_date, "\n", sep="") +} + .SHOW_BSGENOME_PREFIX <- "# " .SHOW_SEQSECTION_PREFIX <- "# " @@ -390,10 +440,7 @@ mystrwrap <- function(line) writeLines(strwrap(line, width=getOption("width")+1, exdent=0L, prefix=.SHOW_BSGENOME_PREFIX)) - if (!is.na(object@common_name)) - cat(object@common_name, "genome:\n") - GenomeInfoDb:::showGenomeDescription(object, - margin=.SHOW_BSGENOME_PREFIX) + .print_BSgenome_metadata(metadata(object), prefix=.SHOW_BSGENOME_PREFIX) if (!is.null(SNPlocs_pkgname(object))) cat(.SHOW_BSGENOME_PREFIX, "with SNPs injected from package: ", SNPlocs_pkgname(object), "\n", sep="") if (length(mseqnames(object)) == 0L) { @@ -466,7 +513,7 @@ return(ans) } # single sequence - ans <- x@single_sequences[[seqname]] + ans <- getListElement(x@single_sequences, seqname) ## Check the length of the sequence if (length(ans) != seqlengths(x)[[user_seqname]]) { stop(user_seqname, " sequence does not have the expected length. ", @@ -482,17 +529,17 @@ masks_objname <- paste0(seqname, ".masks") builtinmasks <- x@masks[[masks_objname]] if (length(builtinmasks) < nmask_per_seq) { - masks_filepath <- rdaPath(x@masks, masks_objname) + masks_path <- rdaPath(x@masks, masks_objname) stop("expecting ", nmask_per_seq, " built-in masks per ", "single sequence, found only ", length(builtinmasks), - " in file '", masks_filepath, "'. ", + " in file '", masks_path, "'. ", "May be the data on disk is corrupted?") } if (length(builtinmasks) > nmask_per_seq) builtinmasks <- builtinmasks[seq_len(nmask_per_seq)] if (!identical(names(builtinmasks), masknames(x))) { - masks_filepath <- rdaPath(x@masks, masks_objname) - stop("mask names found in file '", masks_filepath, "' are not ", + masks_path <- rdaPath(x@masks, masks_objname) + stop("mask names found in file '", masks_path, "' are not ", "identical to the names returned by masknames(). ", "May be the data on disk is corrupted?") } @@ -555,10 +602,10 @@ ## to be of length <= 1 if (length(subscripts) == 0L) stop("no index specified") - i <- subscripts[[1]] - if (length(i) < 1) + i <- subscripts[[1L]] + if (length(i) < 1L) stop("attempt to select less than one element") - if (length(i) > 1) + if (length(i) > 1L) stop("attempt to select more than one element") if (is.character(i)) { user_seqname <- try(match.arg(i, names(x)), silent=TRUE) diff -Nru r-bioc-bsgenome-1.56.0/R/BSgenomeForge.R r-bioc-bsgenome-1.58.0/R/BSgenomeForge.R --- r-bioc-bsgenome-1.56.0/R/BSgenomeForge.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/R/BSgenomeForge.R 2020-10-27 17:17:33.000000000 +0000 @@ -3,30 +3,32 @@ ### ------------------------------------------------------------------------- -.get_circ_seqs <- function(provider, provider_version, circ_seqs) -{ - if (provider == "UCSC") { - UCSC_genomes <- registered_UCSC_genomes() - if (provider_version %in% UCSC_genomes[ , "genome"]) { - chrom_info <- getChromInfoFromUCSC(provider_version) - circ_idx <- which(chrom_info[ , "circular"]) - circ_seqs <- deparse(chrom_info[circ_idx, "chrom"]) - } - } - circ_seqs -} - .getMasksObjname <- function(seqnames) { if (length(seqnames) == 0) return(character(0)) - paste(seqnames, ".masks", sep="") + paste0(seqnames, ".masks") } -.saveObject <- function(object, objname, destdir=".", verbose=TRUE) +.saveObjectToRdsFile <- function(object, objname, destdir=".", verbose=TRUE) +{ + destfile <- file.path(destdir, paste0(objname, ".rds")) + if (verbose) + cat("Saving '", objname, "' object to compressed data file '", + destfile, "' ... ", sep="") + ## Using compress="xz" (instead of compress="gzip") would produce a .rds + ## file that is about 20% smaller on disk but it would also take almost 3x + ## longer to load it later on with load(). Tested on hg19 chr1 with R-2.14 + ## (2011-09-20 r57033). This is why we stick to compress="gzip". + saveRDS(object, file=destfile, compress="gzip") + if (verbose) + cat("DONE\n") +} + +.saveObjectToRdaFile <- function(object, objname, destdir=".", verbose=TRUE) { assign(objname, object) - destfile <- file.path(destdir, paste(objname, ".rda", sep="")) + destfile <- file.path(destdir, paste0(objname, ".rda")) if (verbose) cat("Saving '", objname, "' object to compressed data file '", destfile, "' ... ", sep="") @@ -49,9 +51,9 @@ { if (!is.null(seqnames) && !is.character(seqnames)) stop("'seqnames' must be a character vector (or NULL)") - if (length(seqnames) == 0) { + if (length(seqnames) == 0L) { warning("'seqnames' is empty") - return(character(0)) + return(setNames(character(0), character(0))) } if (!isSingleString(prefix)) stop("'prefix' must be a single string") @@ -59,7 +61,7 @@ stop("'suffix' must be a single string") if (!isSingleString(seqs_srcdir)) stop("'seqs_srcdir' must be a single string") - srcfiles <- paste(prefix, seqnames, suffix, sep="") + srcfiles <- paste0(prefix, seqnames, suffix) ans <- file.path(seqs_srcdir, srcfiles) is_OK <- file.exists(ans) if (!all(is_OK)) { @@ -72,53 +74,93 @@ ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### The getSeqlengths() and forgeSeqlengthsFile() functions. +### getSeqlengths(), forgeSeqlengthsRdsFile(), and forgeSeqlengthsRdaFile() ### -getSeqlengths <- function(seqnames, prefix="", suffix=".fa", seqs_srcdir=".") +getSeqlengths <- function(seqnames, prefix="", suffix=".fa", seqs_srcdir=".", + genome=NA_character_) { - if (length(seqnames) == 0) { - warning("'seqnames' is empty") - return(integer(0)) - } srcpaths <- getSeqSrcpaths(seqnames, prefix=prefix, suffix=suffix, seqs_srcdir=seqs_srcdir) - sapply(seqnames, function(seqname) - { - srcpath <- srcpaths[[seqname]] - ans <- fasta.seqlengths(srcpath) - if (length(ans) == 0) - stop("In file '", srcpath, "': no sequence found") - if (length(ans) > 1) - warning("In file '", srcpath, "': ", length(ans), - " sequences found, using first sequence only") - if (names(ans)[1] != seqname) - warning("In file '", srcpath, "': sequence description \"", - names(ans), "\" doesn't match user-specified ", - "sequence name \"", seqname, "\"") - ans[[1]] - }, - USE.NAMES=TRUE + ans <- vapply(seqnames, + function(seqname) { + srcpath <- srcpaths[[seqname]] + ans <- fasta.seqlengths(srcpath) + if (length(ans) == 0L) + stop("In file '", srcpath, "': no sequence found") + if (length(ans) > 1L) + warning("In file '", srcpath, "': ", length(ans), + " sequences found, using first sequence only") + if (names(ans)[[1L]] != seqname) + warning("In file '", srcpath, "': sequence description \"", + names(ans), "\" doesn't match user-specified ", + "sequence name \"", seqname, "\"") + ans[[1L]] + }, + integer(1), + USE.NAMES=TRUE ) + if (!is.na(genome)) { + expected <- seqlengths(Seqinfo(genome=genome)) + if (!identical(expected[names(ans)], ans)) + stop("the sequences in the files have lengths that don't match ", + "the lengths reported by 'Seqinfo(genome=\"", genome, "\")'") + } + ans } -forgeSeqlengthsFile <- function(seqnames, prefix="", suffix=".fa", - seqs_srcdir=".", seqs_destdir=".", - verbose=TRUE) +forgeSeqlengthsRdsFile <- function(seqnames, prefix="", suffix=".fa", + seqs_srcdir=".", seqs_destdir=".", + genome=NA_character_, verbose=TRUE) { + if (!isSingleString(seqs_destdir)) + stop("'seqs_destdir' must be a single string") seqlengths <- getSeqlengths(seqnames, prefix=prefix, suffix=suffix, - seqs_srcdir=seqs_srcdir) + seqs_srcdir=seqs_srcdir, genome=genome) + .saveObjectToRdsFile(seqlengths, "seqlengths", destdir=seqs_destdir, + verbose=verbose) +} + +forgeSeqlengthsRdaFile <- function(seqnames, prefix="", suffix=".fa", + seqs_srcdir=".", seqs_destdir=".", + genome=NA_character_, verbose=TRUE) +{ if (!isSingleString(seqs_destdir)) stop("'seqs_destdir' must be a single string") - .saveObject(seqlengths, "seqlengths", destdir=seqs_destdir, - verbose=verbose) + seqlengths <- getSeqlengths(seqnames, prefix=prefix, suffix=suffix, + seqs_srcdir=seqs_srcdir, genome=genome) + .saveObjectToRdaFile(seqlengths, "seqlengths", destdir=seqs_destdir, + verbose=verbose) } ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### The "forgeSeqFiles" function. +### forgeSeqFiles() ### +.forgeRdsSeqFile <- function(name, prefix, suffix, seqs_srcdir, seqs_destdir, + is.single.seq=TRUE, verbose=TRUE) +{ + if (!isSingleString(name)) + stop("'name' must be a single string") + srcpath <- getSeqSrcpaths(name, prefix=prefix, suffix=suffix, + seqs_srcdir=seqs_srcdir) + if (verbose) + cat("Loading FASTA file '", srcpath, "' ... ", sep="") + seq <- readDNAStringSet(srcpath, "fasta") + if (verbose) + cat("DONE\n") + if (is.single.seq) { + if (length(seq) == 0L) + stop("file contains no DNA sequence") + if (length(seq) > 1L) + warning("file contains ", length(seq), " sequences, ", + "using the first sequence only") + seq <- seq[[1L]] # now 'seq' is a DNAString object + } + .saveObjectToRdsFile(seq, name, destdir=seqs_destdir, verbose=verbose) +} + .forgeRdaSeqFile <- function(name, prefix, suffix, seqs_srcdir, seqs_destdir, is.single.seq=TRUE, verbose=TRUE) { @@ -133,14 +175,14 @@ if (verbose) cat("DONE\n") if (is.single.seq) { - if (length(seq) == 0) + if (length(seq) == 0L) stop("file contains no DNA sequence") - if (length(seq) > 1) + if (length(seq) > 1L) warning("file contains ", length(seq), " sequences, ", "using the first sequence only") seq <- seq[[1]] # now 'seq' is a DNAString object } - .saveObject(seq, name, destdir=seqs_destdir, verbose=verbose) + .saveObjectToRdaFile(seq, name, destdir=seqs_destdir, verbose=verbose) } .forgeFastaRzFileFromFastaFiles <- function(seqnames, prefix, suffix, @@ -300,12 +342,12 @@ cat("DONE\n") } -.sortUCSCTwobitFile <- function(provider_version, seqs_dir, verbose=TRUE) +.sortUCSCTwobitFile <- function(genome, seqs_dir, verbose=TRUE) { if (verbose) cat("Getting chrom info from UCSC with 'getChromInfoFromUCSC(\"", - provider_version, "\")' ... ", sep="") - chrom_info <- getChromInfoFromUCSC(provider_version) + genome, "\")' ... ", sep="") + chrom_info <- getChromInfoFromUCSC(genome) if (verbose) cat("DONE\n") filename <- "single_sequences.2bit" @@ -318,10 +360,10 @@ m <- match(chrom_info[ , "chrom"], names(seqs)) if (nrow(chrom_info) != length(seqs) || anyNA(m)) stop(wmsg("nb of sequences in 'chromInfo' table and 2bit file ", - "don't match for UCSC genome ", provider_version)) + "don't match for UCSC genome ", genome)) if (verbose) cat("Sorting sequences as in 'getChromInfoFromUCSC(\"", - provider_version, "\")' ... ", sep="") + genome, "\")' ... ", sep="") seqs <- seqs[m] if (verbose) cat("DONE\n") @@ -329,7 +371,7 @@ cat("Checking the sequence lengths ... ") if (!identical(chrom_info[ , "size"], width(seqs))) stop(wmsg("sequence lengths in 'chromInfo' table and 2bit file ", - "don't match for UCSC genome ", provider_version)) + "don't match for UCSC genome ", genome)) if (verbose) cat("OK\n") if (verbose) @@ -339,11 +381,11 @@ cat("DONE\n") } -forgeSeqFiles <- function(provider, provider_version, +forgeSeqFiles <- function(provider, genome, seqnames, mseqnames=NULL, seqfile_name=NA, prefix="", suffix=".fa", seqs_srcdir=".", seqs_destdir=".", - ondisk_seq_format=c("2bit", "rda", "fa.rz", "fa"), + ondisk_seq_format=c("2bit", "rds", "rda", "fa.rz", "fa"), verbose=TRUE) { if (length(seqnames) == 0L && is.na(seqfile_name)) @@ -353,7 +395,12 @@ if (!isSingleString(seqs_destdir)) stop("'seqs_destdir' must be a single string") ondisk_seq_format <- match.arg(ondisk_seq_format) - if (ondisk_seq_format == "rda") { # "rda" format + if (ondisk_seq_format == "rds") { # "rds" format + for (name in seqnames) { + .forgeRdsSeqFile(name, prefix, suffix, seqs_srcdir, seqs_destdir, + is.single.seq=TRUE, verbose=verbose) + } + } else if (ondisk_seq_format == "rda") { # "rda" format for (name in seqnames) { .forgeRdaSeqFile(name, prefix, suffix, seqs_srcdir, seqs_destdir, is.single.seq=TRUE, verbose=verbose) @@ -373,8 +420,8 @@ verbose=verbose) if (provider == "UCSC") { UCSC_genomes <- registered_UCSC_genomes() - if (provider_version %in% UCSC_genomes[ , "genome"]) - .sortUCSCTwobitFile(provider_version, seqs_destdir, + if (genome %in% UCSC_genomes[ , "genome"]) + .sortUCSCTwobitFile(genome, seqs_destdir, verbose=verbose) } } @@ -391,7 +438,7 @@ ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### The "forgeMasksFiles" function. +### forgeMasksFiles() ### ## AGAPS is the mask of "assembly gaps" (inter-contig Ns). @@ -417,7 +464,7 @@ desc(ans) <- "assembly gaps (empty)" } else { if (is.na(filename)) - filename <- paste(fileprefix, seqname, filesuffix, sep="") + filename <- paste0(fileprefix, seqname, filesuffix) filepath <- file.path(masks_srcdir, filename) if (filetype == "gap") ans <- read.gapMask(filepath, seqname=seqname, mask.width=mask_width) @@ -441,7 +488,7 @@ ans <- collapse(masks(seq)[-1]) desc(ans) <- "intra-contig ambiguities" if (isEmpty(ans)) - desc(ans) <- paste(desc(ans), "(empty)") + desc(ans) <- paste0(desc(ans), " (empty)") active(ans) <- TRUE names(ans) <- "AMB" ans @@ -459,7 +506,7 @@ if (!isSingleStringOrNA(filesuffix)) stop("'filesuffix' must be a single string or NA") if (is.na(filename)) - filename <- paste(fileprefix, seqname, filesuffix, sep="") + filename <- paste0(fileprefix, seqname, filesuffix) filepath <- file.path(masks_srcdir, filename) if (file.exists(filepath)) { ans <- read.rmMask(filepath, seqname=seqname, mask.width=mask_width) @@ -486,7 +533,7 @@ if (!isSingleStringOrNA(filesuffix)) stop("'filesuffix' must be a single string or NA") if (is.na(filename)) - filename <- paste(fileprefix, seqname, filesuffix, sep="") + filename <- paste0(fileprefix, seqname, filesuffix) filepath <- file.path(masks_srcdir, filename) if (file.exists(filepath)) { ans <- read.trfMask(filepath, seqname=seqname, mask.width=mask_width) @@ -528,7 +575,7 @@ ## Load the sequence. ondisk_seq_format <- match.arg(ondisk_seq_format) if (ondisk_seq_format == "rda") { # "rda" format - seqfile <- file.path(seqs_destdir, paste(seqname, ".rda", sep="")) + seqfile <- file.path(seqs_destdir, paste0(seqname, ".rda")) load(seqfile) seq <- get(seqname) remove(list=seqname) @@ -576,7 +623,7 @@ masks <- append(masks, TRFmask) } objname <- .getMasksObjname(seqname) - .saveObject(masks, objname, destdir=masks_destdir, verbose=verbose) + .saveObjectToRdaFile(masks, objname, destdir=masks_destdir, verbose=verbose) } forgeMasksFiles <- function(seqnames, nmask_per_seq, @@ -589,7 +636,7 @@ TRFfiles_name=NA, TRFfiles_prefix="", TRFfiles_suffix=".bed", verbose=TRUE) { - if (length(seqnames) == 0) + if (length(seqnames) == 0L) warning("'seqnames' is empty") for (seqname in seqnames) { .forgeMasksFile(seqname, nmask_per_seq, @@ -622,10 +669,13 @@ License="character", organism="character", common_name="character", + ## Should be accepted by 'Seqinfo(genome=genome)' e.g. "TAIR10.1", + ## "hg38", "ce11" etc... + genome="character", provider="character", - provider_version="character", + provider_version="character", # deprecated in favor of 'genome' release_date="character", - release_name="character", + release_name="character", # deprecated (not replaced by anything) source_url="character", organism_biocview="character", BSgenomeObjname="character", @@ -645,9 +695,12 @@ Maintainer="Bioconductor Package Maintainer ", Suggests="", License="Artistic-2.0", + genome=NA_character_, + provider_version=NA_character_, + release_name=NA_character_, source_url="-- information not available --", - seqnames="NULL", # equivalent to "character(0)" - circ_seqs="NULL", # equivalent to "character(0)" + seqnames=NA_character_, + circ_seqs=NA_character_, mseqnames="NULL", # equivalent to "character(0)" PkgDetails="", SrcDataFiles="-- information not available --", @@ -760,7 +813,34 @@ template_path <- system.file("pkgtemplates", "BSgenome_datapkg", package="BSgenome") BSgenome_version <- installed.packages()['BSgenome','Version'] - circ_seqs <- .get_circ_seqs(x@provider, x@provider_version, x@circ_seqs) + if (is.na(x@genome)) { + if (is.na(x@provider_version)) + stop("'genome' field is missing in seed file") + warning("field 'provider_version' is deprecated ", + "in favor of 'genome'") + x@genome <- x@provider_version + } + if (!is.na(x@release_name)) + warning("field 'release_name' is deprecated") + seqnames <- x@seqnames + if (!is.na(seqnames)) { + .seqnames <- eval(parse(text=seqnames)) + } else { + if (is.na(x@seqfile_name)) { + .seqnames <- seqlevels(Seqinfo(genome=x@genome)) + } else { + .seqnames <- NULL + } + seqnames <- deparse(.seqnames) + } + circ_seqs <- x@circ_seqs + if (!is.na(circ_seqs)) { + .circ_seqs <- eval(parse(text=circ_seqs)) + } else { + si <- Seqinfo(genome=x@genome) + .circ_seqs <- seqlevels(si)[isCircular(si)] + circ_seqs <- deparse(.circ_seqs) + } symvals <- list( PKGTITLE=x@Title, PKGDESCRIPTION=x@Description, @@ -772,14 +852,13 @@ LICENSE=x@License, ORGANISM=x@organism, COMMONNAME=x@common_name, + GENOME=x@genome, PROVIDER=x@provider, - PROVIDERVERSION=x@provider_version, RELEASEDATE=x@release_date, - RELEASENAME=x@release_name, SOURCEURL=x@source_url, ORGANISMBIOCVIEW=x@organism_biocview, BSGENOMEOBJNAME=x@BSgenomeObjname, - SEQNAMES=ifelse(is.na(x@seqfile_name), x@seqnames, "NULL"), + SEQNAMES=seqnames, CIRCSEQS=circ_seqs, MSEQNAMES=x@mseqnames, PKGDETAILS=x@PkgDetails, @@ -800,20 +879,30 @@ Biobase::createPackage(x@Package, destdir, template_path, symvals) pkgdir <- file.path(destdir, x@Package) - .seqnames <- eval(parse(text=x@seqnames)) .mseqnames <- eval(parse(text=x@mseqnames)) seqs_destdir <- file.path(pkgdir, "inst", "extdata") - if (x@ondisk_seq_format == "rda") { + if (x@ondisk_seq_format == "rds") { + ## Forge the "seqlengths.rds" file + forgeSeqlengthsRdsFile(.seqnames, + prefix=x@seqfiles_prefix, + suffix=x@seqfiles_suffix, + seqs_srcdir=seqs_srcdir, + seqs_destdir=seqs_destdir, + genome=x@genome, + verbose=verbose) + } else if (x@ondisk_seq_format == "rda") { ## Forge the "seqlengths.rda" file - forgeSeqlengthsFile(.seqnames, - prefix=x@seqfiles_prefix, - suffix=x@seqfiles_suffix, - seqs_srcdir=seqs_srcdir, - seqs_destdir=seqs_destdir, - verbose=verbose) + forgeSeqlengthsRdaFile(.seqnames, + prefix=x@seqfiles_prefix, + suffix=x@seqfiles_suffix, + seqs_srcdir=seqs_srcdir, + seqs_destdir=seqs_destdir, + genome=x@genome, + verbose=verbose) } - ## Forge the sequence files (either "2bit", "rda", "fa.rz", or "fa") - forgeSeqFiles(x@provider, x@provider_version, + ## Forge the sequence files (either "2bit", "rds", "rda", "fa.rz", + ## or "fa"). + forgeSeqFiles(x@provider, x@genome, .seqnames, mseqnames=.mseqnames, seqfile_name=x@seqfile_name, prefix=x@seqfiles_prefix, @@ -879,7 +968,7 @@ seed_dir <- system.file("extdata", "GentlemanLab", package="BSgenome") file <- file.path(seed_dir, file) if (!file.exists(file)) { - file <- paste(file, "-seed", sep="") + file <- paste0(file, "-seed") if (!file.exists(file)) stop("seed file '", file0, "' not found") } @@ -943,12 +1032,11 @@ REFPKGNAME=x@RefPkgname, SUGGESTS=x@Suggests, LICENSE=x@License, - ORGANISM=organism(ref_bsgenome), - COMMONNAME=commonName(ref_bsgenome), - PROVIDER=provider(ref_bsgenome), - PROVIDERVERSION=providerVersion(ref_bsgenome), - RELEASEDATE=releaseDate(ref_bsgenome), - RELEASENAME=releaseName(ref_bsgenome), + ORGANISM=metadata(ref_bsgenome)$organism, + COMMONNAME=metadata(ref_bsgenome)$common_name, + GENOME=metadata(ref_bsgenome)$genome, + PROVIDER=metadata(ref_bsgenome)$provider, + RELEASEDATE=metadata(ref_bsgenome)$release_date, SOURCEURL=x@source_url, ORGANISMBIOCVIEW=x@organism_biocview, NMASKPERSEQ=as.character(x@nmask_per_seq), diff -Nru r-bioc-bsgenome-1.56.0/R/OnDiskNamedSequences-class.R r-bioc-bsgenome-1.58.0/R/OnDiskNamedSequences-class.R --- r-bioc-bsgenome-1.56.0/R/OnDiskNamedSequences-class.R 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/R/OnDiskNamedSequences-class.R 2020-10-27 17:17:33.000000000 +0000 @@ -12,7 +12,7 @@ ### - seqlengthsFilepath() ### - seqinfo API (implementing seqinfo() is enough to make seqlengths(), ### seqlevels(), etc... work) -### - [[ -- load full sequence as XString object +### - getListElement() -- load full sequence as XString derivative ### Default methods are provided for the following: ### - length() ### - seqnames() @@ -30,18 +30,28 @@ ### Low-level utilities -### Works on an XString object or any object 'x' for which seqlengths() is -### defined. +.check_getListElement_index <- function(i, what) +{ + if (!is.character(i)) + stop(wmsg(what, " can only be subsetted by name")) + if (length(i) < 1L) + stop(wmsg("attempt to select less than one element")) + if (length(i) > 1L) + stop(wmsg("attempt to select more than one element")) +} + +### Works on an XString derivative or any object 'x' for which seqlengths() +### is defined. .get_seqlength <- function(x, seqname) { if (!isSingleString(seqname)) - stop("'seqname' must be a single string") + stop(wmsg("'seqname' must be a single string")) if (is(x, "XString")) return(length(x)) x_seqlengths <- seqlengths(x) idx <- match(seqname, names(x_seqlengths)) if (is.na(idx)) - stop("invalid sequence name: ", seqname) + stop(wmsg("invalid sequence name: ", seqname)) x_seqlengths[[idx]] } @@ -60,7 +70,7 @@ } ) -### Load regions from a single sequence as an XStringSet object. +### Load regions from a single sequence as an XStringSet derivative. ### 'seqname' is ignored. setMethod("loadSubseqsFromLinearSequence", "XString", @@ -70,15 +80,104 @@ setMethod("loadSubseqsFromLinearSequence", "OnDiskNamedSequences", function(x, seqname, ranges) - loadSubseqsFromLinearSequence(x[[seqname]], seqname, ranges) + { + seq <- getListElement(x, seqname) + loadSubseqsFromLinearSequence(seq, seqname, ranges) + } +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### RdsNamedSequences objects +### +### The 'dirpath' slot should point to a directory that contains one .rds +### file per XString derivative + a seqlengths.rds file that contains a +### serialized named integer vector with the sequence names and lengths. +### + +setClass("RdsNamedSequences", + contains=c("RdsCollection", "OnDiskNamedSequences"), + prototype=prototype( + elementType="XString" + ) +) + +setMethod("seqlevels", "RdsNamedSequences", function(x) names(x)) + +setMethod("seqlengthsFilepath", "RdsNamedSequences", + function(x) file.path(path(x), "seqlengths.rds") +) + +.read_seqlengths_from_file <- function(x) readRDS(seqlengthsFilepath(x)) + +setMethod("seqlengths", "RdsNamedSequences", + function(x) + { + noext_ends <- nchar(x@filenames) - nchar(".rds") + noext_filenames <- substr(x@filenames, 1L, noext_ends) + setNames(.read_seqlengths_from_file(x)[noext_filenames], names(x)) + } +) + +setMethod("seqinfo", "RdsNamedSequences", + function(x) + { + x_seqlengths <- seqlengths(x) + Seqinfo(names(x_seqlengths), unname(x_seqlengths)) + } +) + +setAs("RdsCollection", "RdsNamedSequences", + function(from) + { + ans <- new2("RdsNamedSequences", from, elementType="XString", + check=FALSE) + seqlengths_from_file <- .read_seqlengths_from_file(ans) + if (!is.integer(seqlengths_from_file)) + stop(wmsg("object serialized in ", + "file '", seqlengthsFilepath(ans), "' ", + "must be a named integer vector")) + if (!all(names(from) %in% names(seqlengths_from_file))) + stop(wmsg("the names on the RdsCollection object to coerce ", + "to RdsNamedSequences must be a subset of ", + "the names on the integer vector serialized ", + "in file '", seqlengthsFilepath(ans), "'")) + ans + } +) + +### Constructor. +RdsNamedSequences <- function(path, seqnames) +{ + filenames <- paste0(seqnames, ".rds") + as(RdsCollection(path, filenames), "RdsNamedSequences") +} + +### Load a full sequence as an XString derivative. +setMethod("getListElement", "RdsNamedSequences", + function(x, i, exact=TRUE) + { + .check_getListElement_index(i, "an RdsNamedSequences object") + ans <- callNextMethod() + if (!is(ans, "XString")) { + filepath <- file.path(path(x), x@filenames[[i]]) + stop(wmsg("serialized object in file '", filepath, "' ", + "must be an XString derivative")) + } + updateObject(ans) + } ) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### RdaNamedSequences objects ### -### The "dirpath" slot should contain 1 serialized XString object per -### sequence + a serialized named integer vector ('seqlengths.rda') +### June 2020: THE RdaNamedSequences CLASS IS SUPERSEDED BY THE +### RdsNamedSequences CLASS! +### TODO: Deprecate the RdaNamedSequences class. +### +### The "dirpath" slot should contain 1 serialized XString derivative +### per sequence + a serialized named integer vector ('seqlengths.rda') ### containing the sequence names and lengths. ### @@ -100,8 +199,8 @@ { ans <- x@seqlengths[["seqlengths"]] if (!is.integer(ans) || is.null(names(ans))) - stop("serialized object in file '", seqlengthsFilepath(x), "' ", - "must be a named integer vector") + stop(wmsg("serialized object in file '", seqlengthsFilepath(x), + "' must be a named integer vector")) ans } ) @@ -123,14 +222,15 @@ new("RdaNamedSequences", sequences, seqlengths=seqlengths) } -### Load a full sequence as an XString object. -setMethod("[[", "RdaNamedSequences", - function(x, i, j, ...) +### Load a full sequence as an XString derivative. +setMethod("getListElement", "RdaNamedSequences", + function(x, i, exact=TRUE) { - ans <- callNextMethod() + .check_getListElement_index(i, "an RdaNamedSequences object") + ans <- x[[i]] if (!is(ans, "XString")) - stop("serialized object in file '", rdaPath(x, i), "' ", - "must be an XString object") + stop(wmsg("serialized object in file '", rdaPath(x, i), "' ", + "must be an XString derivative")) updateObject(ans) } ) @@ -165,19 +265,11 @@ new("FastaNamedSequences", fafile=fafile) } -### We only support subetting by name. ### Load a full sequence as a DNAString object. -setMethod("[[", "FastaNamedSequences", - function(x, i, j, ...) +setMethod("getListElement", "FastaNamedSequences", + function(x, i, exact=TRUE) { - if (!missing(j) || length(list(...)) > 0L) - stop("invalid subsetting") - if (!is.character(i)) - stop("a FastaNamedSequences object can only be subsetted by name") - if (length(i) < 1L) - stop("attempt to select less than one element") - if (length(i) > 1L) - stop("attempt to select more than one element") + .check_getListElement_index(i, "a FastaNamedSequences object") fafile <- x@fafile seqlength <- .get_seqlength(fafile, i) param <- GRanges(i, IRanges(1L, seqlength)) @@ -193,13 +285,13 @@ function(x, seqname, ranges) { if (!is(ranges, "IntegerRanges")) - stop("'ranges' must be an IntegerRanges object") + stop(wmsg("'ranges' must be an IntegerRanges object")) fafile <- x@fafile seqlength <- .get_seqlength(fafile, seqname) if (length(ranges) != 0L && (min(start(ranges)) < 1L || max(end(ranges)) > seqlength)) - stop("trying to load regions beyond the boundaries ", - "of non-circular sequence \"", seqname, "\"") + stop(wmsg("trying to load regions beyond the boundaries ", + "of non-circular sequence \"", seqname, "\"")) param <- GRanges(seqname, ranges) scanFa(fafile, param=param) } @@ -234,19 +326,11 @@ new("TwobitNamedSequences", twobitfile=twobitfile) } -### We only support subetting by name. ### Load a full sequence as a DNAString object. -setMethod("[[", "TwobitNamedSequences", - function(x, i, j, ...) +setMethod("getListElement", "TwobitNamedSequences", + function(x, i, exact=TRUE) { - if (!missing(j) || length(list(...)) > 0L) - stop("invalid subsetting") - if (!is.character(i)) - stop("a FastaNamedSequences object can only be subsetted by name") - if (length(i) < 1L) - stop("attempt to select less than one element") - if (length(i) > 1L) - stop("attempt to select more than one element") + .check_getListElement_index(i, "a TwobitNamedSequences object") twobitfile <- x@twobitfile seqlength <- .get_seqlength(twobitfile, i) which <- GRanges(i, IRanges(1L, seqlength)) @@ -259,13 +343,13 @@ function(x, seqname, ranges) { if (!is(ranges, "IntegerRanges")) - stop("'ranges' must be an IntegerRanges object") + stop(wmsg("'ranges' must be an IntegerRanges object")) twobitfile <- x@twobitfile seqlength <- .get_seqlength(twobitfile, seqname) if (length(ranges) != 0L && (min(start(ranges)) < 1L || max(end(ranges)) > seqlength)) - stop("trying to load regions beyond the boundaries ", - "of non-circular sequence \"", seqname, "\"") + stop(wmsg("trying to load regions beyond the boundaries ", + "of non-circular sequence \"", seqname, "\"")) which <- GRanges(seqname, ranges) import(twobitfile, which=which) } @@ -279,7 +363,7 @@ .loadSubseqsFromCircularSequence <- function(x, seqname, ranges) { if (!is(ranges, "IntegerRanges")) - stop("'ranges' must be an IntegerRanges object") + stop(wmsg("'ranges' must be an IntegerRanges object")) seqlength <- .get_seqlength(x, seqname) LRranges <- splitLRranges(ranges, seqlength, seqname) Lans <- loadSubseqsFromLinearSequence(x, seqname, LRranges$L) @@ -291,9 +375,9 @@ is_circular=NA) { if (length(ranges) != length(strand)) - stop("'ranges' and 'strand' must have the same length") + stop(wmsg("'ranges' and 'strand' must have the same length")) if (!is.logical(is_circular) || length(is_circular) != 1L) - stop("'is_circular' must be a single logical") + stop(wmsg("'is_circular' must be a single logical")) if (is_circular %in% c(NA, FALSE)) { loadFUN <- loadSubseqsFromLinearSequence } else { @@ -301,7 +385,43 @@ } ans <- loadFUN(x, seqname, ranges) idx <- which(strand == "-") - ans[idx] <- reverseComplement(ans[idx]) + if (length(idx) != 0L) + ans[idx] <- reverseComplement(ans[idx]) ans } + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### OnDiskNamedSequences() constructor +### + +OnDiskNamedSequences <- function(dirpath, seqnames=NULL) +{ + filename <- "seqlengths.rds" + filepath <- file.path(dirpath, filename) + if (file.exists(filepath)) + return(RdsNamedSequences(dirpath, seqnames)) + + filename <- "seqlengths.rda" + filepath <- file.path(dirpath, filename) + if (file.exists(filepath)) + return(RdaNamedSequences(dirpath, seqnames)) + + filename <- "single_sequences.fa" + filepath <- file.path(dirpath, filename) + if (file.exists(filepath)) + return(FastaNamedSequences(filepath)) + + filename <- "single_sequences.fa.rz" + filepath <- file.path(dirpath, filename) + if (file.exists(filepath)) + return(FastaNamedSequences(filepath)) + + filename <- "single_sequences.2bit" + filepath <- file.path(dirpath, filename) + if (file.exists(filepath)) + return(TwobitNamedSequences(filepath)) + + stop(wmsg("invalid directory content at ", dirpath)) +} + diff -Nru r-bioc-bsgenome-1.56.0/vignettes/BSgenomeForge.Rnw r-bioc-bsgenome-1.58.0/vignettes/BSgenomeForge.Rnw --- r-bioc-bsgenome-1.56.0/vignettes/BSgenomeForge.Rnw 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/vignettes/BSgenomeForge.Rnw 2020-10-27 17:17:33.000000000 +0000 @@ -75,13 +75,6 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -IMPORTANT NOTE: Starting with Bioconductor 2.14, the -\Rfunction{forgeBSgenomeDataPkg} function doesn't handle masks anymore -i.e. now it produces a \term{BSgenome data package} that contains only -the bare sequences. A new function \Rfunction{forgeMaskedBSgenomeDataPkg} -has been added to forge a \term{BSgenome data package} with masked -sequences. This vignette has been updated to reflect these changes. - This vignette describes the process of forging a \term{BSgenome data package}. It is intended for Bioconductor users who want to make a new \term{BSgenome data package}, not for regular users of these packages. @@ -128,9 +121,12 @@ \Rpackage{Biostrings} or \Rpackage{BSgenome} packages. Questions, comments or bug reports about this vignette or about the -BSgenomeForge functions are welcome. Please address them to the author -(\code{hpages@fredhutch.org}) or post them on the Bioconductor support -site \footnote{\url{https://support.bioconductor.org/}}. +BSgenomeForge functions are welcome. Please use the Bioconductor support +site \footnote{\url{https://support.bioconductor.org/}} if you have +questions or need help about forging a \term{BSgenome data package}, +or about anything related to the \Rpackage{BSgenome} package. For bug +reports or feature requests, please open an issue on GitHub at +\url{https://github.com/Bioconductor/BSgenome/issues}. In the next section (``How to forge a BSgenome data package with bare sequences''), we describe how to forge a \term{BSgenome data package} @@ -318,27 +314,24 @@ the \textit{List of UCSC genome releases} table \footnote{\url{http://genome.ucsc.edu/FAQ/FAQreleases\#release1}}. -\item \code{provider}: The provider of the \term{sequence data files} e.g. - \code{UCSC}, \code{NCBI}, \code{BDGP}, \code{FlyBase}, etc... - Should preferably match part 3 of the package name (field \code{Package}). - -\item \code{provider\_version}: The provider-side version of the genome. +\item \code{genome}: The name of the genome. Typically the name of an NCBI + assembly (e.g. \code{GRCh38.p12}, \code{WBcel235}, \code{TAIR10.1}, + \code{ARS-UCD1.2}, etc...) or UCSC genome (e.g. \code{hg38}, + \code{bosTau9}, \code{galGal6}, \code{ce11}, etc...). Should preferably match part 4 of the package name (field \code{Package}). For the packages built by the Bioconductor project from a UCSC genome, this field corresponds to the \code{UCSC VERSION} field of the \textit{List of UCSC genome releases} table. +\item \code{provider}: The provider of the \term{sequence data files} e.g. + \code{UCSC}, \code{NCBI}, \code{BDGP}, \code{FlyBase}, etc... + Should preferably match part 3 of the package name (field \code{Package}). + \item \code{release\_date}: When this assembly of the genome was released. For the packages built by the Bioconductor project from a UCSC genome, this field corresponds to the \code{RELEASE DATE} field of the \textit{List of UCSC genome releases} table. -\item \code{release\_name}: The release name or build number of this assembly - of the genome. - For the packages built by the Bioconductor project from a UCSC genome, - this field corresponds to the \code{RELEASE NAME} field of the - \textit{List of UCSC genome releases} table. - \item \code{source\_url}: The permanent URL where the \term{sequence data files} used to forge the \term{target package} can be found. diff -Nru r-bioc-bsgenome-1.56.0/vignettes/GenomeSearching.Rnw r-bioc-bsgenome-1.58.0/vignettes/GenomeSearching.Rnw --- r-bioc-bsgenome-1.56.0/vignettes/GenomeSearching.Rnw 2020-04-27 20:20:26.000000000 +0000 +++ r-bioc-bsgenome-1.58.0/vignettes/GenomeSearching.Rnw 2020-10-27 17:17:33.000000000 +0000 @@ -123,17 +123,15 @@ @ When displayed, some basic information about the origin of the -genome is shown (organism, provider, provider version, etc...) +genome is shown (organism, genome, provider, etc...) followed by the index of {\it single} sequences and eventually an additional index of {\it multiple} sequences. Methods (adequately called {\it accessor methods}) are defined for individual access to this information: <>= -organism(genome) -provider(genome) -providerVersion(genome) +metadata(genome) seqnames(genome) -mseqnames(genome) +seqinfo(genome) @ See the man page for the \Rclass{BSgenome} class (\Rfunction{?BSgenome}) @@ -328,7 +326,7 @@ genome <- BSgenome.Celegans.UCSC.ce2 seqnames <- seqnames(genome) seqnames_in1string <- paste(seqnames, collapse=", ") - cat("Target:", providerVersion(genome), + cat("Target:", metadata(genome)$genome, "chromosomes", seqnames_in1string, "\n") append <- FALSE for (seqname in seqnames) { @@ -737,7 +735,7 @@ runOneStrandAnalysis <- function(dict0, bsgenome, seqnames, strand, outfile="", append=FALSE) { - cat("\nTarget: strand", strand, "of", providerVersion(bsgenome), + cat("\nTarget: strand", strand, "of", metadata(bsgenome)$genome, "chromosomes", paste(seqnames, collapse=", "), "\n") if (strand == "-") dict0 <- reverseComplement(dict0)