Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/build/vignette.rds and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/build/vignette.rds differ diff -Nru r-bioc-fishpond-2.0.1+ds/debian/changelog r-bioc-fishpond-2.2.0+ds/debian/changelog --- r-bioc-fishpond-2.0.1+ds/debian/changelog 2021-12-11 20:32:29.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/changelog 2022-05-18 02:58:07.000000000 +0000 @@ -1,3 +1,22 @@ +r-bioc-fishpond (2.2.0+ds-2) unstable; urgency=medium + + * Team upload. + * Add test-dep on r-cran-data.table + * Add test dep on tximportdata + * Do not run tests requiring non-existing data + + -- Nilesh Patra Wed, 18 May 2022 08:28:07 +0530 + +r-bioc-fishpond (2.2.0+ds-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * Standards-Version: 4.6.1 (routine-update) + * Reorder sequence of d/control fields by cme (routine-update) + * Upstream has removed test so we do not need to patch it + + -- Andreas Tille Mon, 16 May 2022 07:11:33 +0200 + r-bioc-fishpond (2.0.1+ds-2) unstable; urgency=medium * Team Upload. diff -Nru r-bioc-fishpond-2.0.1+ds/debian/control r-bioc-fishpond-2.2.0+ds/debian/control --- r-bioc-fishpond-2.0.1+ds/debian/control 2021-12-09 21:14:08.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/control 2022-05-18 02:54:41.000000000 +0000 @@ -1,9 +1,13 @@ Source: r-bioc-fishpond -Maintainer: Debian R Packages Maintainers -Uploaders: Steffen Moeller Section: gnu-r -Testsuite: autopkgtest-pkg-r Priority: optional +Maintainer: Debian R Packages Maintainers +Uploaders: Steffen Moeller +Vcs-Browser: https://salsa.debian.org/r-pkg-team/r-bioc-fishpond +Vcs-Git: https://salsa.debian.org/r-pkg-team/r-bioc-fishpond.git +Homepage: https://bioconductor.org/packages/fishpond/ +Standards-Version: 4.6.1 +Rules-Requires-Root: no Build-Depends: debhelper-compat (= 13), dh-r, r-base-dev, @@ -11,18 +15,16 @@ r-cran-gtools, r-bioc-qvalue, r-bioc-s4vectors, + r-bioc-iranges, r-bioc-summarizedexperiment, + r-bioc-genomicranges, r-cran-matrixstats, r-cran-svmisc, r-cran-rcpp, r-cran-matrix, r-bioc-singlecellexperiment, r-cran-jsonlite -Standards-Version: 4.6.0 -Vcs-Browser: https://salsa.debian.org/r-pkg-team/r-bioc-fishpond -Vcs-Git: https://salsa.debian.org/r-pkg-team/r-bioc-fishpond.git -Homepage: https://bioconductor.org/packages/fishpond/ -Rules-Requires-Root: no +Testsuite: autopkgtest-pkg-r Package: r-bioc-fishpond Architecture: any diff -Nru r-bioc-fishpond-2.0.1+ds/debian/patches/fix-test-dataset-path.patch r-bioc-fishpond-2.2.0+ds/debian/patches/fix-test-dataset-path.patch --- r-bioc-fishpond-2.0.1+ds/debian/patches/fix-test-dataset-path.patch 2021-12-09 20:59:01.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/patches/fix-test-dataset-path.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -Description: tximportdata does not contain the desired file as of yet, hence source the data from fishpond package itself -Author: Nilesh Patra -Forwarded: not-needed -Last-Update: 2021-12-10 ---- a/tests/testthat/test_readEDS.R -+++ b/tests/testthat/test_readEDS.R -@@ -2,7 +2,7 @@ - library(fishpond) - test_that("Reading in Alevin EDS format works", { - -- dir <- system.file("extdata/alevin/neurons_900_v014/alevin", package="tximportData") -+ dir <- system.file("extdata/alevin/example-quants/fry-usa-basic/alevin", package="fishpond") - files <- file.path(dir,"quants_mat.gz") - file.exists(files) - barcode.file <- file.path(dir, "quants_mat_rows.txt") diff -Nru r-bioc-fishpond-2.0.1+ds/debian/patches/series r-bioc-fishpond-2.2.0+ds/debian/patches/series --- r-bioc-fishpond-2.0.1+ds/debian/patches/series 2021-12-09 20:57:43.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/patches/series 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -fix-test-dataset-path.patch diff -Nru r-bioc-fishpond-2.0.1+ds/debian/tests/autopkgtest-pkg-r.conf r-bioc-fishpond-2.2.0+ds/debian/tests/autopkgtest-pkg-r.conf --- r-bioc-fishpond-2.0.1+ds/debian/tests/autopkgtest-pkg-r.conf 2021-12-10 10:28:45.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/tests/autopkgtest-pkg-r.conf 2022-05-18 02:58:07.000000000 +0000 @@ -1 +1 @@ -extra_depends=r-cran-testthat,r-cran-samr,r-bioc-deseq2 +extra_depends=r-cran-testthat,r-cran-samr,r-bioc-deseq2,r-cran-data.table,r-bioc-tximportdata diff -Nru r-bioc-fishpond-2.0.1+ds/debian/tests/autopkgtest-pkg-r.hook r-bioc-fishpond-2.2.0+ds/debian/tests/autopkgtest-pkg-r.hook --- r-bioc-fishpond-2.0.1+ds/debian/tests/autopkgtest-pkg-r.hook 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/tests/autopkgtest-pkg-r.hook 2022-05-18 02:58:07.000000000 +0000 @@ -0,0 +1,3 @@ +#!/bin/sh -e + +rm -f tests/testthat/test_alevinEC.R tests/testthat/test_salmonEC.R diff -Nru r-bioc-fishpond-2.0.1+ds/debian/tests/control r-bioc-fishpond-2.2.0+ds/debian/tests/control --- r-bioc-fishpond-2.0.1+ds/debian/tests/control 2021-12-09 20:14:01.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/tests/control 2022-05-18 02:58:07.000000000 +0000 @@ -1,3 +1,3 @@ Tests: run-unit-test -Depends: @, r-cran-testthat, r-cran-samr, r-bioc-deseq2 +Depends: @, r-cran-testthat, r-cran-samr, r-bioc-deseq2, r-cran-data.table, r-bioc-tximportdata Restrictions: allow-stderr diff -Nru r-bioc-fishpond-2.0.1+ds/debian/tests/run-unit-test r-bioc-fishpond-2.2.0+ds/debian/tests/run-unit-test --- r-bioc-fishpond-2.0.1+ds/debian/tests/run-unit-test 2021-12-09 19:43:26.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/debian/tests/run-unit-test 2022-05-18 02:58:07.000000000 +0000 @@ -9,6 +9,8 @@ cd $AUTOPKGTEST_TMP cp -a /usr/share/doc/$debname/tests $AUTOPKGTEST_TMP gunzip -r * + +rm -f tests/testthat/test_alevinEC.R tests/testthat/test_salmonEC.R cd tests for testfile in *.R; do echo "BEGIN TEST $testfile" diff -Nru r-bioc-fishpond-2.0.1+ds/DESCRIPTION r-bioc-fishpond-2.2.0+ds/DESCRIPTION --- r-bioc-fishpond-2.0.1+ds/DESCRIPTION 2021-12-02 09:28:13.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/DESCRIPTION 2022-04-26 20:42:09.000000000 +0000 @@ -1,32 +1,36 @@ Package: fishpond -Title: Fishpond: differential transcript and gene expression with - inferential replicates -Version: 2.0.1 +Title: Fishpond: downstream methods and tools for expression data +Version: 2.2.0 Authors@R: c( - person("Anzi", "Zhu", role=c("aut","ctb")), + person("Anqi", "Zhu", role=c("aut","ctb")), person("Michael", "Love", email="michaelisaiahlove@gmail.com", role=c("aut","cre")), person("Avi", "Srivastava", role=c("aut","ctb")), person("Rob", "Patro", role=c("aut","ctb")), person("Joseph", "Ibrahim", role=c("aut","ctb")), person("Hirak", "Sarkar", role="ctb"), person("Euphy", "Wu", role="ctb"), + person("Noor", "Pratap Singh", role="ctb"), person("Scott", "Van Buren", role="ctb"), person("Dongze", "He", role="ctb"), person("Steve", "Lianoglou", role="ctb"), - person("Wes", "Wilson", role="ctb") + person("Wes", "Wilson", role="ctb"), + person("Jeroen", "Gilis", role="ctb") ) Maintainer: Michael Love Description: Fishpond contains methods for differential transcript and gene expression analysis of RNA-seq data using inferential replicates for uncertainty of abundance quantification, as generated by Gibbs sampling or bootstrap sampling. Also the - package contains utilities for working with Salmon and Alevin - quantification files. + package contains a number of utilities for working with Salmon + and Alevin quantification files. Imports: graphics, stats, utils, methods, abind, gtools, qvalue, - S4Vectors, SummarizedExperiment, matrixStats, svMisc, Rcpp, - Matrix, SingleCellExperiment, jsonlite + S4Vectors, IRanges, SummarizedExperiment, GenomicRanges, + matrixStats, svMisc, Rcpp, Matrix, SingleCellExperiment, + jsonlite Suggests: testthat, knitr, rmarkdown, macrophage, tximeta, - org.Hs.eg.db, samr, DESeq2, apeglm, tximportData, limma + org.Hs.eg.db, samr, DESeq2, apeglm, tximportData, limma, + ensembldb, EnsDb.Hsapiens.v86, GenomicFeatures, AnnotationDbi, + pheatmap, Gviz, GenomeInfoDb, data.table LinkingTo: Rcpp SystemRequirements: C++11 License: GPL-2 @@ -39,20 +43,22 @@ VignetteBuilder: knitr RoxygenNote: 7.1.2 git_url: https://git.bioconductor.org/packages/fishpond -git_branch: RELEASE_3_14 -git_last_commit: 64edeb7 -git_last_commit_date: 2021-11-30 -Date/Publication: 2021-12-02 +git_branch: RELEASE_3_15 +git_last_commit: 5a790b0 +git_last_commit_date: 2022-04-26 +Date/Publication: 2022-04-26 NeedsCompilation: yes -Packaged: 2021-12-02 09:28:13 UTC; biocbuild -Author: Anzi Zhu [aut, ctb], +Packaged: 2022-04-26 20:42:09 UTC; biocbuild +Author: Anqi Zhu [aut, ctb], Michael Love [aut, cre], Avi Srivastava [aut, ctb], Rob Patro [aut, ctb], Joseph Ibrahim [aut, ctb], Hirak Sarkar [ctb], Euphy Wu [ctb], + Noor Pratap Singh [ctb], Scott Van Buren [ctb], Dongze He [ctb], Steve Lianoglou [ctb], - Wes Wilson [ctb] + Wes Wilson [ctb], + Jeroen Gilis [ctb] diff -Nru r-bioc-fishpond-2.0.1+ds/inst/CITATION r-bioc-fishpond-2.2.0+ds/inst/CITATION --- r-bioc-fishpond-2.0.1+ds/inst/CITATION 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/CITATION 2022-04-26 18:25:58.000000000 +0000 @@ -8,6 +8,7 @@ year = 2019, journal = "Nucleic Acids Research", doi = "10.1093/nar/gkz622", + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6765120", volume = 47, issue = 18, pages = "e105", diff -Nru r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.html r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.html --- r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.html 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.html 2022-04-26 20:39:53.000000000 +0000 @@ -0,0 +1,1952 @@ + + + + + + + + + + + + + + +SEESAW - Allelic expression analysis with Salmon and Swish + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + + +
+

Introduction

+

In this vignette, we describe usage of a suite of tools, SEESAW, Statistical Estimation of allelic Expression using Salmon and Swish.

+

Running SEESAW involves generation of a diploid transcriptome (e.g. using g2gtools, construction of a diploid Salmon index (specifying --keepDuplicates), followed by Salmon quantification with a number of bootstrap inferential replicates (we recommend 30 bootstrap replicates). These three steps (diploid reference preparation, indexing, quantification with bootstraps) provide the input data for the following statistical analyses in R/Bioconductor. The steps shown in this vignette leverage Bioconductor infrastructure including SummarizedExperiment for storage of input data and results, tximport for data import, and GRanges and Gviz for plotting.

+

In short the SEESAW steps are as listed, and diagrammed below:

+
    +
  1. g2gtools (diploid reference preparation)
  2. +
  3. Salmon indexing with --keepDuplicates
  4. +
  5. Salmon quantification with bootstraps
  6. +
  7. makeTx2Tss() aggregates data to TSS-level (optional)
  8. +
  9. importAllelicCounts() creates a SummarizedExperiment
  10. +
  11. Swish analysis: labelKeep() and swish() (skip scaling)
  12. +
  13. Plotting
  14. +
+

+

SEESAW allows for testing global allelic imbalance across all samples (pairwise testing within each individual), as well as differential or dynamic allelic imbalance (pairwise allelic fold changes estimated within individual, followed by testing across or along an additional covariate). Each of these allelic imbalance (AI) analyses takes into account the potentially heterogeneous amount of inferential uncertainty per sample, per feature (transcript, transcript-group, or gene), and per allele.

+

Below we demonstrate an analysis where transcripts are grouped by their transcription start site (TSS), although gene-level or transcript-level analysis is also possible. New plotting functions added to fishpond facilitate visualization of allelic and isoform changes at different resolutions, alongside gene models. In the first example, we perform global AI testing, and in the second example we perform dynamic AI testing, in both cases on simulated data associated with human genes.

+
+
+

Linking transcripts to TSS

+

We begin assuming steps 1-3 have been completed. We can use the makeTx2Tss function to generate a GRanges object t2g that connects transcripts to transcript groups.

+ +
## DataFrame with 216741 rows and 2 columns
+##                           tx_id               group_id
+##                     <character>            <character>
+## ENST00000456328 ENST00000456328  ENSG00000223972-11869
+## ENST00000450305 ENST00000450305  ENSG00000223972-12010
+## ENST00000488147 ENST00000488147  ENSG00000227232-29570
+## ENST00000619216 ENST00000619216  ENSG00000278267-17436
+## ENST00000473358 ENST00000473358  ENSG00000243485-29554
+## ...                         ...                    ...
+## ENST00000420810 ENST00000420810 ENSG00000224240-2654..
+## ENST00000456738 ENST00000456738 ENSG00000227629-2659..
+## ENST00000435945 ENST00000435945 ENSG00000237917-2663..
+## ENST00000435741 ENST00000435741 ENSG00000231514-2662..
+## ENST00000431853 ENST00000431853 ENSG00000235857-5685..
+

Alternatively for gene-level analysis, one could either prepare a t2g data.frame with tx_id and gene_id columns, or a t2g GRanges object with a column group_id that is equal to gene_id.

+
+
+

Importing allelic counts

+

Here we will use simulated data, but we can import allelic counts with the importAllelicCounts() function. It is best to read over the manual page for this function. For TSS-level analysis, the t2g GRanges generated above should be passed to the tx2gene argument. This will summarize transcript-level counts to the TSS level, and will attach rowRanges that provide the genomic locations of the grouped transcripts.

+
+
+

Filtering features that have no information

+

Because we use --keepDuplicates in the step when we build the Salmon index, there will be a number of features in which there is no information about the allelic expression in the reads. We can find these features in bootstrap data by examining when the inferential replicates are nearly identical for the two alleles, as this is how the EM will split the reads. Removing these features avoids downstream problems during differential testing. Code for this filtering follows:

+ +
+
+

Testing for allelic imbalance across samples

+

We begin by generating a simulated data object that resembles what one would obtain with importAllelicCounts(). The import function arranges the a2 (non-effect) allelic counts first, followed by the a1 (effect) allelic counts. Allelic ratios are calculated at a1/a2.

+ + +
## DataFrame with 20 rows and 2 columns
+##          allele   sample
+##        <factor> <factor>
+## s1-a2        a2  sample1
+## s2-a2        a2  sample2
+## s3-a2        a2  sample3
+## s4-a2        a2  sample4
+## s5-a2        a2  sample5
+## ...         ...      ...
+## s6-a1        a1 sample6 
+## s7-a1        a1 sample7 
+## s8-a1        a1 sample8 
+## s9-a1        a1 sample9 
+## s10-a1       a1 sample10
+ +
## [1] "a2" "a1"
+

A hidden code chunk is used to add ranges from the EnsDb to the simulated dataset. For a real dataset, the ranges would be added either by importAllelicCounts (if using tx2gene) or could be added manually for transcript- or gene-level analysis, using the rowRanges<- setter function. The ranges are only needed for the plotAllelicGene plotting function below.

+
<hidden code chunk>
+

We can already plot a heatmap of allelic ratios, before performing statistical testing. We can see in the first gene, ADSS, there appear to be two groups of transcripts with opposing allelic fold change. SEESAW makes use of pheatmap for plotting a heatmap of allelic ratios.

+ +

+

The following two functions perform a Swish analysis, comparing the allelic counts within sample, while accounting for uncertainty in the assignment of the reads. The underlying test statistic is a Wilcoxon signed-rank statistic.

+ +
+
+

Plotting results

+

We can return to the heatmap, and now add q-values, etc. For details on adding metadata to a pheatmap plot object, see ?pheatmap.

+ +

+

In order to visualize the inferential uncertainty, we can make use of plotInfReps():

+ +

+

SEESAW provides plotAllelicGene() in order to build visualization of Swish test statistics, allelic proportions, and isoform proportions, in a genomic context, making use of Gviz. The first three arguments are the SummarizedExperiment object, the name of a gene (should match gene_id column), and a TxDb or EnsDb to use for plotting the gene model at the top. The statistics and proportions are then plotted at the first position of the feature (start for + features and end for - features).

+ +

+

You can also specify the gene using symbol:

+ +

+

In the allelic proportion and isoform proportion tracks, a line is drawn through the mean proportion for a2 and a1 allele, and for the isoform proportion, across samples, at the start site for each transcript group. The line is meant only to help visualize the mean value as it may change across transcript groups, but the line has no meaning in the ranges in between features. That is, unlike continuous genomic features (methylation or accessibility), there is no meaning to the allelic proportion or isoform proportion outside of measured start sites of transcription.

+

We can further customize the plot, for example, changing the labels displayed on the gene model, and changing the labels for the alleles. An ideogram can be added with ideogram=TRUE, although this requires connecting to an external FTP site.

+

See importAllelicGene() manual page for more details.

+ +

+

We can also customize the display of the alleles in the plotInfReps() plots, by adding a new factor, while carefully noting the existing and new allele labels, to make sure the annotation is correct:

+ +
## [1] "a2" "a1"
+ +

+
+
+

Testing for dynamic allelic imbalance

+

Above, we tested for global AI, where the allelic fold change is consistent across all samples. We can also test for differential or dynamic AI, by adding specification of a cov (covariate) which can be either a two-group factor, or a continuous variable. For continuous variable, the user should specify a correlation test, either cor="pearson" or "spearman".

+ +
## DataFrame with 20 rows and 3 columns
+##          allele   sample      time
+##        <factor> <factor> <numeric>
+## s1-a2        a2  sample1      0.00
+## s2-a2        a2  sample2      0.11
+## s3-a2        a2  sample3      0.22
+## s4-a2        a2  sample4      0.33
+## s5-a2        a2  sample5      0.44
+## ...         ...      ...       ...
+## s6-a1        a1 sample6       0.56
+## s7-a1        a1 sample7       0.67
+## s8-a1        a1 sample8       0.78
+## s9-a1        a1 sample9       0.89
+## s10-a1       a1 sample10      1.00
+

Again, a hidden code chunk adds ranges to our simulation data.

+
<hidden code chunk>
+

In the following, we test for changes in allelic imbalance within sample that correlate with a covariate time.

+ +

Note the first two features have small q-values and opposite test statistic; here the test statistic is the average Pearson correlation of the allelic log fold change with the time variable, averaging over bootstrap replicates.

+ +
## DataFrame with 2 rows and 2 columns
+##                     stat    qvalue
+##                <numeric> <numeric>
+## ADSS-244452134  0.870969     0.005
+## ADSS-244419273 -0.861573     0.005
+

For plotting inferential replicates over a continuous variable, we must first compute summary statistics of inferential mean and variance:

+ +

Now we can examine the allelic counts across the time variable:

+ +

+

With a little more code, we can add a lowess line for each series:

+ +

+

Visualizing the allelic proportion in a heatmap helps to see relationships with the time variable, while also showing data from multiple features at once:

+ +

+

Finally, by binning the time covariate into a few groups, we can again draw the allelic and isoform proportions in the genomic context, now facetting across time.

+

First we create the binned covariate using cut, and rename the labels for nicer labels in our plot:

+ +
## 
+## time-1 time-2 time-3 
+##      3      4      3
+

We can then make our facetted allelic proportion plot:

+ +

+

If we also want to visualize how isoform proportions may be changing, we can add covFacetIsoform=TRUE, which additionally facets the isoform proportion plot by the covariate:

+ +

+

For further questions about the SEESAW steps, please post to one of these locations:

+ +
+
+

Session info

+ +
## R version 4.2.0 RC (2022-04-19 r82224)
+## Platform: x86_64-pc-linux-gnu (64-bit)
+## Running under: Ubuntu 20.04.4 LTS
+## 
+## Matrix products: default
+## BLAS:   /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
+## LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
+## 
+## locale:
+##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+##  [3] LC_TIME=en_GB              LC_COLLATE=C              
+##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
+##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+## 
+## attached base packages:
+## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
+## [8] base     
+## 
+## other attached packages:
+##  [1] SummarizedExperiment_1.26.0 MatrixGenerics_1.8.0       
+##  [3] matrixStats_0.62.0          fishpond_2.2.0             
+##  [5] EnsDb.Hsapiens.v86_2.99.0   ensembldb_2.20.0           
+##  [7] AnnotationFilter_1.20.0     GenomicFeatures_1.48.0     
+##  [9] AnnotationDbi_1.58.0        Biobase_2.56.0             
+## [11] GenomicRanges_1.48.0        GenomeInfoDb_1.32.0        
+## [13] IRanges_2.30.0              S4Vectors_0.34.0           
+## [15] BiocGenerics_0.42.0        
+## 
+## loaded via a namespace (and not attached):
+##   [1] colorspace_2.0-3            rjson_0.2.21               
+##   [3] ellipsis_0.3.2              htmlTable_2.4.0            
+##   [5] biovizBase_1.44.0           qvalue_2.28.0              
+##   [7] XVector_0.36.0              base64enc_0.1-3            
+##   [9] dichromat_2.0-0             rstudioapi_0.13            
+##  [11] farver_2.1.0                bit64_4.0.5                
+##  [13] fansi_1.0.3                 xml2_1.3.3                 
+##  [15] splines_4.2.0               cachem_1.0.6               
+##  [17] knitr_1.38                  Formula_1.2-4              
+##  [19] jsonlite_1.8.0              Rsamtools_2.12.0           
+##  [21] cluster_2.1.3               dbplyr_2.1.1               
+##  [23] png_0.1-7                   pheatmap_1.0.12            
+##  [25] compiler_4.2.0              httr_1.4.2                 
+##  [27] backports_1.4.1             assertthat_0.2.1           
+##  [29] Matrix_1.4-1                fastmap_1.1.0              
+##  [31] lazyeval_0.2.2              cli_3.3.0                  
+##  [33] htmltools_0.5.2             prettyunits_1.1.1          
+##  [35] tools_4.2.0                 gtable_0.3.0               
+##  [37] glue_1.6.2                  GenomeInfoDbData_1.2.8     
+##  [39] reshape2_1.4.4              dplyr_1.0.8                
+##  [41] rappdirs_0.3.3              Rcpp_1.0.8.3               
+##  [43] jquerylib_0.1.4             vctrs_0.4.1                
+##  [45] Biostrings_2.64.0           rtracklayer_1.56.0         
+##  [47] xfun_0.30                   stringr_1.4.0              
+##  [49] lifecycle_1.0.1             restfulr_0.0.13            
+##  [51] gtools_3.9.2                XML_3.99-0.9               
+##  [53] zlibbioc_1.42.0             scales_1.2.0               
+##  [55] BSgenome_1.64.0             VariantAnnotation_1.42.0   
+##  [57] hms_1.1.1                   ProtGenerics_1.28.0        
+##  [59] parallel_4.2.0              RColorBrewer_1.1-3         
+##  [61] SingleCellExperiment_1.18.0 yaml_2.3.5                 
+##  [63] curl_4.3.2                  gridExtra_2.3              
+##  [65] memoise_2.0.1               ggplot2_3.3.5              
+##  [67] sass_0.4.1                  rpart_4.1.16               
+##  [69] biomaRt_2.52.0              latticeExtra_0.6-29        
+##  [71] stringi_1.7.6               RSQLite_2.2.12             
+##  [73] highr_0.9                   BiocIO_1.6.0               
+##  [75] checkmate_2.1.0             filelock_1.0.2             
+##  [77] BiocParallel_1.30.0         rlang_1.0.2                
+##  [79] pkgconfig_2.0.3             bitops_1.0-7               
+##  [81] evaluate_0.15               lattice_0.20-45            
+##  [83] purrr_0.3.4                 htmlwidgets_1.5.4          
+##  [85] GenomicAlignments_1.32.0    bit_4.0.4                  
+##  [87] tidyselect_1.1.2            plyr_1.8.7                 
+##  [89] magrittr_2.0.3              R6_2.5.1                   
+##  [91] generics_0.1.2              Hmisc_4.7-0                
+##  [93] DelayedArray_0.22.0         DBI_1.1.2                  
+##  [95] pillar_1.7.0                foreign_0.8-82             
+##  [97] svMisc_1.2.3                survival_3.3-1             
+##  [99] KEGGREST_1.36.0             abind_1.4-5                
+## [101] RCurl_1.98-1.6              nnet_7.3-17                
+## [103] tibble_3.1.6                crayon_1.5.1               
+## [105] utf8_1.2.2                  BiocFileCache_2.4.0        
+## [107] rmarkdown_2.14              jpeg_0.1-9                 
+## [109] progress_1.2.2              grid_4.2.0                 
+## [111] data.table_1.14.2           blob_1.2.3                 
+## [113] digest_0.6.29               munsell_0.5.0              
+## [115] Gviz_1.40.0                 bslib_0.3.1
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff -Nru r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.R r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.R --- r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.R 2022-04-26 20:39:52.000000000 +0000 @@ -0,0 +1,152 @@ +## ----setup, echo=FALSE, results="hide"---------------------------------------- +knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, dev="png", + message=FALSE, error=FALSE, warning=FALSE) + +## ----echo=FALSE--------------------------------------------------------------- +knitr::include_graphics("images/SEESAW.png") + +## ----------------------------------------------------------------------------- +suppressPackageStartupMessages(library(ensembldb)) +library(EnsDb.Hsapiens.v86) +library(fishpond) +edb <- EnsDb.Hsapiens.v86 +t2g <- makeTx2Tss(edb) # GRanges object +mcols(t2g)[,c("tx_id","group_id")] + +## ----eval=FALSE--------------------------------------------------------------- +# n <- ncol(y)/2 +# rep1a1 <- assay(y, "infRep1")[,y$allele == "a1"] +# rep1a2 <- assay(y, "infRep1")[,y$allele == "a2"] +# mcols(y)$someInfo <- rowSums(abs(rep1a1 - rep1a2) < 1) < n +# y <- y[ mcols(y)$someInfo, ] + +## ----------------------------------------------------------------------------- +suppressPackageStartupMessages(library(SummarizedExperiment)) + +## ----------------------------------------------------------------------------- +set.seed(1) +y <- makeSimSwishData(allelic=TRUE) +colData(y) +levels(y$allele) # a1/a2 allelic fold changes + +## ----echo=FALSE--------------------------------------------------------------- +# hidden chunk to add ranges to the `se` +tss <- t2g[!duplicated(t2g$group_id)] +tss$symbol <- mapIds(edb, tss$gene_id, "SYMBOL", "GENEID") +names(tss) <- paste0(tss$symbol, "-", tss$tss) +mcols(tss) <- mcols(tss)[,c("tx_id","gene_id","tss","group_id","symbol")] +# slow... +#tx_id <- CharacterList(split(t2g$tx_id,t2g$group_id)) +#tss$tx_id <- tx_id[names(tss)] +tab <- table(tss$gene_id) +tss$ntss <- as.numeric(tab[tss$gene_id]) +tss <- tss[tss$ntss > 1 & tss$ntss < 5 & seqnames(tss) == "1"] +tss <- tss[order(tss$gene_id),] +tss <- tss[43:1042] +# swap 2nd and 3rd isoform of first gene +tss[2:3] <- tss[3:2] +rowRanges(y) <- tss + +## ----fig.dim=c(7,3.5)--------------------------------------------------------- +y <- computeInfRV(y) # for posterior mean, variance +gene <- rowRanges(y)$gene_id[1] +idx <- mcols(y)$gene_id == gene +plotAllelicHeatmap(y, idx=idx) + +## ----------------------------------------------------------------------------- +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample") + +## ----fig.dim=c(8,4)----------------------------------------------------------- +dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +plotAllelicHeatmap(y, idx=idx, annotation_row=dat) + +## ----fig.dim=c(5,5)----------------------------------------------------------- +par(mfrow=c(2,1), mar=c(1,4.1,2,2)) +plotInfReps(y, idx=1, x="allele", cov="sample", xaxis=FALSE, xlab="") +plotInfReps(y, idx=2, x="allele", cov="sample", xaxis=FALSE, xlab="") + +## ----fig.dim=c(8,7)----------------------------------------------------------- +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb) + +## ----fig.dim=c(8,7)----------------------------------------------------------- +plotAllelicGene(y, symbol="ADSS", db=edb) + +## ----fig.dim=c(8,7)----------------------------------------------------------- +plotAllelicGene(y, gene, edb, + transcriptAnnotation="transcript", + labels=list(a2="maternal",a1="paternal")) + +## ----fig.dim=c(5,4)----------------------------------------------------------- +y$allele_new <- y$allele +# note a2 is non-effect, a1 is effect: +levels(y$allele) +# replace a2 then a1: +levels(y$allele_new) <- c("maternal","paternal") +plotInfReps(y, idx=1, x="allele_new", + legend=TRUE, legendPos="bottom") + +## ----------------------------------------------------------------------------- +set.seed(1) +y <- makeSimSwishData(dynamic=TRUE) +colData(y) + +## ----echo=FALSE--------------------------------------------------------------- +rowRanges(y) <- tss + +## ----------------------------------------------------------------------------- +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample", cov="time", cor="pearson") + +## ----------------------------------------------------------------------------- +mcols(y)[1:2,c("stat","qvalue")] + +## ----------------------------------------------------------------------------- +y <- computeInfRV(y) + +## ----fig.dim=c(7,7)----------------------------------------------------------- +par(mfrow=c(2,1), mar=c(2.5,4,2,2)) +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01, xaxis=FALSE, xlab="", main="") +par(mar=c(4.5,4,0,2)) +plotInfReps(y, idx=2, x="time", cov="allele", shiftX=.01, main="") + +## ----fig.dim=c(7,5)----------------------------------------------------------- +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01) +dat <- data.frame( + time = y$time[1:10], + a2 = assay(y, "mean")[1,y$allele=="a2"], + a1 = assay(y, "mean")[1,y$allele=="a1"]) +lines(lowess(dat[,c(1,2)]), col="dodgerblue") +lines(lowess(dat[,c(1,3)]), col="goldenrod4") + +## ----fig.dim=c(8,4)----------------------------------------------------------- +idx <- c(1:4) +row_dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +col_dat <- data.frame(time=y$time[1:10], + row.names=paste0("s",1:10)) +plotAllelicHeatmap(y, idx=idx, + annotation_row=row_dat, + annotation_col=col_dat) + +## ----------------------------------------------------------------------------- +y$time_bins <- cut(y$time,breaks=c(0,.25,.75,1), + include.lowest=TRUE, labels=FALSE) +y$time_bins <- paste0("time-",y$time_bins) +table(y$time_bins[ y$allele == "a2" ]) + +## ----fig.dim=c(8,7)----------------------------------------------------------- +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb, cov="time_bins", + qvalue=FALSE, log2FC=FALSE) + +## ----fig.dim=c(8,7)----------------------------------------------------------- +plotAllelicGene(y, gene, edb, cov="time_bins", + covFacetIsoform=TRUE, + qvalue=FALSE, log2FC=FALSE) + +## ----------------------------------------------------------------------------- +sessionInfo() + diff -Nru r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.Rmd r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.Rmd --- r-bioc-fishpond-2.0.1+ds/inst/doc/allelic.Rmd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/doc/allelic.Rmd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,401 @@ +--- +title: "SEESAW - Allelic expression analysis with Salmon and Swish" +date: "`r format(Sys.Date(), '%m/%d/%Y')`" +output: + rmarkdown::html_document: + highlight: tango + toc: true + toc_float: true +bibliography: library.bib +vignette: | + %\VignetteIndexEntry{2. SEESAW - Allelic expression analysis with Salmon and Swish} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r setup, echo=FALSE, results="hide"} +knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, dev="png", + message=FALSE, error=FALSE, warning=FALSE) +``` + +# Introduction + +In this vignette, we describe usage of a suite of tools, **SEESAW**, +Statistical Estimation of allelic Expression using Salmon and +Swish. + +Running SEESAW involves generation of a diploid transcriptome +(e.g. using [g2gtools](http://churchill-lab.github.io/g2gtools/), +construction of a diploid Salmon index (specifying +`--keepDuplicates`), followed by Salmon quantification with a number of +[bootstrap inferential replicates](https://salmon.readthedocs.io/en/latest/salmon.html#numbootstraps) +(we recommend 30 bootstrap replicates). +These three steps (diploid reference preparation, indexing, quantification +with bootstraps) provide the input data for the following statistical +analyses in R/Bioconductor. The steps shown in this vignette leverage +Bioconductor infrastructure including *SummarizedExperiment* for +storage of input data and results, *tximport* for data import, and +*GRanges* and *Gviz* for plotting. + +In short the SEESAW steps are as listed, and diagrammed below: + +1. g2gtools (diploid reference preparation) +2. Salmon indexing with `--keepDuplicates` +3. Salmon quantification with bootstraps +4. `makeTx2Tss()` aggregates data to TSS-level (optional) +5. `importAllelicCounts()` creates a *SummarizedExperiment* +6. Swish analysis: `labelKeep()` and `swish()` (skip scaling) +7. Plotting + +```{r echo=FALSE} +knitr::include_graphics("images/SEESAW.png") +``` + +SEESAW allows for testing *global allelic imbalance* across all +samples (pairwise testing within each individual), as well as +*differential or dynamic allelic imbalance* (pairwise allelic fold +changes estimated within individual, followed by testing across or +along an additional covariate). Each of these allelic imbalance (AI) +analyses takes into account the potentially heterogeneous amount of +inferential uncertainty per sample, per feature (transcript, +transcript-group, or gene), and per allele. + +Below we demonstrate an analysis where transcripts are grouped by their +transcription start site (TSS), although gene-level or +transcript-level analysis is also possible. New plotting functions +added to *fishpond* facilitate visualization of allelic and isoform +changes at different resolutions, alongside gene models. In the first +example, we perform global AI testing, and in the second example we +perform dynamic AI testing, in both cases on simulated data associated +with human genes. + +# Linking transcripts to TSS + +We begin assuming steps 1-3 have been completed. We can use the +`makeTx2Tss` function to generate a *GRanges* object `t2g` that +connects transcripts to transcript groups. + + +```{r} +suppressPackageStartupMessages(library(ensembldb)) +library(EnsDb.Hsapiens.v86) +library(fishpond) +edb <- EnsDb.Hsapiens.v86 +t2g <- makeTx2Tss(edb) # GRanges object +mcols(t2g)[,c("tx_id","group_id")] +``` + +Alternatively for gene-level analysis, one could either prepare a +`t2g` data.frame with `tx_id` and `gene_id` columns, or a `t2g` +*GRanges* object with a column `group_id` that is equal to `gene_id`. + +# Importing allelic counts + +Here we will use simulated data, but we can import allelic counts with +the `importAllelicCounts()` function. It is best to read over the +manual page for this function. For TSS-level analysis, the `t2g` +*GRanges* generated above should be passed to the `tx2gene` +argument. This will summarize transcript-level counts to the TSS +level, and will attach `rowRanges` that provide the genomic locations +of the grouped transcripts. + +# Filtering features that have no information + +Because we use `--keepDuplicates` in the step when we build the Salmon +index, there will be a number of features in which there is no +information about the allelic expression in the reads. We can find +these features in bootstrap data by examining when the inferential +replicates are nearly identical for the two alleles, as this is how +the EM will split the reads. Removing these features avoids downstream +problems during differential testing. Code for this filtering follows: + +```{r eval=FALSE} +n <- ncol(y)/2 +rep1a1 <- assay(y, "infRep1")[,y$allele == "a1"] +rep1a2 <- assay(y, "infRep1")[,y$allele == "a2"] +mcols(y)$someInfo <- rowSums(abs(rep1a1 - rep1a2) < 1) < n +y <- y[ mcols(y)$someInfo, ] +``` + +# Testing for allelic imbalance across samples + +We begin by generating a simulated data object that resembles what one +would obtain with `importAllelicCounts()`. The import function +arranges the `a2` (non-effect) allelic counts first, followed by the +`a1` (effect) allelic counts. Allelic ratios are calculated at +`a1/a2`. + +```{r} +suppressPackageStartupMessages(library(SummarizedExperiment)) +``` + +```{r} +set.seed(1) +y <- makeSimSwishData(allelic=TRUE) +colData(y) +levels(y$allele) # a1/a2 allelic fold changes +``` + +A hidden code chunk is used to add ranges from the *EnsDb* to the +simulated dataset. For a real dataset, the ranges would be added +either by `importAllelicCounts` (if using `tx2gene`) or could be added +manually for transcript- or gene-level analysis, using the +`rowRanges<-` setter function. The ranges are only needed for the +`plotAllelicGene` plotting function below. + +``` + +``` + +```{r echo=FALSE} +# hidden chunk to add ranges to the `se` +tss <- t2g[!duplicated(t2g$group_id)] +tss$symbol <- mapIds(edb, tss$gene_id, "SYMBOL", "GENEID") +names(tss) <- paste0(tss$symbol, "-", tss$tss) +mcols(tss) <- mcols(tss)[,c("tx_id","gene_id","tss","group_id","symbol")] +# slow... +#tx_id <- CharacterList(split(t2g$tx_id,t2g$group_id)) +#tss$tx_id <- tx_id[names(tss)] +tab <- table(tss$gene_id) +tss$ntss <- as.numeric(tab[tss$gene_id]) +tss <- tss[tss$ntss > 1 & tss$ntss < 5 & seqnames(tss) == "1"] +tss <- tss[order(tss$gene_id),] +tss <- tss[43:1042] +# swap 2nd and 3rd isoform of first gene +tss[2:3] <- tss[3:2] +rowRanges(y) <- tss +``` + +We can already plot a heatmap of allelic ratios, before performing +statistical testing. We can see in the first gene, *ADSS*, there +appear to be two groups of transcripts with opposing allelic fold +change. SEESAW makes use of *pheatmap* for plotting a heatmap of +allelic ratios. + +```{r fig.dim=c(7,3.5)} +y <- computeInfRV(y) # for posterior mean, variance +gene <- rowRanges(y)$gene_id[1] +idx <- mcols(y)$gene_id == gene +plotAllelicHeatmap(y, idx=idx) +``` + +The following two functions perform a Swish analysis, comparing the +allelic counts within sample, while accounting for uncertainty in the +assignment of the reads. The underlying test statistic is a Wilcoxon +signed-rank statistic. + +```{r} +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample") +``` + +# Plotting results + +We can return to the heatmap, and now add q-values, etc. For details +on adding metadata to a *pheatmap* plot object, see `?pheatmap`. + +```{r fig.dim=c(8,4)} +dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +plotAllelicHeatmap(y, idx=idx, annotation_row=dat) +``` + +In order to visualize the inferential uncertainty, we can make use of +`plotInfReps()`: + +```{r fig.dim=c(5,5)} +par(mfrow=c(2,1), mar=c(1,4.1,2,2)) +plotInfReps(y, idx=1, x="allele", cov="sample", xaxis=FALSE, xlab="") +plotInfReps(y, idx=2, x="allele", cov="sample", xaxis=FALSE, xlab="") +``` + +SEESAW provides `plotAllelicGene()` in order to build visualization of +Swish test statistics, allelic proportions, and isoform proportions, +in a genomic context, making use of *Gviz*. The first three arguments +are the *SummarizedExperiment* object, the name of a gene (should +match `gene_id` column), and a *TxDb* or *EnsDb* to use for plotting +the gene model at the top. The statistics and proportions are then +plotted at the first position of the feature (`start` for `+` features +and `end` for `-` features). + +```{r fig.dim=c(8,7)} +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb) +``` + +You can also specify the gene using `symbol`: + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, symbol="ADSS", db=edb) +``` + +In the allelic proportion and isoform proportion tracks, a line is +drawn through the mean proportion for a2 and a1 allele, and for the +isoform proportion, across samples, at the start site for each +transcript group. The line is meant only to help visualize the mean +value as it may change across transcript groups, but the line has no +meaning in the ranges in between features. That is, unlike continuous +genomic features (methylation or accessibility), there is no meaning +to the allelic proportion or isoform proportion outside of measured +start sites of transcription. + +We can further customize the plot, for example, +changing the labels displayed on the gene model, and changing the +labels for the alleles. An ideogram can be added with `ideogram=TRUE`, +although this requires connecting to an external FTP site. + +See `importAllelicGene()` manual page for more details. + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, gene, edb, + transcriptAnnotation="transcript", + labels=list(a2="maternal",a1="paternal")) +``` + +We can also customize the display of the alleles in the +`plotInfReps()` plots, by adding a new factor, while carefully noting +the existing and new allele labels, to make sure the annotation is +correct: + +```{r fig.dim=c(5,4)} +y$allele_new <- y$allele +# note a2 is non-effect, a1 is effect: +levels(y$allele) +# replace a2 then a1: +levels(y$allele_new) <- c("maternal","paternal") +plotInfReps(y, idx=1, x="allele_new", + legend=TRUE, legendPos="bottom") +``` + +# Testing for dynamic allelic imbalance + +Above, we tested for global AI, where the allelic fold change is +consistent across all samples. We can also test for differential or +dynamic AI, by adding specification of a `cov` (covariate) which can +be either a two-group factor, or a continuous variable. For continuous +variable, the user should specify a correlation test, either +`cor="pearson"` or `"spearman"`. + +```{r} +set.seed(1) +y <- makeSimSwishData(dynamic=TRUE) +colData(y) +``` + +Again, a hidden code chunk adds ranges to our simulation data. + +``` + +``` + +```{r echo=FALSE} +rowRanges(y) <- tss +``` + +In the following, we test for changes in allelic imbalance within +sample that correlate with a covariate `time`. + +```{r} +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample", cov="time", cor="pearson") +``` + +Note the first two features have small q-values and opposite test +statistic; here the test statistic is the average Pearson correlation +of the allelic log fold change with the `time` variable, averaging +over bootstrap replicates. + +```{r} +mcols(y)[1:2,c("stat","qvalue")] +``` + +For plotting inferential replicates over a continuous variable, we +must first compute summary statistics of inferential mean and +variance: + +```{r} +y <- computeInfRV(y) +``` + +Now we can examine the allelic counts across the `time` variable: + +```{r fig.dim=c(7,7)} +par(mfrow=c(2,1), mar=c(2.5,4,2,2)) +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01, xaxis=FALSE, xlab="", main="") +par(mar=c(4.5,4,0,2)) +plotInfReps(y, idx=2, x="time", cov="allele", shiftX=.01, main="") +``` + +With a little more code, we can add a `lowess` line for each series: + +```{r fig.dim=c(7,5)} +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01) +dat <- data.frame( + time = y$time[1:10], + a2 = assay(y, "mean")[1,y$allele=="a2"], + a1 = assay(y, "mean")[1,y$allele=="a1"]) +lines(lowess(dat[,c(1,2)]), col="dodgerblue") +lines(lowess(dat[,c(1,3)]), col="goldenrod4") +``` + +Visualizing the allelic proportion in a heatmap helps to see +relationships with the `time` variable, while also showing data from +multiple features at once: + +```{r fig.dim=c(8,4)} +idx <- c(1:4) +row_dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +col_dat <- data.frame(time=y$time[1:10], + row.names=paste0("s",1:10)) +plotAllelicHeatmap(y, idx=idx, + annotation_row=row_dat, + annotation_col=col_dat) +``` + +Finally, by binning the `time` covariate into a few groups, we can +again draw the allelic and isoform proportions in the genomic context, +now facetting across time. + +First we create the binned covariate using `cut`, and rename the +labels for nicer labels in our plot: + +```{r} +y$time_bins <- cut(y$time,breaks=c(0,.25,.75,1), + include.lowest=TRUE, labels=FALSE) +y$time_bins <- paste0("time-",y$time_bins) +table(y$time_bins[ y$allele == "a2" ]) +``` + +We can then make our facetted allelic proportion plot: + +```{r fig.dim=c(8,7)} +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb, cov="time_bins", + qvalue=FALSE, log2FC=FALSE) +``` + +If we also want to visualize how isoform proportions may be changing, +we can add `covFacetIsoform=TRUE`, which additionally facets the +isoform proportion plot by the covariate: + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, gene, edb, cov="time_bins", + covFacetIsoform=TRUE, + qvalue=FALSE, log2FC=FALSE) +``` + +For further questions about the SEESAW steps, please post to one of +these locations: + +* Bioconductor support site and use + the tag `fishpond` or `swish` +* GitHub Issue + +# Session info + +```{r} +sessionInfo() +``` diff -Nru r-bioc-fishpond-2.0.1+ds/inst/doc/swish.R r-bioc-fishpond-2.2.0+ds/inst/doc/swish.R --- r-bioc-fishpond-2.0.1+ds/inst/doc/swish.R 2021-12-02 09:28:11.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/doc/swish.R 2022-04-26 20:42:07.000000000 +0000 @@ -6,10 +6,10 @@ # # 'coldata.csv': sample information table # coldata <- read.csv("coldata.csv") # library(tximeta) -# y <- tximeta(coldata) # reads in counts -# library(swish) +# y <- tximeta(coldata) # reads in counts and inf reps +# library(fishpond) # y <- scaleInfReps(y) # scales counts -# y <- labelKeep(y) # labels genes to keep +# y <- labelKeep(y) # labels features to keep # set.seed(1) # y <- swish(y, x="condition") # simplest Swish case @@ -77,7 +77,7 @@ ## ----------------------------------------------------------------------------- y <- se -y <- y[seqnames(y) == "chr1",] +y <- y[seqnames(y) == "chr4",] ## ----------------------------------------------------------------------------- y <- y[,y$condition %in% c("naive","IFNg")] @@ -89,7 +89,7 @@ y <- labelKeep(y) y <- y[mcols(y)$keep,] set.seed(1) -y <- swish(y, x="condition", pair="line", nperms=64) +y <- swish(y, x="condition", pair="line") ## ----------------------------------------------------------------------------- table(mcols(y)$qvalue < .05) @@ -141,7 +141,7 @@ ## ----------------------------------------------------------------------------- gse <- summarizeToGene(se) gy <- gse -gy <- gy[seqnames(gy) == "chr1",] +gy <- gy[seqnames(gy) == "chr4",] ## ----------------------------------------------------------------------------- gy <- gy[,gy$condition %in% c("naive","IFNg")] @@ -152,7 +152,7 @@ gy <- labelKeep(gy) gy <- gy[mcols(gy)$keep,] set.seed(1) -gy <- swish(gy, x="condition", pair="line", nperms=64) +gy <- swish(gy, x="condition", pair="line") ## ----------------------------------------------------------------------------- table(mcols(gy)$qvalue < .05) @@ -195,8 +195,7 @@ ## ----------------------------------------------------------------------------- # run on the transcript-level dataset iso <- isoformProportions(y) -# `nperms=64` here is atypical, usually not specified, see note above -iso <- swish(iso, x="condition", pair="line", nperms=64) +iso <- swish(iso, x="condition", pair="line") ## ----eval=FALSE, echo=FALSE--------------------------------------------------- # # some unevaluated code for looking into DTE within non-DGE gene @@ -261,28 +260,28 @@ plotInfReps(y2, idx[1], x="ifng", cov="salmonella") plotInfReps(y2, idx[2], x="ifng", cov="salmonella") -## ----------------------------------------------------------------------------- -dir <- system.file("extdata", package="tximportData") -files <- file.path(dir,"alevin/neurons_900_v014/alevin/quants_mat.gz") -neurons <- tximeta(files, type="alevin", - skipMeta=TRUE, # just for vignette - dropInfReps=TRUE, - alevinArgs=list(filterBarcodes=TRUE)) - -## ----------------------------------------------------------------------------- -library(SingleCellExperiment) -sce <- as(neurons, "SingleCellExperiment") -sce$cluster <- factor(paste0("cl",sample(1:6,ncol(sce),TRUE))) -par(mfrow=c(2,1), mar=c(2,4,2,1)) -plotInfReps(sce, "ENSMUSG00000072235.6", x="cluster", - legend=TRUE) -plotInfReps(sce, "ENSMUSG00000072235.6", x="cluster", - reorder=FALSE) - -## ----echo=FALSE--------------------------------------------------------------- -par(mfrow=c(2,1), mar=c(2,4,2,1)) -plotInfReps(sce[,1:50], "ENSMUSG00000072235.6", x="cluster") -plotInfReps(sce[,1:150], "ENSMUSG00000072235.6", x="cluster") +## ----eval=FALSE--------------------------------------------------------------- +# dir <- system.file("extdata", package="tximportData") +# files <- file.path(dir,"alevin/neurons_900_v014/alevin/quants_mat.gz") +# neurons <- tximeta(files, type="alevin", +# skipMeta=TRUE, # just for vignette +# dropInfReps=TRUE, +# alevinArgs=list(filterBarcodes=TRUE)) + +## ----eval=FALSE--------------------------------------------------------------- +# library(SingleCellExperiment) +# sce <- as(neurons, "SingleCellExperiment") +# sce$cluster <- factor(paste0("cl",sample(1:6,ncol(sce),TRUE))) +# par(mfrow=c(2,1), mar=c(2,4,2,1)) +# plotInfReps(sce, "ENSMUSG00000072235.6", x="cluster", +# legend=TRUE) +# plotInfReps(sce, "ENSMUSG00000072235.6", x="cluster", +# reorder=FALSE) + +## ----echo=FALSE, eval=FALSE--------------------------------------------------- +# par(mfrow=c(2,1), mar=c(2,4,2,1)) +# plotInfReps(sce[,1:50], "ENSMUSG00000072235.6", x="cluster") +# plotInfReps(sce[,1:150], "ENSMUSG00000072235.6", x="cluster") ## ----eval=FALSE--------------------------------------------------------------- # y <- labelKeep(y, minCount=3, minN=10) @@ -363,7 +362,7 @@ ## ----------------------------------------------------------------------------- y3 <- se -y3 <- y3[seqnames(y3) == "chr1",] +y3 <- y3[seqnames(y3) == "chr4",] y3 <- y3[,y3$condition %in% c("naive","IFNg")] y3 <- labelKeep(y3) y3 <- y3[mcols(y3)$keep,] diff -Nru r-bioc-fishpond-2.0.1+ds/inst/doc/swish.Rmd r-bioc-fishpond-2.2.0+ds/inst/doc/swish.Rmd --- r-bioc-fishpond-2.0.1+ds/inst/doc/swish.Rmd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/doc/swish.Rmd 2022-04-26 18:25:58.000000000 +0000 @@ -1,7 +1,6 @@ --- -title: "DTE and DGE with inferential replicates" +title: "Swish: differential expression accounting for inferential uncertainty" date: "`r format(Sys.Date(), '%m/%d/%Y')`" -author: "Anqi Zhu, Avi Srivastava, Joseph Ibrahim, Rob Patro, Michael Love" output: rmarkdown::html_document: highlight: tango @@ -9,7 +8,7 @@ toc_float: true bibliography: library.bib vignette: | - %\VignetteIndexEntry{DTE and DGE with inferential replicates} + %\VignetteIndexEntry{1. Swish: DE analysis accounting for inferential uncertainty} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -41,9 +40,6 @@ interaction test for matched samples, to see if a condition effect changes in magnitude across two groups of samples. -**Acknowledgments:** We have benefited in the development of *Swish* -from the feedback of Hirak Sarkar and Scott Van Buren. - # Quick start The following lines of code will perform a basic transcript-level @@ -55,18 +51,19 @@ # 'coldata.csv': sample information table coldata <- read.csv("coldata.csv") library(tximeta) -y <- tximeta(coldata) # reads in counts -library(swish) +y <- tximeta(coldata) # reads in counts and inf reps +library(fishpond) y <- scaleInfReps(y) # scales counts -y <- labelKeep(y) # labels genes to keep +y <- labelKeep(y) # labels features to keep set.seed(1) y <- swish(y, x="condition") # simplest Swish case ``` **Note:** Inferential replicates, either from Gibbs sampling or -bootstrapping of reads, are required for the *swish* method. You can -set `--numGibbsSamples 20` when running Salmon to generate Gibbs -samples. +bootstrapping of reads, are required for the *swish* method. +When running Salmon, you can set `--numGibbsSamples 30` to generate +Gibbs samples, or `--numBootstraps 30` to generate bootstrap samples +(we typically recommend 20-30 inferential replicates). The results can be found in `mcols(y)`. For example, one can calculate the number of genes passing a 5% FDR threshold: @@ -226,11 +223,11 @@ We will rename our *SummarizedExperiment* `y` for the statistical analysis. For speed of the vignette, we subset to the transcripts on -chromosome 1. +chromosome 4. ```{r} y <- se -y <- y[seqnames(y) == "chr1",] +y <- y[seqnames(y) == "chr4",] ``` To demonstrate a two group comparison, we subset to the "naive" and @@ -247,7 +244,7 @@ that log2 fold changes are reported as the non-reference level over the reference level. By default, R will choose a *reference level* for factors based on alphabetical order, unless `levels` are -explicitly set. It is recommended to set the factors levels, as in +explicitly set. It is recommended to set the factors levels, as in the above code chunk, with the reference level coming first in the character vector, so that log2 fold changes correspond to the comparison of interest. @@ -263,23 +260,23 @@ permutations, to obtain identical results, one needs to set a random seed before running `swish()`, as we do below. -The default number of permutations in `swish` is -`nperms=100`. However, for paired datasets as this one, you may have -fewer maximum permutations. In this case, there are 64 possible ways -to switch the condition labels for six pairs of samples. We can set -the `nperms` manually (or if we had just used the default value, -`swish` would set `nperms` to the maximum value possible and notify -the user that it had done so). - ```{r results="hide", message=FALSE} library(fishpond) y <- scaleInfReps(y) y <- labelKeep(y) y <- y[mcols(y)$keep,] set.seed(1) -y <- swish(y, x="condition", pair="line", nperms=64) +y <- swish(y, x="condition", pair="line") ``` +The default number of permutations for computing p-values is +`nperms=100`. However, for paired datasets as this one, you may have +fewer maximum permutations. In this case, there are 64 possible ways +to switch the condition labels for six pairs of samples. We can set +the `nperms` manually to 64, or if we had just used the default value, +`swish` would set `nperms` to the maximum value possible and notify +the user that it had done so. + A note about `labelKeep`: by default we keep features with `minN=3` samples with a minimal count of 10. For scRNA-seq data with de-duplicated UMI counts, we recommend to lower the count, e.g. a @@ -410,12 +407,12 @@ We can also run swish at the gene level. First we summarize all of the data to the gene level, using the `summarizeToGene` function from *tximeta*. Again, we rename the object for statistical analysis, and -then we subset to the genes on chromosome 1 for the demonstration. +then we subset to the genes on chromosome 4 for the demonstration. ```{r} gse <- summarizeToGene(se) gy <- gse -gy <- gy[seqnames(gy) == "chr1",] +gy <- gy[seqnames(gy) == "chr4",] ``` Two demonstrate a two group comparison, we subset to the "naive" and @@ -434,7 +431,7 @@ gy <- labelKeep(gy) gy <- gy[mcols(gy)$keep,] set.seed(1) -gy <- swish(gy, x="condition", pair="line", nperms=64) +gy <- swish(gy, x="condition", pair="line") ``` As before, the number of genes in a 1% FDR set: @@ -525,8 +522,7 @@ ```{r} # run on the transcript-level dataset iso <- isoformProportions(y) -# `nperms=64` here is atypical, usually not specified, see note above -iso <- swish(iso, x="condition", pair="line", nperms=64) +iso <- swish(iso, x="condition", pair="line") ``` ```{r eval=FALSE, echo=FALSE} @@ -681,7 +677,7 @@ plotInfReps(y2, idx[2], x="ifng", cov="salmonella") ``` -# Correlation with continuous variables +# Correlation test *Swish* now has methods to compute correlations (`"spearman"` or `"pearson"`) of a continuous variable `x` with the @@ -737,7 +733,7 @@ `tximeta` to add the gene range information and other metadata if working with human, mouse, or fruit fly. -```{r} +```{r eval=FALSE} dir <- system.file("extdata", package="tximportData") files <- file.path(dir,"alevin/neurons_900_v014/alevin/quants_mat.gz") neurons <- tximeta(files, type="alevin", @@ -753,7 +749,7 @@ [Orchestrating Single-Cell Analysis with Bioconductor](https://osca.bioconductor.org/) [@Amezquita2020]. -```{r} +```{r eval=FALSE} library(SingleCellExperiment) sce <- as(neurons, "SingleCellExperiment") sce$cluster <- factor(paste0("cl",sample(1:6,ncol(sce),TRUE))) @@ -764,8 +760,8 @@ reorder=FALSE) ``` -`plotInfReps` has a number of options and convenience arguments. One -can: +`plotInfReps` has a number of options and convenience arguments for +visualizing single cell data. One can: * add a `legend`, * `reorder` the cells by expression value within cluster, @@ -784,7 +780,7 @@ example with $n \in [1,150)$ or $[150,400)$, more visual elements per cell will be included: -```{r echo=FALSE} +```{r echo=FALSE, eval=FALSE} par(mfrow=c(2,1), mar=c(2,4,2,1)) plotInfReps(sce[,1:50], "ENSMUSG00000072235.6", x="cluster") plotInfReps(sce[,1:150], "ENSMUSG00000072235.6", x="cluster") @@ -897,21 +893,25 @@ ## Analysis types supported by Swish -There are currently five types of analysis supported by `swish`: +There are currently seven types of analysis supported by `swish`: * Two group analysis * Two groups with two or more batches * Two group paired or matched samples * Two condition x two group paired samples, interaction test * Two condition x two group samples, not paired, interaction test +* Correlation test of expression with continuous x +* Correlation test of LFC for paired samples with continuous covariate This vignette demonstrated the third in this list, but the others can be run by either not specifying any additional covariates, or by specifying a batch variable with the argument `cov` instead of `pair`. The two interaction tests can be run by specifying `interaction=TRUE` -and providing `x`, `cov`, and optionally `pair`. +and providing `x`, `cov`, and optionally `pair`. The last two tests +can be run using the `cor` argument +(see [Correlation test](#correlation-test) section in this vignette). -## Accounting for estimated / continuous batch effects +## Accounting for continuous variables We have two recommended approaches for using `swish` in combination with estimated batch effects, e.g. factors of unwanted variation @@ -936,7 +936,7 @@ matrices, and these contain both sampling and additional inferential variance, we are able to track how scaling the counts affects the variance, and this informs the test statistic. - + For the second approach, one can directly scale the inferential counts using *limma* and `removeBatchEffect`. @@ -1003,6 +1003,10 @@ y <- swish(y, x="condition") ``` +While we have assessed this approach with a small number of +estimated nuisance covariates, note that with many covariates this +approach may lead to inflation of test statistics. + ## Structure of tximeta output While `tximeta` is the safest way to provide the correct input to @@ -1063,7 +1067,7 @@ ```{r} y3 <- se -y3 <- y3[seqnames(y3) == "chr1",] +y3 <- y3[seqnames(y3) == "chr4",] y3 <- y3[,y3$condition %in% c("naive","IFNg")] y3 <- labelKeep(y3) y3 <- y3[mcols(y3)$keep,] diff -Nru r-bioc-fishpond-2.0.1+ds/inst/extdata/animation-code.R r-bioc-fishpond-2.2.0+ds/inst/extdata/animation-code.R --- r-bioc-fishpond-2.0.1+ds/inst/extdata/animation-code.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/extdata/animation-code.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,46 @@ +library(ggplot2) +library(gganimate) +library(SummarizedExperiment) +library(fishpond) +set.seed(5) +y <- makeSimSwishData(m=10, meanVariance=TRUE) +y$condition <- 1:10 +assay(y, "mean")[1,] <- 100 + y$condition * 20 + rnorm(10,0,20) +assay(y, "variance")[1,] <- 200 + c(1,0,1,0,0, 1,0,0,0,1) * 10000 +# plotInfReps(y, 1, x="condition") +y <- makeInfReps(y, numReps=30) +dat <- getTrace(y, idx=1, samp_idx=1:10) +library(dplyr) +coldata <- colData(y) %>% + as.data.frame() %>% + mutate(sample=condition) +dat <- dat %>% inner_join(coldata) +anim <- dat %>% + ggplot(aes(condition, count)) + + geom_point(col="dodgerblue", size=3) + + geom_line(stat="smooth", method="lm", formula=y~x, se=FALSE, + col="black", lwd=2) + + ylab("scaled counts") + + coord_cartesian(ylim=c(0,500)) + + theme_bw() + + theme(text = element_text(size = 24)) + + transition_time(infRep) + + shadow_mark(alpha=0.25, size=2.5) +animate(anim, duration=6, fps=5) + +library(tidyr) +library(purrr) +library(broom) +dat %>% + select(-sample) %>% + nest(data=-infRep) %>% + mutate(fit = map(data, ~ tidy(lm(count ~ condition, data=.x)))) %>% + unnest(fit) %>% + group_by(term) %>% + summarize(mu=mean(estimate)) + +metadata(y)$infRepsScaled <- TRUE +library(rafalib) +bigpar() +plotInfReps(y, idx=1, x="condition", main="") +abline(120, 17.7, col=rgb(0,0,0,.3), lwd=5) diff -Nru r-bioc-fishpond-2.0.1+ds/inst/script/timing.R r-bioc-fishpond-2.2.0+ds/inst/script/timing.R --- r-bioc-fishpond-2.0.1+ds/inst/script/timing.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/inst/script/timing.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,25 @@ +load_all() +y <- makeSimSwishData(m=5000,n=20,null=TRUE) +y <- scaleInfReps(y) + +y$allele <- factor(rep(1:2,times=10)) +y$pair <- factor(rep(1:10,times=2)) +y$pair2 <- factor(rep(1:10,each=2)) +y$continuous <- round(rep(rnorm(10),each=2),2) + +# 1.5 s +system.time({ + y <- swish(y, x="condition", quiet=TRUE) +}) +# 26 s +system.time({ + y <- swish(y, x="condition", pair="pair", quiet=TRUE) +}) +# 2.0 s +system.time({ + y <- swish(y, x="allele", pair="pair2", cov="condition", interaction=TRUE, quiet=TRUE) +}) +# 3.0 s +system.time({ + y <- swish(y, x="allele", pair="pair2", cov="continuous", cor="pearson", quiet=TRUE) +}) diff -Nru r-bioc-fishpond-2.0.1+ds/man/addStatsFromCSV.Rd r-bioc-fishpond-2.2.0+ds/man/addStatsFromCSV.Rd --- r-bioc-fishpond-2.0.1+ds/man/addStatsFromCSV.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/addStatsFromCSV.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-compress.R \name{addStatsFromCSV} \alias{addStatsFromCSV} \title{Read statistics and nulls from CSV file} diff -Nru r-bioc-fishpond-2.0.1+ds/man/alevinEC.Rd r-bioc-fishpond-2.2.0+ds/man/alevinEC.Rd --- r-bioc-fishpond-2.0.1+ds/man/alevinEC.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/alevinEC.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/alevin-EC.R +\name{alevinEC} +\alias{alevinEC} +\title{Construct a sparse matrix of transcript compatibility counts from alevin +output} +\usage{ +alevinEC( + paths, + tx2gene, + multigene = FALSE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = FALSE +) +} +\arguments{ +\item{paths}{`Charachter` or `character vector`, path specifying the +location of the `bfh.txt` files generated with alevin-fry.} + +\item{tx2gene}{A `dataframe` linking transcript identifiers to their +corresponding gene identifiers. Transcript identifiers must be in a column +`isoform_id`. Corresponding gene identifiers must be in a column `gene_id`.} + +\item{multigene}{`Logical`, should equivalence classes that are compatible +with multiple genes be retained? Default is `FALSE`, removing such ambiguous +equivalence classes.} + +\item{ignoreTxVersion}{logical, whether to split the isoform id on the '.' +character to remove version information to facilitate matching with the +isoform id in `tx2gene` (default FALSE).} + +\item{ignoreAfterBar}{logical, whether to split the isoform id on the '|' +character to facilitate matching with the isoform id in `tx2gene` +(default FALSE).} + +\item{quiet}{`Logical`, set `TRUE` to avoid displaying messages.} +} +\value{ +A list with two elements. The first element `counts` is a sparse +count matrix with equivalence class identifiers in the rows and barcode +identifiers in the columns. The second element `tx2gene_matched` allows for +linking the equivalence class identifiers to their respective transcripts +and genes. +} +\description{ +Constructs a UMI count matrix with equivalence class identifiers +in the rows and barcode identifiers in the columns. The count matrix is +generated from one or multiple `bfh.txt` files that have been created by +running alevin-fry with the --dumpBFH flag. Alevin-fry - +\url{https://doi.org/10.1186/s13059-019-1670-y} +} +\section{Details}{ + +The resulting count matrix uses equivalence class identifiers as rownames. +These can be linked to respective transcripts and genes using the +`tx2gene_matched` element of the output. Specifically, if the equivalence +class identifier reads 1|2|8, then the equivalence class is compatible with +the transcripts and their respective genes in rows 1, 2 and 8 of +`tx2gene_matched`. +} + +\author{ +Jeroen Gilis +} Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/man/figures/fishpond.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/man/figures/fishpond.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/man/figures/plotInfReps.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/man/figures/plotInfReps.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/man/figures/swish.gif and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/man/figures/swish.gif differ diff -Nru r-bioc-fishpond-2.0.1+ds/man/fishpond-package.Rd r-bioc-fishpond-2.2.0+ds/man/fishpond-package.Rd --- r-bioc-fishpond-2.0.1+ds/man/fishpond-package.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/fishpond-package.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,26 +1,25 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/swish.R +% Please edit documentation in R/fishpond.R \docType{package} \name{fishpond-package} \alias{fishpond-package} -\title{Downstream methods for Salmon and Alevin expression data} +\title{Fishpond: downstream methods and tools for expression data} \description{ This package provides statistical methods and other tools for working with Salmon and Alevin quantification of RNA-seq data. -In particular, it contains the Swish non-parametric method for +Fishpond contains the Swish non-parametric method for detecting differential transcript expression (DTE). Swish can -also be used to detect differential gene expresion (DGE). +also be used to detect differential gene expresion (DGE), +to perform allelic analysis, or to assess changes in isoform +proportions. } \details{ -The main functions are: +The main Swish functions are: \itemize{ \item \code{\link{scaleInfReps}} - scaling transcript or gene expression data \item \code{\link{labelKeep}} - labelling which features have sufficient counts \item \code{\link{swish}} - perform non-parametric differential analysis \item Plots, e.g., \code{\link{plotMASwish}}, \code{\link{plotInfReps}} -\item \code{\link{isoformProportions}} - convert counts to isoform proportions -\item \code{\link{makeInfReps}} - create pseudo-inferential replicates -\item \code{\link{splitSwish}} - split Swish analysis across jobs with Snakemake } All software-related questions should be posted to the Bioconductor Support Site: @@ -48,7 +47,4 @@ bioRxiv. \url{https://doi.org/10.1101/2020.07.06.189639} } -\author{ -Anqi Zhu, Avi Srivastava, Joseph G. Ibrahim, Rob Patro, Michael I. Love -} \keyword{package} diff -Nru r-bioc-fishpond-2.0.1+ds/man/importAllelicCounts.Rd r-bioc-fishpond-2.2.0+ds/man/importAllelicCounts.Rd --- r-bioc-fishpond-2.0.1+ds/man/importAllelicCounts.Rd 2021-12-02 07:00:52.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/importAllelicCounts.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-allelic.R \name{importAllelicCounts} \alias{importAllelicCounts} \title{Import allelic counts as a SummarizedExperiment} @@ -32,20 +32,10 @@ \item{tx2gene}{optional, a data.frame with first column indicating transcripts, second column indicating genes (or any other transcript -grouping). Either this should include the \code{a1} and \code{a2} -suffix for the transcripts and genes, or those will be added internally, -if it is detected that the first transcript does not have these suffices. -For example if \code{_alt} or \code{_ref}, or \code{_M} or \code{_P} -(as indicated by the \code{a1} and \code{a2} arguments) are not present -in the table, the table rows will be duplicated with those suffices -added on behalf of the user. -If not provided, the output object will be transcript-level. -Note: do not attempt to set the \code{txOut} argument, it will -conflict with internal calls to downstream functions. -Note: if the a1/a2 suffices are not at the end of the transcript name -in the quantification files, e.g. \code{ENST123_M|}, -then \code{ignoreAfterBar=TRUE} can be used to match regardless of -the string following \code{|} in the quantification files.} +grouping). Alternatively, this can be a GRanges object with +required columns \code{tx_id}, and \code{group_id} +(see \code{makeTx2Tss}). For more information on this argument, +see Details.} \item{...}{any arguments to pass to tximeta} } @@ -54,8 +44,9 @@ combined into a wide matrix \code{[a2 | a1]}, or as assays (a1, then a2). The original strings associated with a1 and a2 are stored in the metadata of the object, in the \code{alleles} list element. -Note the \code{ref} level of \code{se$allele} will be \code{"a2"}, -such that comparisons by default will be a1 vs a2 (effect vs non-effect). +Note the reference level of \code{se$allele} will be \code{"a2"}, +such that comparisons by default will be a1 vs a2 +(effect vs non-effect). } \description{ Read in Salmon quantification of allelic counts from a @@ -63,23 +54,45 @@ are marked with the following suffix: an underscore and a consistent symbol for each of the two alleles, e.g. \code{ENST123_M} and \code{ENST123_P}, -or \code{ENST123_alt} and \code{ENST123_ref}. -There must be exactly two alleles for each transcript, -and the \code{--keep-duplicates} option should be used in -Salmon indexing to avoid removing transcripts with identical sequence. -The output object has half the number of transcripts, -with the two alleles either stored in a \code{"wide"} object, -or as re-named \code{"assays"}. Note carefully that the symbol -provided to \code{a1} is used as the effect allele, -and \code{a2} is used as the non-effect allele -(see the \code{format} argument description and Value -description below). +or \code{ENST123_alt} and \code{ENST123_ref}, etc. +\code{importAllelicCounts} requires the tximeta package. +Further information in Details below. } \details{ -Requires the tximeta package. -\code{skipMeta=TRUE} is used, as it is assumed -the diploid transcriptome does not match any reference -transcript collection. This may change in future iterations -of the function, depending on developments in upstream -software. +\strong{Requirements} - There must be exactly two alleles for each +transcript, and the \code{--keep-duplicates} option should be used +in Salmon indexing to avoid removal of transcripts with identical +sequence. The output object has half the number of transcripts, +with the two alleles either stored in a \code{"wide"} object, or as +re-named \code{"assays"}. Note carefully that the symbol provided +to \code{a1} is used as the effect allele, and \code{a2} is used as +the non-effect allele (see the \code{format} argument description +and Value description below). + +\strong{tx2gene} - The two columns should include the \code{a1} and +\code{a2} suffix for the transcripts and genes/groups, or those +will be added internally, if it is detected that the first +transcript does not have these suffices. For example if +\code{_alt} or \code{_ref}, or \code{_M} or \code{_P} (as indicated +by the \code{a1} and \code{a2} arguments) are not present in the +table, the table rows will be duplicated with those suffices added +on behalf of the user. If \code{tx2gene} is not provided, the +output object will be transcript-level. Do not attempt to set the +\code{txOut} argument, it will conflict with internal calls to +downstream functions. If the a1/a2 suffices are not at the end of +the transcript name in the quantification files, +e.g. \code{ENST123_M|}, then \code{ignoreAfterBar=TRUE} +can be used to match regardless of the string following \code{|} in +the quantification files. + +\code{skipMeta=TRUE} is used, as it is assumed the diploid +transcriptome does not match any reference transcript +collection. This may change in future iterations of the function, +depending on developments in upstream annotations and software. + +If \code{tx2gene} is a GRanges object, the rowRanges of the output +will be the reduced ranges of the grouped input ranges, with +\code{tx_id} collapsed into a CharacterList. Other metadata columns +are not manipulated, just the metadata for the first range is +returned. } diff -Nru r-bioc-fishpond-2.0.1+ds/man/isoformProportions.Rd r-bioc-fishpond-2.2.0+ds/man/isoformProportions.Rd --- r-bioc-fishpond-2.0.1+ds/man/isoformProportions.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/isoformProportions.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-isoform.R \name{isoformProportions} \alias{isoformProportions} \title{Create isoform proportions from scaled data} diff -Nru r-bioc-fishpond-2.0.1+ds/man/loadFry.Rd r-bioc-fishpond-2.2.0+ds/man/loadFry.Rd --- r-bioc-fishpond-2.0.1+ds/man/loadFry.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/loadFry.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,12 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/loadFry.R +% Please edit documentation in R/alevin-loadFry.R \name{loadFry} \alias{loadFry} -\alias{load_fry_raw} \title{Load in data from alevin-fry USA mode} \usage{ -load_fry_raw(fryDir, quiet = FALSE) - loadFry(fryDir, outputFormat = "scRNA", nonzero = FALSE, quiet = FALSE) } \arguments{ @@ -16,8 +13,6 @@ \code{quants_mat.mtx}, \code{quants_mat_cols.txt} and \code{quants_mat_rows.txt}} -\item{quiet}{logical whether to display no messages} - \item{outputFormat}{can be \emph{either} be a list that defines the desired format of the output \code{SingleCellExperiment} object \emph{or} a string that represents one of the pre-defined output @@ -29,7 +24,10 @@ value across all genes (default \code{FALSE}). If \code{TRUE}, this will filter based on all assays. If a string vector of assay names, it will filter based -on the matching assays in the vector.} +on the matching assays in the vector. +If not in USA mode, it must be TRUE/FALSE/counts.} + +\item{quiet}{logical whether to display no messages} } \value{ A \code{SingleCellExperiment} object that contains one @@ -38,8 +36,8 @@ barcodes } \description{ -Enables easy loading of sparse data matrices provided by alevin-fry USA mode. -Alevin-fry - \url{https://www.biorxiv.org/content/10.1101/2021.06.29.450377v1} +Enables easy loading of sparse data matrices provided by +alevin-fry USA mode. } \section{Details about \code{loadFry}}{ @@ -129,4 +127,14 @@ SummarizedExperiment::assayNames(scev) } -\concept{preprocessing} +\references{ +alevin-fry publication: + +He, D., Zakeri, M., Sarkar, H. et al. "Alevin-fry unlocks rapid, accurate and +memory-frugal quantification of single-cell RNA-seq data." +Nature Methods 19, 316–322 (2022). +\url{https://doi.org/10.1038/s41592-022-01408-3} +} +\author{ +Dongze He, with contributions from Steve Lianoglou, Wes Wilson +} diff -Nru r-bioc-fishpond-2.0.1+ds/man/makeInfReps.Rd r-bioc-fishpond-2.2.0+ds/man/makeInfReps.Rd --- r-bioc-fishpond-2.0.1+ds/man/makeInfReps.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/makeInfReps.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-compress.R \name{makeInfReps} \alias{makeInfReps} \title{Make pseudo-inferential replicates from mean and variance} diff -Nru r-bioc-fishpond-2.0.1+ds/man/makeSimSwishData.Rd r-bioc-fishpond-2.2.0+ds/man/makeSimSwishData.Rd --- r-bioc-fishpond-2.0.1+ds/man/makeSimSwishData.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/makeSimSwishData.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -9,7 +9,9 @@ n = 10, numReps = 20, null = FALSE, - meanVariance = FALSE + meanVariance = FALSE, + allelic = FALSE, + dynamic = FALSE ) } \arguments{ @@ -23,6 +25,10 @@ \item{meanVariance}{logical, whether to output only mean and variance of inferential replicates} + +\item{allelic}{logical, whether to make an allelic sim dataset} + +\item{dynamic}{logical, whether to make a dynamic allelic sim dataset} } \value{ a SummarizedExperiment diff -Nru r-bioc-fishpond-2.0.1+ds/man/makeTx2Tss.Rd r-bioc-fishpond-2.2.0+ds/man/makeTx2Tss.Rd --- r-bioc-fishpond-2.0.1+ds/man/makeTx2Tss.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/makeTx2Tss.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-allelic.R +\name{makeTx2Tss} +\alias{makeTx2Tss} +\title{Make a GRanges linking transcripts to TSS within gene} +\usage{ +makeTx2Tss(x, maxgap = 0) +} +\arguments{ +\item{x}{either TxDb/EnsDb or GRanges object. The GRanges +object should have metadata columns \code{tx_id} and +\code{gene_id}} + +\item{maxgap}{integer, number of basepairs to use determining +whether to combine nearby TSS} +} +\value{ +GRanges with columns \code{tx_id}, \code{tss}, and +\code{group_id} +} +\description{ +This helper function takes either a TxDb/EnsDb or +GRanges object as input and outputs a GRanges object +where transcripts are aggregated to the gene + TSS +(transcription start site). For nearby TSS that should +be grouped together, see \code{maxgap}. +} +\examples{ +\dontrun{ +library(EnsDb.Hsapiens.v86) +edb <- EnsDb.Hsapiens.v86 +t2t <- makeTx2Tss(edb) +} + +} diff -Nru r-bioc-fishpond-2.0.1+ds/man/miniSwish.Rd r-bioc-fishpond-2.2.0+ds/man/miniSwish.Rd --- r-bioc-fishpond-2.0.1+ds/man/miniSwish.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/miniSwish.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-compress.R \name{miniSwish} \alias{miniSwish} \title{Helper function for distributing Swish on a subset of data} diff -Nru r-bioc-fishpond-2.0.1+ds/man/plotAllelicGene.Rd r-bioc-fishpond-2.2.0+ds/man/plotAllelicGene.Rd --- r-bioc-fishpond-2.0.1+ds/man/plotAllelicGene.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/plotAllelicGene.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,128 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-allelic.R +\name{plotAllelicGene} +\alias{plotAllelicGene} +\title{Plot allelic counts in a gene context using Gviz} +\usage{ +plotAllelicGene( + y, + gene, + db, + region = NULL, + symbol = NULL, + genome = NULL, + tpmFilter = 1, + isoPropFilter = 0.05, + countFilter = 10, + pc = 1, + transcriptAnnotation = "symbol", + labels = list(a2 = "a2", a1 = "a1"), + qvalue = TRUE, + log2FC = TRUE, + ideogram = FALSE, + cov = NULL, + covFacetIsoform = FALSE, + allelicCol = c("dodgerblue", "goldenrod1"), + isoformCol = "firebrick", + statCol = "black", + gridCol = "grey80", + baselineCol = "black", + titleCol = "black", + titleAxisCol = "black", + titleBgCol = "white", + geneBorderCol = "darkblue", + geneFillCol = "darkblue", + genomeAxisCol = "black", + innerFontCol = "black", + ... +) +} +\arguments{ +\item{y}{a SummarizedExperiment (see \code{swish})} + +\item{gene}{the name of the gene of interest, requires +a column \code{gene_id} in the metadata columns of the +rowRanges of y} + +\item{db}{either a TxDb or EnsDb object to use for the gene model} + +\item{region}{GRanges, the region to be displayed in the Gviz plot. +if not specified, will be set according to the gene plus 20% +of the total gene extent on either side} + +\item{symbol}{alternative to \code{gene}, to specify +the gene of interest according to a column \code{symbol} +in the metadata columns of the rowRanges of y} + +\item{genome}{UCSC genome code (e.g. \code{"hg38"}, +if not specified it will use the \code{GenomeInfoDb::genome()} +of the rowRanges of \code{y}} + +\item{tpmFilter}{minimum TPM value (mean over samples) to keep a feature} + +\item{isoPropFilter}{minimum percent of isoform proportion to keep a feature} + +\item{countFilter}{minimum count value (mean over samples) to keep a feature} + +\item{pc}{pseudocount to avoid dividing by zero in allelic proportion calculation} + +\item{transcriptAnnotation}{argument passed to Gviz::GeneRegionTrack +(\code{"symbol"}, \code{"gene"}, \code{"transcript"}, etc.)} + +\item{labels}{list, labels for a2 (non-effect) and a1 (effect) alleles} + +\item{qvalue}{logical, whether to inclue qvalue track} + +\item{log2FC}{logical, whether to include log2FC track} + +\item{ideogram}{logical, whether to include ideogram track} + +\item{cov}{character specifying a factor or integer variable to use +to facet the allelic proportion plots, should be a column in +\code{colData(y)}} + +\item{covFacetIsoform}{logical, if \code{cov} is provided, +should it also be used to facet the isoform proportion track, +in addition to the allelic proportion track} + +\item{allelicCol}{the colors of the points and lines for allelic proportion} + +\item{isoformCol}{the colors of the points and lines for isoform proportion} + +\item{statCol}{the color of the lollipops for q-value and log2FC} + +\item{gridCol}{the color of the grid in the data tracks} + +\item{baselineCol}{the color of the horizontal baseline for q-value and lo2gFC} + +\item{titleCol}{font color of the side titles (track labels)} + +\item{titleAxisCol}{axis color of the side titles (track labels)} + +\item{titleBgCol}{background color of the side titles (track labels)} + +\item{geneBorderCol}{the color of the borders and font in gene region track} + +\item{geneFillCol}{the color of the fill in the gene region track} + +\item{genomeAxisCol}{line color of the genome axis track} + +\item{innerFontCol}{font color of genome axis track, ideogram, and +allelic proportion legend} + +\item{...}{additional arguments passed to \code{Gviz::plotTracks()}} +} +\value{ +nothing, a plot is displayed +} +\description{ +Plot allelic data (allelic proportions, isoform propostions) +in a gene context leveraging the Gviz package. See the allelic +vignette for example usage. TPM and count filters are used by +default to clean up the plot of features with minimal signal; +note that the isoform proportion displayed at the bottom of the +plot is among the features that pass the filtering steps. +If the function is not responding, it is likely due to issues +connecting to UCSC servers to draw the ideogram, in this case +set \code{ideogram=FALSE}. +} diff -Nru r-bioc-fishpond-2.0.1+ds/man/plotAllelicHeatmap.Rd r-bioc-fishpond-2.2.0+ds/man/plotAllelicHeatmap.Rd --- r-bioc-fishpond-2.0.1+ds/man/plotAllelicHeatmap.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/plotAllelicHeatmap.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-allelic.R +\name{plotAllelicHeatmap} +\alias{plotAllelicHeatmap} +\title{Plot allelic ratio heatmap} +\usage{ +plotAllelicHeatmap( + y, + idx, + breaks = NULL, + cluster_cols = FALSE, + main = "Allelic ratio", + stripAfterChar = "-", + ... +) +} +\arguments{ +\item{y}{a SummarizedExperiment (see \code{swish})} + +\item{idx}{a numeric or logical vector of which features +to plot} + +\item{breaks}{breaks passed along to pheatmap} + +\item{cluster_cols}{logical, passed to pheatmap} + +\item{main}{title of the plot} + +\item{stripAfterChar}{for the column names, if specified +will strip allelic identifiers after this character, +default is hyphen. set to NULL to avoid this action} + +\item{...}{other arguments passed to pheatmap} +} +\value{ +nothing, a plot is displayed +} +\description{ +Plot allelic ratio heatmap over features and samples +using the pheatmap package. The a1/(a2 + a1) ratio +is displayed. +} diff -Nru r-bioc-fishpond-2.0.1+ds/man/plotInfReps.Rd r-bioc-fishpond-2.2.0+ds/man/plotInfReps.Rd --- r-bioc-fishpond-2.0.1+ds/man/plotInfReps.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/plotInfReps.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -25,7 +25,8 @@ q = qnorm(0.975), applySF = FALSE, reorder, - thin + thin, + shiftX ) } \arguments{ @@ -88,6 +89,12 @@ be drawn thin (the default switches from 0 [not thin] to 1 [thinner] at n=150 cells, and from 1 [thinner] to 2 [thinnest] at n=400 cells)} + +\item{shiftX}{when \code{x} is continuous and \code{cov} +is provided, the amount to shift the values on the x-axis +to improve visibility of the point and line ranges +(will be subtracted from the first level of \code{cov} +and added to the second level of \code{cov})} } \value{ nothing, a plot is displayed diff -Nru r-bioc-fishpond-2.0.1+ds/man/plotMASwish.Rd r-bioc-fishpond-2.2.0+ds/man/plotMASwish.Rd --- r-bioc-fishpond-2.0.1+ds/man/plotMASwish.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/plotMASwish.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -2,7 +2,7 @@ % Please edit documentation in R/plots.R \name{plotMASwish} \alias{plotMASwish} -\title{MA plot} +\title{MA plot - log fold change over average counts} \usage{ plotMASwish(y, alpha = 0.05, sigcolor = "blue", ...) } @@ -19,7 +19,7 @@ nothing, a plot is displayed } \description{ -MA plot +MA plot - log fold change over average counts } \examples{ diff -Nru r-bioc-fishpond-2.0.1+ds/man/readEDS.Rd r-bioc-fishpond-2.2.0+ds/man/readEDS.Rd --- r-bioc-fishpond-2.0.1+ds/man/readEDS.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/readEDS.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readEDS.R +% Please edit documentation in R/alevin-readEDS.R \name{readEDS} \alias{readEDS} \title{readEDS - a utility function for quickly reading in Alevin's EDS format} diff -Nru r-bioc-fishpond-2.0.1+ds/man/salmonEC.Rd r-bioc-fishpond-2.2.0+ds/man/salmonEC.Rd --- r-bioc-fishpond-2.0.1+ds/man/salmonEC.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/salmonEC.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/salmon-EC.R +\name{salmonEC} +\alias{salmonEC} +\title{Construct a sparse matrix of transcript compatibility counts from salmon +output} +\usage{ +salmonEC( + paths, + tx2gene, + multigene = FALSE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = FALSE +) +} +\arguments{ +\item{paths}{`Charachter` or `character vector`, path specifying the +location of the `eq_classes.txt` files generated with salmon.} + +\item{tx2gene}{A `dataframe` linking transcript identifiers to their +corresponding gene identifiers. Transcript identifiers must be in a column +`isoform_id`. Corresponding gene identifiers must be in a column `gene_id`.} + +\item{multigene}{`Logical`, should equivalence classes that are compatible +with multiple genes be retained? Default is `FALSE`, removing such ambiguous +equivalence classes.} + +\item{ignoreTxVersion}{logical, whether to split the isoform id on the '.' +character to remove version information to facilitate matching with the +isoform id in `tx2gene` (default FALSE).} + +\item{ignoreAfterBar}{logical, whether to split the isoform id on the '|' +character to facilitate matching with the isoform id in `tx2gene` +(default FALSE).} + +\item{quiet}{`Logical`, set `TRUE` to avoid displaying messages.} +} +\value{ +A list with two elements. The first element `counts` is a sparse +count matrix with equivalence class identifiers in the rows. The second +element `tx2gene_matched` allows for linking those identifiers to their +respective transcripts and genes. +} +\description{ +Constructs a count matrix with equivalence class identifiers +in the rows. The count matrix is generated from one or multiple +`eq_classes.txt` files that have been created by running salmon with the +--dumpEq flag. Salmon - \url{https://doi.org/10.1038/nmeth.4197} +} +\section{Details}{ + +The resulting count matrix uses equivalence class identifiers as rownames. +These can be linked to respective transcripts and genes using the +`tx2gene_matched` element of the output. Specifically, if the equivalence +class identifier reads 1|2|8, then the equivalence class is compatible with +the transcripts and their respective genes in rows 1, 2 and 8 of +`tx2gene_matched`. +} + +\author{ +Jeroen Gilis +} diff -Nru r-bioc-fishpond-2.0.1+ds/man/splitSwish.Rd r-bioc-fishpond-2.2.0+ds/man/splitSwish.Rd --- r-bioc-fishpond-2.0.1+ds/man/splitSwish.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/splitSwish.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper.R +% Please edit documentation in R/helper-compress.R \name{splitSwish} \alias{splitSwish} \title{Function for splitting SummarizedExperiment into separate RDS files} diff -Nru r-bioc-fishpond-2.0.1+ds/man/swish.Rd r-bioc-fishpond-2.2.0+ds/man/swish.Rd --- r-bioc-fishpond-2.0.1+ds/man/swish.Rd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/man/swish.Rd 2022-04-26 18:25:58.000000000 +0000 @@ -2,7 +2,7 @@ % Please edit documentation in R/swish.R \name{swish} \alias{swish} -\title{swish: SAMseq With Inferential Samples Helps} +\title{Swish method: differential expression accounting for inferential uncertainty} \usage{ swish( y, @@ -33,17 +33,19 @@ \item{cov}{the name of the covariate for adjustment. If provided a stratified Wilcoxon in performed. -Cannot be used with \code{pair} (unless using \code{cor})} +Cannot be used with \code{pair}, unless using +either \code{interaction} or \code{cor}} \item{pair}{the name of the pair variable, which should be the number of the pair. Can be an integer or factor. If specified, a signed rank test is used to build the statistic. All samples across \code{x} must be -pairs if this is specified. Cannot be used with \code{cov} -(unless using \code{cor})} +pairs if this is specified. Cannot be used with \code{cov}, +unless using either \code{interaction} or \code{cor}} \item{interaction}{logical, whether to perform a test of an interaction -between \code{x} and \code{cov}. See Details.} +between \code{x} and \code{cov}. Can use \code{pair} or not. +See Details.} \item{cor}{character, whether to compute correlation of \code{x} with the log counts, and signifance testing on the correlation @@ -91,6 +93,7 @@ the local FDR and q-value, as estimated by the \code{samr} package. } \description{ +The Swish method, or "SAMseq With Inferential Samples Helps". Performs non-parametric inference on rows of \code{y} for various experimental designs. See References for details. } diff -Nru r-bioc-fishpond-2.0.1+ds/NAMESPACE r-bioc-fishpond-2.2.0+ds/NAMESPACE --- r-bioc-fishpond-2.0.1+ds/NAMESPACE 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/NAMESPACE 2022-04-26 18:25:58.000000000 +0000 @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(addStatsFromCSV) +export(alevinEC) export(computeInfRV) export(deswish) export(getTrace) @@ -8,20 +9,37 @@ export(isoformProportions) export(labelKeep) export(loadFry) -export(load_fry_raw) export(makeInfReps) export(makeSimSwishData) +export(makeTx2Tss) export(miniSwish) +export(plotAllelicGene) +export(plotAllelicHeatmap) export(plotInfReps) export(plotMASwish) export(readEDS) +export(salmonEC) export(scaleInfReps) export(splitSwish) export(swish) import(Rcpp) +importFrom(GenomicRanges,"end<-") +importFrom(GenomicRanges,"start<-") +importFrom(GenomicRanges,"strand<-") +importFrom(GenomicRanges,end) +importFrom(GenomicRanges,flank) +importFrom(GenomicRanges,resize) +importFrom(GenomicRanges,seqnames) +importFrom(GenomicRanges,sort) +importFrom(GenomicRanges,start) +importFrom(GenomicRanges,strand) +importFrom(GenomicRanges,width) +importFrom(IRanges,IntegerList) importFrom(Matrix,colSums) importFrom(Matrix,readMM) importFrom(Matrix,rowSums) +importFrom(Matrix,sparseMatrix) +importFrom(Matrix,sparseVector) importFrom(Matrix,t) importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,DataFrame) @@ -32,12 +50,14 @@ importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") importFrom(SummarizedExperiment,"mcols<-") +importFrom(SummarizedExperiment,"rowRanges<-") importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,mcols) +importFrom(SummarizedExperiment,rowRanges) importFrom(graphics,abline) importFrom(graphics,axis) importFrom(graphics,legend) @@ -54,6 +74,7 @@ importFrom(matrixStats,rowVars) importFrom(methods,as) importFrom(methods,is) +importFrom(methods,slot) importFrom(stats,cor) importFrom(stats,median) importFrom(stats,qnorm) @@ -62,6 +83,7 @@ importFrom(stats,rpois) importFrom(stats,runif) importFrom(stats,var) +importFrom(svMisc,progress) importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,read.csv) diff -Nru r-bioc-fishpond-2.0.1+ds/NEWS r-bioc-fishpond-2.2.0+ds/NEWS --- r-bioc-fishpond-2.0.1+ds/NEWS 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/NEWS 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ -Changes in version 2.0.0 -* New loadFry() function, written by Dongze He with - contributions from Steve Lianoglou and Wes Wilson. - loadFry() helps users to import and process - alevin-fry quantification results. Can process - spliced, unspliced and ambiguous counts separately - and flexibly. Has specific output formats designed - for downstream use with scVelo or velocity analysis. - See ?loadFry for more details. -* Adding correlation tests: Spearman or Pearson - correlations of a numeric covariate with the - log counts, or with the log fold changes across - pairs. The Spearman correlation test with counts - was already implemented in the original SAMseq - method as response type = "Quantitative". - For new functionality see 'cor' argument in the - ?swish man page. -* Adding importAllelicCounts() to facilitate importing - Salmon quantification data against a diploid - transcriptome. Can import either as a 'wide' - format or as 'assays'. Leverages tximeta(). - For gene-level summarization, importAllelicCounts() - can create an appropriate tx2gene table - with the necessary a1 and a2 suffices, - and it will automatically set txOut=FALSE, see - ?importAllelicCounts for more details. -* Added a 'q' argument to plotInfReps to change the - intervals when making point and line plots. -* Switched the legend of plotInfReps so that - reference levels will now be on the bottom, - and non-reference (e.g. treatment) on top. - -Changes in version 1.99.18 -* Added helper functionality to importAllelicCounts, - so it will create an appropriate tx2gene table - with the necessary a1 and a2 suffices, - and it will automatically set txOut=FALSE. -* Added a 'q' argument to plotInfReps to change the - intervals when making point and line plots. -* Switched the legend of plotInfReps so that - reference levels will now be on the bottom, - and non-reference (e.g. treatment) on top. -* Added loadFry() to process alevin-fry - quantification result. Can process spliced, - unspliced and ambiguous counts separately - and flexibly. - -Changes in version 1.99.15 -* Adding correlation tests: Spearman or Pearson - correlations of a numeric covariate with the - log counts, or with the log fold changes across - pairs. The Spearman correlation test with counts - was already implemented in the original SAMseq - method as response type = "Quantitative". - For new functionality see 'cor' argument in the - ?swish man page. -* Adding importAllelicCounts() to facilitate importing - Salmon quantification data against a diploid - transcriptome. Can import either as a 'wide' - format or as 'assays'. Leverages tximeta(). - -Changes in version 1.9.6 -* Specifying ties.method in matrixStats::rowRanks. - -Changes in version 1.9.1 -* Added importAllelicCounts() with options for importing - Salmon quantification on diploid transcriptomes. - -Changes in version 1.8.0 -* Added note in vignette about how to deal with estimated - batch factors, e.g. from RUVSeq or SVA. Two strategies are - outlined: either discretizing the estimate batch factors - and performing stratified analysis, or regressing out the - batch-associated variation using limma's removeBatchEffect. - Demonstation code is included. - -Changes in version 1.6.0 -* Added makeInfReps() to create pseudo-inferential replicates - via negative binomial simulation from mean and variance - matrices. Note: the mean and the variance provide the - _inferential_ distribution per element of the count matrix. - See preprint for details, doi: 10.1101/2020.07.06.189639. -* Added splitSwish() and addStatsFromCSV(), which can be used - to distribute running of Swish across a number of jobs - managed by `Snakemake`. See vignette for description of - a suggested workflow. For a large single-cell dataset - with mean and variance summaries of inferential uncertainty, - splitSwish() avoids generating the inferential replicate - counts until the data has been split into smaller pieces and - sent to different jobs, then only the necessary summary - statistics are gathered and q-values computed by - addStatsFromCSV(). -* plotInfReps() gains many new features to facilitate plotting of - inferential count distributions for single cells, as quantified - with alevin and imported with tximport. E.g. allow for numeric - `x` argument plus grouping with `cov` for showing - counts over pseudotime across groups of cells. Also added - `applySF` argument which can be used to divide out a - size factor, and the `reorder` argument which will re-order - the samples/cells within groups by the count. plotInfReps() - will draw boxplots with progressively thinner visual features - as the number of cells grows to make the plots still legible. - -Changes in version 1.5.2 -* First version of makeInfReps(), to create pseudo-infReps - via negative binomial simulation from set of mean and - variance matrices in the assays of the SummarizedExperiment. - -Changes in version 1.4.0 -* Added isoformProportions(), which can be run after - scaleInfReps() and optionally after filtering out - transcripts using labelKeep(). Running swish() after - isoformProportions() will produce differential transcript - usage (DTU) results, instead of differential transcript - expression (DTE) results. Example in vignette. -* Default number of permutations increased from 30 to 100. - It was observed that there was too much fluctuation in the - DE called set for nperms=30 across different seeds, and - setting to 100 helped to stabilize results across seeds, - without increasing running time too much. For further reduced - dependence on the seed, even higher values of nperms - (e.g. 200, 300) can be used. - -Changes in version 1.3.8 -* Added isoformProportions(), which can be run after - scaleInfReps() and optionally after filtering out - transcripts using labelKeep(). Running swish() after - isoformProportions() will produce differential transcript - usage (DTU) results, instead of differential transcript - expression (DTE) results. Example in vignette. - -Changes in version 1.3.4 -* Default number of permutations increased from 30 to 100. - It was observed that there was too much fluctuation in the - DE called set for nperms=30 across different seeds, and - setting to 100 helped to stabilize results across seeds, - without increasing running time too much. For further reduced - dependence on the seed, even higher values of nperms - (e.g. 200, 300) can be used. - -Changes in version 1.2.0 -* Switching to a faster version of Swish which only - computes the ranks of the data once, and then re-uses - this for the permutation distribution. This bypasses - the addition of uniform noise per permutation and - is 10x faster. Two designs which still require - re-computation of ranks per permutation are the - paired analysis and the general interaction analysis. - Two-group, stratified two-group, and the paired - interaction analysis now default to the new fast - method, but the original, slower method can be used - by setting fast=0 in the call to swish(). -* Adding Rcpp-based function readEDS() written by - Avi Srivastava which imports the sparse counts stored - in Alevin's Efficient Data Storage (EDS) format. -* Changed the vignette so that it (will) use a linkedTxome, - as sometime the build would break if the Bioc build - machine couldn't access ftp.ebi.ac.uk. -* Add 'computeInfRV' function. InfRV is not used in the - Swish methods, only for visualization purposes in the - Swish paper. -* removed 'samr' from Imports, as it required source - installation, moved to Suggests, for optional qvalue - calculation - -Changes in version 0.99.30 -* added two interaction tests, described in ?swish -* incorporate qvalue package for pvalue, locfdr and qvalue -* added plotMASwish() to facilitate plotting -* wilcoxP is removed, and the mean is used instead - -Changes in version 0.99.0 -* fishpond getting ready for submission to Bioc diff -Nru r-bioc-fishpond-2.0.1+ds/NEWS.md r-bioc-fishpond-2.2.0+ds/NEWS.md --- r-bioc-fishpond-2.0.1+ds/NEWS.md 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/NEWS.md 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,206 @@ +# fishpond 2.2.0 + +* New vignette demonstrating allelic analysis at isoform, + TSS, or gene-level. See more details below. +* New import functions for equivalence class analysis of Salmon + or alevin data, written by Jeroen Gilis. See salmonEC() and + alevinEC() man pages. +* New plotAllelicGene() and plotAllelicHeatmap() functions + for plotting results from allelic expression analysis. +* New makeTx2Tss() helper function for allelic analysis. +* Now importFromAllelicCounts() can take a GRanges + object as the `tx2gene` argument, so that ranges will + be distributed to the rowRanges of the output + SummarizedExperiment. +* Adding `shiftX` argument to plotInfReps() for numeric + x variable, to help with overplotting. +* Re-organized package for new pkgdown homepage: + https://mikelove.github.io/fishpond + +# fishpond 2.0.0 + +* New loadFry() function, written by Dongze He with + contributions from Steve Lianoglou and Wes Wilson. + loadFry() helps users to import and process + alevin-fry quantification results. Can process + spliced, unspliced and ambiguous counts separately + and flexibly. Has specific output formats designed + for downstream use with scVelo or velocity analysis. + See ?loadFry for more details. +* Adding correlation tests: Spearman or Pearson + correlations of a numeric covariate with the + log counts, or with the log fold changes across + pairs. The Spearman correlation test with counts + was already implemented in the original SAMseq + method as response type = "Quantitative". + For new functionality see 'cor' argument in the + ?swish man page. +* Adding importAllelicCounts() to facilitate importing + Salmon quantification data against a diploid + transcriptome. Can import either as a 'wide' + format or as 'assays'. Leverages tximeta(). + For gene-level summarization, importAllelicCounts() + can create an appropriate tx2gene table + with the necessary a1 and a2 suffices, + and it will automatically set txOut=FALSE, see + ?importAllelicCounts for more details. +* Added a 'q' argument to plotInfReps to change the + intervals when making point and line plots. +* Switched the legend of plotInfReps so that + reference levels will now be on the bottom, + and non-reference (e.g. treatment) on top. + +# fishpond 1.99.18 + +* Added helper functionality to importAllelicCounts, + so it will create an appropriate tx2gene table + with the necessary a1 and a2 suffices, + and it will automatically set txOut=FALSE. +* Added a 'q' argument to plotInfReps to change the + intervals when making point and line plots. +* Switched the legend of plotInfReps so that + reference levels will now be on the bottom, + and non-reference (e.g. treatment) on top. +* Added loadFry() to process alevin-fry + quantification result. Can process spliced, + unspliced and ambiguous counts separately + and flexibly. + +# fishpond 1.99.15 + +* Adding correlation tests: Spearman or Pearson + correlations of a numeric covariate with the + log counts, or with the log fold changes across + pairs. The Spearman correlation test with counts + was already implemented in the original SAMseq + method as response type = "Quantitative". + For new functionality see 'cor' argument in the + ?swish man page. +* Adding importAllelicCounts() to facilitate importing + Salmon quantification data against a diploid + transcriptome. Can import either as a 'wide' + format or as 'assays'. Leverages tximeta(). + +# fishpond 1.9.6 + +* Specifying ties.method in matrixStats::rowRanks. + +# fishpond 1.9.1 + +* Added importAllelicCounts() with options for importing + Salmon quantification on diploid transcriptomes. + +# fishpond 1.8.0 + +* Added note in vignette about how to deal with estimated + batch factors, e.g. from RUVSeq or SVA. Two strategies are + outlined: either discretizing the estimate batch factors + and performing stratified analysis, or regressing out the + batch-associated variation using limma's removeBatchEffect. + Demonstation code is included. + +# fishpond 1.6.0 + +* Added makeInfReps() to create pseudo-inferential replicates + via negative binomial simulation from mean and variance + matrices. Note: the mean and the variance provide the + _inferential_ distribution per element of the count matrix. + See preprint for details, doi: 10.1101/2020.07.06.189639. +* Added splitSwish() and addStatsFromCSV(), which can be used + to distribute running of Swish across a number of jobs + managed by `Snakemake`. See vignette for description of + a suggested workflow. For a large single-cell dataset + with mean and variance summaries of inferential uncertainty, + splitSwish() avoids generating the inferential replicate + counts until the data has been split into smaller pieces and + sent to different jobs, then only the necessary summary + statistics are gathered and q-values computed by + addStatsFromCSV(). +* plotInfReps() gains many new features to facilitate plotting of + inferential count distributions for single cells, as quantified + with alevin and imported with tximport. E.g. allow for numeric + `x` argument plus grouping with `cov` for showing + counts over pseudotime across groups of cells. Also added + `applySF` argument which can be used to divide out a + size factor, and the `reorder` argument which will re-order + the samples/cells within groups by the count. plotInfReps() + will draw boxplots with progressively thinner visual features + as the number of cells grows to make the plots still legible. + +# fishpond 1.5.2 + +* First version of makeInfReps(), to create pseudo-infReps + via negative binomial simulation from set of mean and + variance matrices in the assays of the SummarizedExperiment. + +# fishpond 1.4.0 + +* Added isoformProportions(), which can be run after + scaleInfReps() and optionally after filtering out + transcripts using labelKeep(). Running swish() after + isoformProportions() will produce differential transcript + usage (DTU) results, instead of differential transcript + expression (DTE) results. Example in vignette. +* Default number of permutations increased from 30 to 100. + It was observed that there was too much fluctuation in the + DE called set for nperms=30 across different seeds, and + setting to 100 helped to stabilize results across seeds, + without increasing running time too much. For further reduced + dependence on the seed, even higher values of nperms + (e.g. 200, 300) can be used. + +# fishpond 1.3.8 + +* Added isoformProportions(), which can be run after + scaleInfReps() and optionally after filtering out + transcripts using labelKeep(). Running swish() after + isoformProportions() will produce differential transcript + usage (DTU) results, instead of differential transcript + expression (DTE) results. Example in vignette. + +# fishpond 1.3.4 + +* Default number of permutations increased from 30 to 100. + It was observed that there was too much fluctuation in the + DE called set for nperms=30 across different seeds, and + setting to 100 helped to stabilize results across seeds, + without increasing running time too much. For further reduced + dependence on the seed, even higher values of nperms + (e.g. 200, 300) can be used. + +# fishpond 1.2.0 + +* Switching to a faster version of Swish which only + computes the ranks of the data once, and then re-uses + this for the permutation distribution. This bypasses + the addition of uniform noise per permutation and + is 10x faster. Two designs which still require + re-computation of ranks per permutation are the + paired analysis and the general interaction analysis. + Two-group, stratified two-group, and the paired + interaction analysis now default to the new fast + method, but the original, slower method can be used + by setting fast=0 in the call to swish(). +* Adding Rcpp-based function readEDS() written by + Avi Srivastava which imports the sparse counts stored + in Alevin's Efficient Data Storage (EDS) format. +* Changed the vignette so that it (will) use a linkedTxome, + as sometime the build would break if the Bioc build + machine couldn't access ftp.ebi.ac.uk. +* Add 'computeInfRV' function. InfRV is not used in the + Swish methods, only for visualization purposes in the + Swish paper. +* removed 'samr' from Imports, as it required source + installation, moved to Suggests, for optional qvalue + calculation + +# fishpond 0.99.30 + +* added two interaction tests, described in ?swish +* incorporate qvalue package for pvalue, locfdr and qvalue +* added plotMASwish() to facilitate plotting +* wilcoxP is removed, and the mean is used instead + +# fishpond 0.99.0 + +* fishpond getting ready for submission to Bioc Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon-120x120.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon-120x120.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon-152x152.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon-152x152.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon-180x180.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon-180x180.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon-60x60.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon-60x60.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon-76x76.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon-76x76.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/apple-touch-icon.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/apple-touch-icon.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/favicon-16x16.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/favicon-16x16.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/favicon-32x32.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/favicon-32x32.png differ Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/pkgdown/favicon/favicon.ico and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/pkgdown/favicon/favicon.ico differ diff -Nru r-bioc-fishpond-2.0.1+ds/pkgdown/_pkgdown.yml r-bioc-fishpond-2.2.0+ds/pkgdown/_pkgdown.yml --- r-bioc-fishpond-2.0.1+ds/pkgdown/_pkgdown.yml 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/pkgdown/_pkgdown.yml 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,83 @@ +url: https://mikelove.github.io/fishpond + +title: fishpond + +authors: + Anqi Zhu: + href: https://www.linkedin.com/in/aqzhu91 + Michael Love: + href: https://mikelove.github.io + Avi Srivastava: + href: https://k3yavi.github.io + Rob Patro: + href: https://combine-lab.github.io + Joseph Ibrahim: + href: https://sph.unc.edu/adv_profile/joseph-g-ibrahim-phd + Hirak Sarkar: + href: https://hiraksarkar.github.io + Euphy Wu: + href: https://www.linkedin.com/in/euphy-wu-09b14383 + Noor Pratap Singh: + href: https://combine-lab.github.io/members/noor-pratap-singh.html + Scott Van Buren: + href: https://www.linkedin.com/in/scottvanburen + Dongze He: + href: https://combine-lab.github.io/members/dongze-he.html + Steve Lianoglou: + href: https://www.linkedin.com/in/slianoglou + Wes Wilson: + href: https://www.med.upenn.edu/albeldalab/wes-wilson.html + Jeroen Gilis: + href: https://be.linkedin.com/in/jeroen-gilis-4044b5151 + +figures: + fig.width: 5.5 + fig.retina: 1 + +articles: + - title: Swish + navbar: ~ + contents: + - swish + - allelic + +reference: + - title: Fishpond + - contents: + - fishpond-package + - title: Swish method + - contents: + - scaleInfReps + - labelKeep + - swish + - title: Main Swish plotting functions + - contents: + - plotInfReps + - plotMASwish + - title: Helper functions + - contents: + - computeInfRV + - getTrace + - isoformProportions + - makeSimSwishData + - title: Import and utilities + - contents: + - loadFry + - readEDS + - alevinEC + - salmonEC + - title: Allelic analysis and plotting + - contents: + - makeTx2Tss + - importAllelicCounts + - plotAllelicGene + - plotAllelicHeatmap + - title: Compression + - contents: + - splitSwish + - addStatsFromCSV + - makeInfReps + - miniSwish + - title: Experimental + - contents: + - deswish diff -Nru r-bioc-fishpond-2.0.1+ds/R/alevin-EC.R r-bioc-fishpond-2.2.0+ds/R/alevin-EC.R --- r-bioc-fishpond-2.0.1+ds/R/alevin-EC.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/alevin-EC.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,218 @@ +#' Construct a sparse matrix of transcript compatibility counts from alevin +#' output +#' +#' Constructs a UMI count matrix with equivalence class identifiers +#' in the rows and barcode identifiers in the columns. The count matrix is +#' generated from one or multiple `bfh.txt` files that have been created by +#' running alevin-fry with the --dumpBFH flag. Alevin-fry - +#' \url{https://doi.org/10.1186/s13059-019-1670-y} +#' +#' @rdname alevinEC +#' +#' @param paths `Charachter` or `character vector`, path specifying the +#' location of the `bfh.txt` files generated with alevin-fry. +#' @param tx2gene A `dataframe` linking transcript identifiers to their +#' corresponding gene identifiers. Transcript identifiers must be in a column +#' `isoform_id`. Corresponding gene identifiers must be in a column `gene_id`. +#' @param multigene `Logical`, should equivalence classes that are compatible +#' with multiple genes be retained? Default is `FALSE`, removing such ambiguous +#' equivalence classes. +#' @param ignoreTxVersion logical, whether to split the isoform id on the '.' +#' character to remove version information to facilitate matching with the +#' isoform id in `tx2gene` (default FALSE). +#' @param ignoreAfterBar logical, whether to split the isoform id on the '|' +#' character to facilitate matching with the isoform id in `tx2gene` +#' (default FALSE). +#' @param quiet `Logical`, set `TRUE` to avoid displaying messages. +#' +#' @author Jeroen Gilis +#' +#' @return A list with two elements. The first element `counts` is a sparse +#' count matrix with equivalence class identifiers in the rows and barcode +#' identifiers in the columns. The second element `tx2gene_matched` allows for +#' linking the equivalence class identifiers to their respective transcripts +#' and genes. +#' +#' @section Details: +#' The resulting count matrix uses equivalence class identifiers as rownames. +#' These can be linked to respective transcripts and genes using the +#' `tx2gene_matched` element of the output. Specifically, if the equivalence +#' class identifier reads 1|2|8, then the equivalence class is compatible with +#' the transcripts and their respective genes in rows 1, 2 and 8 of +#' `tx2gene_matched`. +#' +#' @importFrom Matrix sparseMatrix +#' @export +alevinEC <- function(paths, tx2gene, multigene = FALSE, ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, quiet = FALSE){ + + if (!requireNamespace("data.table", quietly=TRUE)) { + stop("alevinEC() requires CRAN package data.table") + } + + if(!all(file.exists(paths))){ + stop("The following paths do not exist: ", + paste(paths[which(!file.exists(paths))], "\n") + ) + } + + if(!isTRUE(all.equal(colnames(tx2gene), + c("isoform_id","gene_id")))){ + stop("tx2gene does not contain columns gene_id and isoform_id") + } + + # Read and wrangle link between genes and transcripts + n_tx <- data.table::fread(paths[1], nrows=1, + sep = " ",quote = "", header = FALSE)$V1 + tname <- data.table::fread(paths[1], skip = 3, nrows = n_tx, + sep = " ", quote = "", header = FALSE)$V1 + if (ignoreTxVersion) { + tx2gene$isoform_id <- sub("\\..*", "", tx2gene$isoform_id) + tname <- sub("\\..*", "", tname) + } else if (ignoreAfterBar) { + tx2gene$isoform_id <- sub("\\|.*", "", tx2gene$isoform_id) + tname <- sub("\\|.*", "", tname) + } + + if (!any(tname %in% tx2gene$isoform_id)) { + txFromFile <- paste0("Example IDs (file): [", + paste(head(tname,3),collapse=", "), + ", ...]") + txFromTable <- paste0("Example IDs (tx2gene): [", + paste(head(tx2gene$isoform_id,3), + collapse=", "),", ...]") + stop(paste0(" +None of the transcripts in the quantification files are present +in the first column of tx2gene. Check to see that you are using +the same annotation for both.\n\n",txFromFile,"\n\n",txFromTable, + "\n\n This can sometimes (not always) be fixed using 'ignoreTxVersion' or 'ignoreAfterBar'.\n\n")) + } + + # matching of tname and tx2gene + tx2gene <- tx2gene[match(tname, tx2gene$isoform_id),] + + ntxmissing <- sum(!tname %in% tx2gene$isoform_id) + if (ntxmissing > 0) message("transcripts missing from tx2gene: ", ntxmissing) + + # Reading and wrangling alevin output + # message("Reading and wrangling alevin output") + sample_list <- vector("list", length = length(paths)) + for (i in seq_along(paths)) { + if (!quiet) svMisc::progress(i, max.value=length(paths), + init=(i==1), gui=FALSE) + sample_list[[i]] <- readBFH(file = paths[i], + tx2gene = tx2gene, + multigene = multigene) + } + + # Get unique equivalence classes to later construct sparse matrix + unique_ecs <- unique(unlist(lapply(sample_list, "[[", 1), use.names = FALSE)) + + # Constructing sparseMatrix output + # message("Constructing sparseMatrix output \n") + mat_list <- vector("list", length = length(sample_list)) + for (i in seq_along(sample_list)) { + # if (!quiet) svMisc::progress(i, max.value=length(paths), + # init=(i==1), gui=FALSE) + # wrangle elements sample_list to use as input for sparseMatrix + ## wrangle rows + EC_names <- sample_list[[i]][[1]] + times <- sample_list[[i]][[4]] + RowIdx <- match(EC_names, unique_ecs) + RowIdx <- rep(RowIdx, times=times) + + ## wrangle columns + bc_id_to_name <- sample_list[[i]][[5]] + ColIdx <- sample_list[[i]][[2]] + + ## wrangle entries + X <- sample_list[[i]][[3]] + + # construct sparse matrix + mat_list[[i]] <- sparseMatrix( + i = RowIdx, + j = ColIdx, + x = X, + dims = c(length(unique_ecs), length(bc_id_to_name)), + dimnames = list(unique_ecs, bc_id_to_name) + ) + } + matrix_umis <- do.call(cbind, mat_list) + + rownames(tx2gene) <- seq_len(nrow(tx2gene)) + + return(list(counts = matrix_umis, tx2gene_matched = tx2gene)) +} + + +# readBFH: helper to read bfh.txt files, creating lists with 5 elements: +# (i) the names of the ECs +# (ii) the indices of the barcodes in which each of the ECs are expressed +# (iii) the UMI expression level for each EC/barcode pair +# (iv) number of barcodes in which each EC is expressed (to construct +# sparseMatrix) +# (v) the names of the barcodes +readBFH <- function(file, tx2gene, multigene){ + + numbers <- data.table::fread(file, nrows=3, sep = " ", quote = "", + header = FALSE) + n_tx <- numbers$V1[1] + n_bc <- numbers$V1[2] + + # read barcode names + bc_id_to_name <- data.table::fread(file, skip = 3+n_tx, nrows = n_bc, + sep = " ", quote = "", header = FALSE)$V1 + + # read ECs + startread <- sum(n_tx, n_bc, 3) # first line with EC info + ec_df <- data.table::fread(file, skip=startread, sep = " ", quote = "", + header = FALSE) + eccs <- strsplit(ec_df$V1,"\t",fixed=TRUE) + + # code strongly based on read_bfh() from + # https://github.com/k3yavi/vpolo/blob/master/vpolo/alevin/parser.py#L374-L444 + hlpList <- lapply(eccs, function(toks) { + + num_labels <- as.integer(toks[1]) + txps <- as.integer(toks[2:(num_labels+1)])+1 + genes <- tx2gene$gene_id[txps] + # if annotation not complete -> remove EC with any NA genes + if(any(is.na(genes))){ + return(NULL) + } + if(!multigene){ + if(length(unique(genes))>1){ + return(NULL) + } + } + + idx <- num_labels+3 + num_bcs <- as.integer(toks[idx]) # number of cells in which this ECs + # is present in this sample + + bc_names <- vector(mode = "numeric", length=num_bcs) + num_umis <- vector(mode = "numeric", length=num_bcs) + + for (bc in seq_len(num_bcs)) { + idx <- idx+1 + bc_names[bc] <- as.integer(toks[idx])+1 + idx <- idx+1 + num_umi <- as.integer(toks[idx]) + num_umis[bc] <- num_umi + idx <- idx+(2*num_umi) + } + + times <- length(bc_names) + return(list(txps, bc_names, num_umis, times)) + }) + hlpList <- hlpList[!sapply(hlpList, is.null)] + + # wrangle + bc_names <- unlist(lapply(hlpList, `[[`, 2)) + num_umis <- unlist(lapply(hlpList, `[[`, 3)) + times <- unlist(lapply(hlpList, `[[`, 4)) + + EC_names <- unlist(lapply(lapply(hlpList, `[[`, 1), paste, collapse="|")) + + return(list(EC_names, bc_names, num_umis, times, bc_id_to_name)) +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/alevin-loadFry.R r-bioc-fishpond-2.2.0+ds/R/alevin-loadFry.R --- r-bioc-fishpond-2.0.1+ds/R/alevin-loadFry.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/alevin-loadFry.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,452 @@ +#' Load in data from alevin-fry USA mode +#' +#' Enables easy loading of sparse data matrices provided by +#' alevin-fry USA mode. +#' +#' @param fryDir path to the output directory returned by +#' alevin-fry quant command. This directory should contain a +#' \code{metainfo.json}, and an alevin folder which contains +#' \code{quants_mat.mtx}, \code{quants_mat_cols.txt} and +#' \code{quants_mat_rows.txt} +#' @param outputFormat can be \emph{either} be a list that defines the +#' desired format of the output \code{SingleCellExperiment} object +#' \emph{or} a string that represents one of the pre-defined output +#' formats, which are "scRNA", "snRNA", "scVelo" and "velocity". +#' See details for the explainations of the pre-defined formats and +#' how to define custom format. +#' @param nonzero whether to filter cells with non-zero expression +#' value across all genes (default \code{FALSE}). +#' If \code{TRUE}, this will filter based on all assays. +#' If a string vector of assay names, it will filter based +#' on the matching assays in the vector. +#' If not in USA mode, it must be TRUE/FALSE/counts. +#' @param quiet logical whether to display no messages +#' +#' @section Details about \code{loadFry}: +#' This function consumes the result folder returned by running +#' alevin-fry quant in unspliced, spliced, ambiguous (USA) +#' quantification mode, and returns a \code{SingleCellExperiement} object +#' that contains a final count for each gene within each cell. In +#' USA mode, alevin-fry quant returns a count matrix contains three +#' types of count for each feature (gene) within each sample (cell +#' or nucleus), which represent the spliced mRNA count of the gene (S), +#' the unspliced mRNA count of the gene (U), and the count of UMIs whose +#' splicing status is ambiguous for the gene (A). For each assay +#' defined by \code{outputFormat}, these three counts of a gene +#' within a cell will be summed to get the final count of the gene +#' according to the rule defined in the \code{outputFormat}. The +#' returned object will contains the desired assays defined by +#' \code{outputFormat}, with rownames as the barcode of samples and +#' colnames as the feature names. +#' +#' @section Details about the output format: +#' The \code{outputFormat} argument takes \emph{either} be a list that defines +#' the desired format of the output +#' \code{SingleCellExperiment} object \emph{or} a string that represents one of +#' the pre-defined output format. +#' +#' Currently the pre-defined formats +#' of the output \code{SingleCellExperiment} object are: +#' \describe{ +#' \item{"scRNA":}{This format is recommended for single cell experiments. +#' It returns a \code{counts} assay that contains the S+A count of each gene in each cell.} +#' \item{"snRNA":}{This format is recommended for single nucleus experiments. +#' It returns a \code{counts} assay that contains the U+S+A count of each gene in each cell.} +#' \item{"raw":}{This format put the three kinds of counts into three separate assays, +#' which are \code{unspliced}, \code{spliced} and \code{ambiguous}.} +#' \item{"velocity":}{This format contains two assays. +#' The \code{spliced} assay contains the S+A count of each gene in each cell. +#' The \code{unspliced} assay contains the U counts of each gene in each cell.} +#' \item{"scVelo":}{This format is for direct entry into velociraptor R package or +#' other scVelo downstream analysis pipeline for velocity +#' analysis in R with Bioconductor. It adds the expected +#' "S"-pliced assay and removes errors for size factors being +#' non-positive.} +#' } +#' +#' A custom output format can be defined using a list. Each element in the list +#' defines an assay in the output \code{SingleCellExperiment} object. +#' The name of an element in the list will be the name of the corresponding +#' assay in the output object. Each element in the list should be defined as +#' a vector that takes at least one of the three kinds of count, which are U, S and A. +#' See the provided toy example for defining a custom output format. +#' +#' @section Details about \code{load_fry_raw}: +#' This function processes alevin-fry's quantification result +#' contained within the input folder.This function returns a list +#' that consists of the gene count matrix, the gene names list, the +#' barcode list, and some metadata, such as the number of genes in +#' the experiment and whether alevin-fry was executed in USA +#' mode. In the returned list, the all-in-one count matrix, +#' \code{count_mat}, returned from the USA mode of alevin-fry +#' consists of the spliced count of genes defined in +#' \code{gene.names} for all barcodes defined in \code{barcodes}, +#' followed by the unspliced count of genes in the same order for +#' all cells, then followed by the ambiguous count of genes in the +#' same order for all cells. +#' +#' @return A \code{SingleCellExperiment} object that contains one +#' or more assays. Each assay consists of a gene by cell count matrix. +#' The row names are feature names, and the column names are cell +#' barcodes +#' +#' @references +#' +#' alevin-fry publication: +#' +#' He, D., Zakeri, M., Sarkar, H. et al. "Alevin-fry unlocks rapid, accurate and +#' memory-frugal quantification of single-cell RNA-seq data." +#' Nature Methods 19, 316–322 (2022). +#' \url{https://doi.org/10.1038/s41592-022-01408-3} +#' +#' @examples +#' +#' # Get path for minimal example avelin-fry output dir +#' testdat <- fishpond:::readExampleFryData("fry-usa-basic") +#' +#' # This is exactly how the velocity format defined internally. +#' custom_velocity_format <- list("spliced"=c("S","A"), "unspliced"=c("U")) +#' +#' # Load alevin-fry gene quantification in velocity format +#' sce <- loadFry(fryDir=testdat$parent_dir, outputFormat=custom_velocity_format) +#' SummarizedExperiment::assayNames(sce) +#' +#' # Load the same data but use pre-defined, velociraptor R pckage desired format +#' scvelo_format <- "scVelo" +#' +#' scev <- loadFry(fryDir=testdat$parent_dir, outputFormat=scvelo_format, nonzero=TRUE) +#' SummarizedExperiment::assayNames(scev) +#' +#' @author Dongze He, with contributions from Steve Lianoglou, Wes Wilson +#' +#' @export +loadFry <- function(fryDir, + outputFormat = "scRNA", + nonzero = FALSE, + quiet = FALSE) { + + # load in fry result + fry.raw <- load_fry_raw(fryDir, quiet) + meta.data <- fry.raw$meta.data + + + # if in usa.mode, sum up counts in different status according to which.counts + if (meta.data$usa.mode) { + # preparation + predefined.format <- list("scrna" = list("counts" = c("S", "A")), + "snrna" = list("counts" = c("U", "S", "A")), + "velocity" = list("spliced" = c("S", "A"), "unspliced" = c("U")), + "scvelo" = list("counts" = c("S", "A"), "spliced" = c("S", "A"), "unspliced" = c("U")), + "raw" = list("spliced" = c("S"), "unspliced" = c("U"), "ambiguous" = c("A")) + ) + valid.counts <- c("U", "S", "A") + + # check outputFormat + if (is.character(outputFormat)) { + outputFormat = tolower(outputFormat) + # Check whether outputFormat is a predefined format + if (! (outputFormat %in% names(predefined.format))) { + stop("Provided outputFormat string is invalid. Please check the function description +for the list of predifined format") + } + + if (!quiet) { + message("Using pre-defined output format: ", outputFormat) + } + + # get the assays + output.assays <- predefined.format[[outputFormat]] + + } else if (is.list(outputFormat)) { + # check whether user-defined assays are all + for (assay.name in names(outputFormat)) { + if (is.null(outputFormat[[assay.name]])) { + stop(paste0("The provided assay '", assay.name, "' is empty. Please remove it")) + } + else if (!all(outputFormat[[assay.name]] %in% valid.counts)) { + stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay") + } + } + if (!all(unlist(outputFormat) %in% valid.counts)) { + stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay") + } + + output.assays <- outputFormat + + if (!quiet) { + message("Using user-defined output assays") + } + } + + # If we are here, the output.assays is valid. + # then we check the assay names in nonzero + if (is.logical(nonzero)) { + if (nonzero) { + nonzero <- names(output.assays) + } else { + if (is.character(outputFormat) && outputFormat == "scvelo") { + nonzero <- c("counts") + } else { + nonzero <- c() + } + } + } else if (is.character(nonzero)) { + if (length(nonzero) > 0) { + for (idx in seq_along(nonzero)) { + if (!nonzero[idx] %in% names(output.assays)) { + warning(paste0("In the provided nonzero vector, '", + nonzero[idx], + "' is not one of the output assays, ignored")) + nonzero <- nonzero[-idx] + } + } + } + } else { + warning("Invalid nonzero, ignored") + nonzero <- c() + } + + # assembly + alist <- vector(mode = "list", length = length(output.assays)) + names(alist) <- names(output.assays) + ng <- meta.data$num.genes + rd <- list("S" = seq(1, ng), "U" = seq(ng + 1, 2 * ng), + "A" = seq(2 * ng + 1, 3 * ng)) + # fill in each assay + for (assay.name in names(output.assays)) { + which.counts <- output.assays[[assay.name]] + alist[[assay.name]] <- fry.raw$count.mat[, rd[[which.counts[1]]], drop = FALSE] + if (length(which.counts) > 1) { + # build assay + for (wc in which.counts[-1]) { + alist[[assay.name]] <- alist[[assay.name]] + fry.raw$count.mat[, rd[[wc]], drop = FALSE] + } + } + alist[[assay.name]] <- t(alist[[assay.name]]) + if (!quiet) { + message(paste(c(paste0("Building the '", + assay.name, + "' assay, which contains"), + which.counts), + collapse = " ")) + } + } + } else { + # not in USA mode + if (!quiet) { + message("Not in USA mode, set assay name as \"counts\"") + } + + if (is.logical(nonzero)) { + if (nonzero) { + nonzero <- c("counts") + } else { + nonzero <- c() + } + } else { + if (nonzero != "counts") { + message("Not in USA mode, nonzero must be TRUE/FALSE/counts") + message("Set nonzero as FALSE") + nonzero <- c() + } else { + nonzero <- c("counts") + } + } + + # define output matrix + alist <- list(counts = t(fry.raw$count.mat)) + } + + if (!quiet) { + message("Constructing output SingleCellExperiment object") + } + + # create SingleCellExperiment object + sce <- SingleCellExperiment(alist, + colData = fry.raw$barcodes, + rowData = fry.raw$gene.names) + + # filter all zero cells in zero, one or multiple assays + for (assay.name in nonzero) { + sce <- sce[, colSums(assay(sce, assay.name)) > 0] + } + + if (!quiet) { + message("Done") + } + + sce +} + +load_fry_raw <- function(fryDir, quiet = FALSE) { + # Check `fryDir` is legit + if (!quiet) { + message("locating quant file") + } + quant.file <- file.path(fryDir, "alevin", "quants_mat.mtx") + if (!file.exists(quant.file)) { + stop("The `fryDir` directory provided does not look like a directory generated from alevin-fry:\n", + sprintf("Missing quant file: %s", quant.file) + ) + } + + # Since alevin-fry 0.4.1, meta_info.json is changed to quant.json, we check both + # read in metadata + qfile <- file.path(fryDir, "quant.json") + if (!file.exists(qfile)) { + qfile <- file.path(fryDir, "meta_info.json") + } + + # read in metadata + meta.info <- fromJSON(qfile) + ng <- meta.info$num_genes + usa.mode <- meta.info$usa_mode + + if (!quiet) { + message("Reading meta data") + message(paste0("USA mode: ", usa.mode)) + } + + # if usa mode, each gene gets 3 rows, so the actual number of genes is ng/3 + if (usa.mode) { + if (ng %% 3 != 0) { + stop("The number of quantified targets is not a multiple of 3") + } + ng <- as.integer(ng/3) + } + + # read in count matrix, gene names, and barcodes + count.mat <- as(readMM(file = quant.file), "dgCMatrix") + afg <- read.table(file.path(fryDir, "alevin", "quants_mat_cols.txt"), + strip.white = TRUE, header = FALSE, nrows = ng, + col.names = c("gene_ids")) + rownames(afg) <- afg$gene_ids + afc <- read.table(file.path(fryDir, "alevin", "quants_mat_rows.txt"), + strip.white = TRUE, header = FALSE, + col.names = c("barcodes")) + rownames(afc) <- afc$barcodes + + if (!quiet) { + message(paste("Processing", ng, "genes", "and", nrow(count.mat), "barcodes")) + } + + list(count.mat = count.mat, gene.names = afg, barcodes = afc, meta.data = list(num.genes = ng, usa.mode = usa.mode)) +} + +# Functions to help with running or creating test data +# none of these functions are exported on purpose + +# Example alevin-fry quant dataset --------------------------------------------- +# +# These methods provide an orthologous way to create and read in test examples +# from alevin outputs than what is implemented in loadFry(). +# +# The functions provide a mechanism that enables you to define the sample output +# in a plain text matrix format (data.frame) named `example-dat.csv` and +# place that in a new directory under extdata/alevin/example-quants. +# +# Look at the `extdata/alevin/example-quants/fry-usa-basic/example-dat.csv` for +# an example data file that created a sample `salmon alevin-fry quant` directory +# we can use to test `loadFry` against. +# +# Once we created the extdata/alevin/example-quants/fry-usa-basic directory +# and put the the example-dat.csv file in it, we then run the following command +# create all of the *.mtx and other files to make this look like an alevin +# output directoy. +# +# ```{r} +# devtools::load_all(".") +# writeExampleFryDat("fry-usa-basic") +# ``` + +#' Loads an example data matrix from a csv data from a top-level example +#' fry output directory. +#' +#' @noRd +#' @param example_name the name of the folder that holds the example data. +#' @return a list of primitive data types you can use to serialize a mock +#' output dir from a `salmon alevin-fry quant` run. +readExampleFryData <- function(example_name = "fry-usa-basic", usa = TRUE, ...) { + example_dir <- system.file("extdata", "alevin", "example-quants", + example_name, package = "fishpond", + mustWork = TRUE) + dat <- read.csv(file.path(example_dir, "example-dat.csv"), + strip.white = TRUE, comment.char = "#") + m <- as.matrix(dat) + dimnames(m) <- list(NULL, NULL) + + if (usa) { + genes <- unique(sub("_.*$", "", colnames(dat))) + } else { + genes <- colnames(dat) + } + + out <- list( + parent_dir = example_dir, + matrix = m, + barcodes = rownames(dat), + genes = colnames(dat)) + + if (usa) { + out$genes <- unique(sub("_.*$", "", out$genes)) + out$usa <- list( + U = grep("_U$", colnames(dat)), + S = grep("_S$", colnames(dat)), + A = grep("_A$", colnames(dat))) + } + + out +} + +#' Serializes example fry output data from an `example-fry-dat.csv` as if +#' it were produced from an alevin-fry quant run +#' +#' @noRd +#' @param x the name of the top level directory in +#' `extdata/alevin/example-fry-USA-quants` or a list result from calling +#' `readExampleFryData` +writeExampleFryDat <- function(x = "fry-usa-basic", ...) { + if (is.character(x)) { + x <- readExampleFryData(x) + } + stopifnot( + is.list(x), + all(c("matrix", "genes", "barcodes", "parent_dir") %in% names(x))) + if (is.null(x$usa)) { + stop("Haven't kicked the tires on a non USA like output") + } + + out.dir <- file.path(x$parent_dir, "alevin") + + if (!dir.exists(out.dir)) { + stopifnot(dir.create(out.dir)) + } + + m <- Matrix::Matrix(x$matrix, sparse = TRUE) + m <- as(m, "dgCMatrix") + Matrix::writeMM(m, file.path(out.dir, "quants_mat.mtx")) + + write.table( + x$genes, + file = file.path(out.dir, "quants_mat_cols.txt"), + quote = FALSE,col.names = FALSE, row.names = FALSE, sep = "\t") + + write.table( + x$barcodes, + file = file.path(out.dir, "quants_mat_rows.txt"), + quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t") + + # create metadata + meta_info = list() + meta_info[["alt_resolved_cell_numbers"]] = list() + meta_info[["cmd"]] = "" + meta_info[["dump_eq"]] = FALSE + meta_info[["num_genes"]] = length(x$genes) * ifelse(is.null(x$usa), 1, 3) + meta_info[["num_quantified_cells"]] = length(x$barcodes) + meta_info[["resolution_strategy"]] = "CellRangerLike" + meta_info[["usa_mode"]] = TRUE + + write( + jsonlite::toJSON(meta_info, pretty=TRUE), + file = file.path(x$parent_dir, "meta_info.json")) +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/alevin-readEDS.R r-bioc-fishpond-2.2.0+ds/R/alevin-readEDS.R --- r-bioc-fishpond-2.0.1+ds/R/alevin-readEDS.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/alevin-readEDS.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,14 @@ +#' readEDS - a utility function for quickly reading in Alevin's EDS format +#' +#' @param numOfGenes number of genes +#' @param numOfOriginalCells number of cells +#' @param countMatFilename pointer to the EDS file, \code{quants_mat.gz} +#' @param tierImport whether the \code{countMatFilename} refers to a quants tier file +#' +#' @return a genes x cells sparse matrix, of the class \code{dgCMatrix} +#' +#' @useDynLib fishpond +#' @export +readEDS <- function(numOfGenes, numOfOriginalCells, countMatFilename, tierImport=FALSE) { + getSparseMatrix(numOfGenes, numOfOriginalCells, path.expand(countMatFilename), tierImport) +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/alevinTestHelpers.R r-bioc-fishpond-2.2.0+ds/R/alevinTestHelpers.R --- r-bioc-fishpond-2.0.1+ds/R/alevinTestHelpers.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/alevinTestHelpers.R 1970-01-01 00:00:00.000000000 +0000 @@ -1,117 +0,0 @@ -# Functions to help with running or creating test data -# none of these functions are exported on purpose - -# Example alevin-fry quant dataset --------------------------------------------- -# -# These methods provide an orthologous way to create and read in test examples -# from alevin outputs than what is implemented in loadFry(). -# -# The functions provide a mechanism that enables you to define the sample output -# in a plain text matrix format (data.frame) named `example-dat.csv` and -# place that in a new directory under extdata/alevin/example-quants. -# -# Look at the `extdata/alevin/example-quants/fry-usa-basic/example-dat.csv` for -# an example data file that created a sample `salmon alevin-fry quant` directory -# we can use to test `loadFry` against. -# -# Once we created the extdata/alevin/example-quants/fry-usa-basic directory -# and put the the example-dat.csv file in it, we then run the following command -# create all of the *.mtx and other files to make this look like an alevin -# output directoy. -# -# ```{r} -# devtools::load_all(".") -# writeExampleFryDat("fry-usa-basic") -# ``` - -#' Loads an example data matrix from a csv data from a top-level example -#' fry output directory. -#' -#' @noRd -#' @param example_name the name of the folder that holds the example data. -#' @return a list of primitive data types you can use to serialize a mock -#' output dir from a `salmon alevin-fry quant` run. -readExampleFryData <- function(example_name = "fry-usa-basic", usa = TRUE, ...) { - example_dir <- system.file("extdata", "alevin", "example-quants", - example_name, package = "fishpond", - mustWork = TRUE) - dat <- read.csv(file.path(example_dir, "example-dat.csv"), - strip.white = TRUE, comment.char = "#") - m <- as.matrix(dat) - dimnames(m) <- list(NULL, NULL) - - if (usa) { - genes <- unique(sub("_.*$", "", colnames(dat))) - } else { - genes <- colnames(dat) - } - - out <- list( - parent_dir = example_dir, - matrix = m, - barcodes = rownames(dat), - genes = colnames(dat)) - - if (usa) { - out$genes <- unique(sub("_.*$", "", out$genes)) - out$usa <- list( - U = grep("_U$", colnames(dat)), - S = grep("_S$", colnames(dat)), - A = grep("_A$", colnames(dat))) - } - - out -} - -#' Serializes example fry output data from an `example-fry-dat.csv` as if -#' it were produced from an alevin-fry quant run -#' -#' @noRd -#' @param x the name of the top level directory in -#' `extdata/alevin/example-fry-USA-quants` or a list result from calling -#' `readExampleFryData` -writeExampleFryDat <- function(x = "fry-usa-basic", ...) { - if (is.character(x)) { - x <- readExampleFryData(x) - } - stopifnot( - is.list(x), - all(c("matrix", "genes", "barcodes", "parent_dir") %in% names(x))) - if (is.null(x$usa)) { - stop("Haven't kicked the tires on a non USA like output") - } - - out.dir <- file.path(x$parent_dir, "alevin") - - if (!dir.exists(out.dir)) { - stopifnot(dir.create(out.dir)) - } - - m <- Matrix::Matrix(x$matrix, sparse = TRUE) - m <- as(m, "dgCMatrix") - Matrix::writeMM(m, file.path(out.dir, "quants_mat.mtx")) - - write.table( - x$genes, - file = file.path(out.dir, "quants_mat_cols.txt"), - quote = FALSE,col.names = FALSE, row.names = FALSE, sep = "\t") - - write.table( - x$barcodes, - file = file.path(out.dir, "quants_mat_rows.txt"), - quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t") - - # create metadata - meta_info = list() - meta_info[["alt_resolved_cell_numbers"]] = list() - meta_info[["cmd"]] = "" - meta_info[["dump_eq"]] = FALSE - meta_info[["num_genes"]] = length(x$genes) * ifelse(is.null(x$usa), 1, 3) - meta_info[["num_quantified_cells"]] = length(x$barcodes) - meta_info[["resolution_strategy"]] = "CellRangerLike" - meta_info[["usa_mode"]] = TRUE - - write( - jsonlite::toJSON(meta_info, pretty=TRUE), - file = file.path(x$parent_dir, "meta_info.json")) -} diff -Nru r-bioc-fishpond-2.0.1+ds/R/fishpond.R r-bioc-fishpond-2.2.0+ds/R/fishpond.R --- r-bioc-fishpond-2.0.1+ds/R/fishpond.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/fishpond.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,67 @@ +#' Fishpond: downstream methods and tools for expression data +#' +#' This package provides statistical methods and other tools for +#' working with Salmon and Alevin quantification of RNA-seq data. +#' Fishpond contains the Swish non-parametric method for +#' detecting differential transcript expression (DTE). Swish can +#' also be used to detect differential gene expresion (DGE), +#' to perform allelic analysis, or to assess changes in isoform +#' proportions. +#' +#' The main Swish functions are: +#' \itemize{ +#' \item \code{\link{scaleInfReps}} - scaling transcript or gene expression data +#' \item \code{\link{labelKeep}} - labelling which features have sufficient counts +#' \item \code{\link{swish}} - perform non-parametric differential analysis +#' \item Plots, e.g., \code{\link{plotMASwish}}, \code{\link{plotInfReps}} +#' } +#' +#' All software-related questions should be posted to the Bioconductor Support Site: +#' +#' \url{https://support.bioconductor.org} +#' +#' The code can be viewed at the GitHub repository, +#' which also lists the contributor code of conduct: +#' +#' \url{https://github.com/mikelove/fishpond} +#' +#' @references +#' +#' Swish method: +#' +#' Zhu, A., Srivastava, A., Ibrahim, J.G., Patro, R., Love, M.I. (2019) +#' Nonparametric expression analysis using inferential replicate counts. +#' Nucleic Acids Research. +#' \url{https://doi.org/10.1093/nar/gkz622} +#' +#' Compression, makeInfReps and splitSwish: +#' +#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., +#' Patro, R., Love, M.I. (2020) +#' Compression of quantification uncertainty for scRNA-seq counts. +#' bioRxiv. +#' \url{https://doi.org/10.1101/2020.07.06.189639} +#' +#' @importFrom graphics plot points segments rect abline title axis legend +#' @importFrom stats median quantile qnorm rpois runif rnbinom var cor +#' @importFrom utils head tail capture.output read.table write.table read.csv +#' @importFrom methods is as slot +#' @importFrom SummarizedExperiment SummarizedExperiment +#' assayNames assayNames<- assay assay<- assays assays<- +#' colData colData<- mcols mcols<- rowRanges rowRanges<- +#' @importFrom SingleCellExperiment SingleCellExperiment +#' @importFrom IRanges IntegerList +#' @importFrom S4Vectors DataFrame metadata metadata<- +#' @importFrom GenomicRanges start end strand width +#' start<- end<- strand<- resize flank sort seqnames +#' @importFrom gtools permutations +#' @importFrom Matrix rowSums readMM t colSums +#' @importFrom matrixStats rowRanks rowMedians rowVars rowQuantiles +#' @importFrom svMisc progress +#' @importFrom jsonlite fromJSON +#' +#' @docType package +#' @name fishpond-package +#' @aliases fishpond-package +#' @keywords package +NULL diff -Nru r-bioc-fishpond-2.0.1+ds/R/helper-allelic.R r-bioc-fishpond-2.2.0+ds/R/helper-allelic.R --- r-bioc-fishpond-2.0.1+ds/R/helper-allelic.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/helper-allelic.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,673 @@ +#' Import allelic counts as a SummarizedExperiment +#' +#' Read in Salmon quantification of allelic counts from a +#' diploid transcriptome. Assumes that diploid transcripts +#' are marked with the following suffix: an underscore and +#' a consistent symbol for each of the two alleles, +#' e.g. \code{ENST123_M} and \code{ENST123_P}, +#' or \code{ENST123_alt} and \code{ENST123_ref}, etc. +#' \code{importAllelicCounts} requires the tximeta package. +#' Further information in Details below. +#' +#' \strong{Requirements} - There must be exactly two alleles for each +#' transcript, and the \code{--keep-duplicates} option should be used +#' in Salmon indexing to avoid removal of transcripts with identical +#' sequence. The output object has half the number of transcripts, +#' with the two alleles either stored in a \code{"wide"} object, or as +#' re-named \code{"assays"}. Note carefully that the symbol provided +#' to \code{a1} is used as the effect allele, and \code{a2} is used as +#' the non-effect allele (see the \code{format} argument description +#' and Value description below). +#' +#' \strong{tx2gene} - The two columns should include the \code{a1} and +#' \code{a2} suffix for the transcripts and genes/groups, or those +#' will be added internally, if it is detected that the first +#' transcript does not have these suffices. For example if +#' \code{_alt} or \code{_ref}, or \code{_M} or \code{_P} (as indicated +#' by the \code{a1} and \code{a2} arguments) are not present in the +#' table, the table rows will be duplicated with those suffices added +#' on behalf of the user. If \code{tx2gene} is not provided, the +#' output object will be transcript-level. Do not attempt to set the +#' \code{txOut} argument, it will conflict with internal calls to +#' downstream functions. If the a1/a2 suffices are not at the end of +#' the transcript name in the quantification files, +#' e.g. \code{ENST123_M|}, then \code{ignoreAfterBar=TRUE} +#' can be used to match regardless of the string following \code{|} in +#' the quantification files. +#' +#' \code{skipMeta=TRUE} is used, as it is assumed the diploid +#' transcriptome does not match any reference transcript +#' collection. This may change in future iterations of the function, +#' depending on developments in upstream annotations and software. +#' +#' If \code{tx2gene} is a GRanges object, the rowRanges of the output +#' will be the reduced ranges of the grouped input ranges, with +#' \code{tx_id} collapsed into a CharacterList. Other metadata columns +#' are not manipulated, just the metadata for the first range is +#' returned. +#' +#' @param coldata a data.frame as used in \code{tximeta} +#' @param a1 the symbol for the effect allele +#' @param a2 the symbol for the non-effect allele +#' @param format either \code{"wide"} or \code{"assays"} for whether +#' to combine the allelic counts as columns (wide) or put the allelic +#' count information in different assay slots (assays). +#' For wide output, the data for the non-effect allele (a2) comes first, +#' then the effect allele (a1), e.g. \code{[a2 | a1]}. The \code{ref} level +#' of the factor variable \code{se$allele} will be \code{"a2"} +#' (so by default comparisons will be: a1 vs a2). +#' For assays output, all of the original matrices are renamed with a prefix, +#' either \code{a1-} or \code{a2-}. +#' @param tx2gene optional, a data.frame with first column indicating +#' transcripts, second column indicating genes (or any other transcript +#' grouping). Alternatively, this can be a GRanges object with +#' required columns \code{tx_id}, and \code{group_id} +#' (see \code{makeTx2Tss}). For more information on this argument, +#' see Details. +#' @param ... any arguments to pass to tximeta +#' +#' @return a SummarizedExperiment, with allele counts (and other data) +#' combined into a wide matrix \code{[a2 | a1]}, or as assays (a1, then a2). +#' The original strings associated with a1 and a2 are stored in the +#' metadata of the object, in the \code{alleles} list element. +#' Note the reference level of \code{se$allele} will be \code{"a2"}, +#' such that comparisons by default will be a1 vs a2 +#' (effect vs non-effect). +#' +#' @export +importAllelicCounts <- function(coldata, a1, a2, + format=c("wide","assays"), + tx2gene=NULL, ...) { + format <- match.arg(format) + if (!requireNamespace("tximeta", quietly=TRUE)) { + stop("this function requires installing the Bioconductor package 'tximeta'") + } + + a1match <- paste0("_",a1,"$") + a2match <- paste0("_",a2,"$") + + # output transcripts if no tx2gene provided + txOut <- is.null(tx2gene) + # if tx2gene is ranges, some extra steps to modify output rowRanges + t2gRanges <- is(tx2gene, "GRanges") + if (!txOut) { + # if tx2gene is ranges, then pull out the table for collapsing + if (t2gRanges) { + cols <- c("tx_id","group_id") + stopifnot(all(cols %in% names(mcols(tx2gene)))) + # swap around variable names to run the data.frame-based code + txps <- tx2gene + tx2gene <- mcols(txps)[,cols] + } + stopifnot(ncol(tx2gene) == 2) + # see if tx2gene already has tagged the txps and genes + diploid <- FALSE # save whether tx2gene is diploid + if (grepl(a1match, tx2gene[1,1]) | grepl(a2match, tx2gene[1,1])) { + # ensure same number of a1 and a2 alleles in txps and genes + sum1txp <- sum(grepl(a1match, tx2gene[,1])) + sum2txp <- sum(grepl(a2match, tx2gene[,1])) + sum1gene <- sum(grepl(a1match, tx2gene[,2])) + sum2gene <- sum(grepl(a2match, tx2gene[,2])) + stopifnot(sum1txp > 0 & sum2txp > 0 & sum1gene > 0 & sum2gene > 0) + stopifnot(sum1txp == sum2txp) + stopifnot(sum1gene == sum2gene) + diploid <- TRUE + } else { + a2a1_vec <- rep(c(a2, a1), each=nrow(tx2gene)) + tx2gene <- data.frame( + tx=paste0(rep(tx2gene[,1], 2), "_", a2a1_vec), + gene=paste0(rep(tx2gene[,2], 2), "_", a2a1_vec) + ) + } + } + + se <- tximeta::tximeta(coldata, skipMeta=TRUE, tx2gene=tx2gene, txOut=txOut, ...) + + # remove any characters after "|" + rownames(se) <- sub("\\|.*", "", rownames(se)) + + ntxp <- nrow(se)/2 + n <- ncol(se) + + # gather transcript names for a1 and a2 alleles + txp_nms_a1 <- grep(a1match, rownames(se), value=TRUE) + stopifnot(length(txp_nms_a1) == ntxp) + txp_nms_a2 <- sub(paste0("_",a1),paste0("_",a2),txp_nms_a1) + stopifnot(all(txp_nms_a2 %in% rownames(se))) + stopifnot(length(txp_nms_a1) == length(txp_nms_a2)) + txp_nms <- sub(paste0("_",a1),"",txp_nms_a1) + + if (format == "wide") { + coldata_wide <- data.frame( + allele=factor(rep(c("a2","a1"), each=n), levels=c("a2","a1")) + ) + # add any other covariate data + for (v in setdiff(names(coldata), c("files","names"))) { + coldata_wide[[v]] <- coldata[[v]] + } + rownames(coldata_wide) <- paste0(colnames(se), "-", coldata_wide$allele) + + assays_wide <- lapply(assays(se), function(a) { + a_wide <- cbind(a[txp_nms_a2,], a[txp_nms_a1,]) + rownames(a_wide) <- txp_nms + colnames(a_wide) <- rownames(coldata_wide) + a_wide + }) + # make a new SE + out <- SummarizedExperiment(assays=assays_wide, + colData=coldata_wide) + metadata(out) <- c(metadata(se), list(alleles=c(a1=a1, a2=a2))) + } else if (format == "assays") { + se_a1 <- se[txp_nms_a1,] + se_a2 <- se[txp_nms_a2,] + rownames(se_a1) <- txp_nms + rownames(se_a2) <- txp_nms + # rename the assays + assayNames(se_a1) <- paste0("a1-", assayNames(se_a1)) + assayNames(se_a2) <- paste0("a2-", assayNames(se_a2)) + # add the a2 matrices to the a1 SE object + assays(se_a1) <- c(assays(se_a1), assays(se_a2)) + metadata(se_a1) <- c(metadata(se_a1), list(alleles=c(a1=a1, a2=a2))) + out <- se_a1 + } + # extra steps to assign rowRanges to output + if (t2gRanges) { + if (diploid) { + txps <- txps[grepl(a1match, names(txps))] + names(txps) <- sub(a1match,"",names(txps)) + mcols(txps)$tx_id <- sub(a1match,"",mcols(txps)$tx_id) + mcols(txps)$group_id <- sub(a1match,"",mcols(txps)$group_id) + } + stopifnot(all(rownames(out) %in% mcols(txps)$group_id)) + tx_list <- CharacterList(split(mcols(txps)$tx_id, mcols(txps)$group_id)) + # detect if the TSS have been grouped by `maxgap` + # (the group_id will not be equal to "gene-tss") + tss_grouped <- mcols(txps)$group_id[1] != paste0(mcols(txps)$gene_id[1], + "-", mcols(txps)$tss[1]) + # want to save individual TSS information in a list + if (tss_grouped) { + tss_list <- IntegerList(split(mcols(txps)$tss, mcols(txps)$group_id)) + } + tx_starts <- sapply(split(start(txps), mcols(txps)$group_id), min) + tx_ends <- sapply(split(end(txps), mcols(txps)$group_id), max) + txps <- txps[!duplicated(mcols(txps)$group_id)] + names(txps) <- mcols(txps)$group_id + start(txps) <- tx_starts[names(txps)] + end(txps) <- tx_ends[names(txps)] + txps <- txps[rownames(out)] + mcols(txps)$tx_id <- tx_list[rownames(out)] + if (tss_grouped) { + mcols(txps)$tss <- tss_list[rownames(out)] + } + SummarizedExperiment::rowRanges(out) <- txps + } + out +} + +#' Make a GRanges linking transcripts to TSS within gene +#' +#' This helper function takes either a TxDb/EnsDb or +#' GRanges object as input and outputs a GRanges object +#' where transcripts are aggregated to the gene + TSS +#' (transcription start site). For nearby TSS that should +#' be grouped together, see \code{maxgap}. +#' +#' @param x either TxDb/EnsDb or GRanges object. The GRanges +#' object should have metadata columns \code{tx_id} and +#' \code{gene_id} +#' @param maxgap integer, number of basepairs to use determining +#' whether to combine nearby TSS +#' +#' @return GRanges with columns \code{tx_id}, \code{tss}, and +#' \code{group_id} +#' +#' @examples +#' \dontrun{ +#' library(EnsDb.Hsapiens.v86) +#' edb <- EnsDb.Hsapiens.v86 +#' t2t <- makeTx2Tss(edb) +#' } +#' +#' @export +makeTx2Tss <- function(x, maxgap=0) { + if (is(x, "GRanges")) { + txps <- x + if (!all(c("tx_id","gene_id") %in% names(mcols(txps)))) { + stop("'tx_id' and 'gene_id' must be mcols of 'x'") + } + } else if (is(x, "TxDb")) { + if (!requireNamespace("GenomicFeatures", quietly=TRUE)) { + stop("'x' as TxDb requires GenomicFeatures") + } + if (!requireNamespace("AnnotationDbi", quietly=TRUE)) { + stop("'x' as TxDb requires AnnotationDbi") + } + txps <- GenomicFeatures::transcripts(x) + mcols(txps)$tx_id <- mcols(txps)$tx_name + tx_id_stripped <- sub("\\..*", "", mcols(txps)$tx_id) + mcols(txps)$gene_id <- AnnotationDbi::mapIds(x, tx_id_stripped, "GENEID", "TXNAME") + # if any missing gene IDs, just propagate the txp IDs + missing.idx <- is.na(mcols(txps)$gene_id) + mcols(txps)$gene_id[ missing.idx ] <- mcols(txps)$tx_id[ missing.idx ] + } else if (is(x, "EnsDb")) { + if (!requireNamespace("ensembldb", quietly=TRUE)) { + stop("'x' as EnsDb requires ensembldb") + } + txps <- ensembldb::transcripts(x) + } else { + stop("'x' should be a GRanges or TxDb/EnsDb") + } + tss_ranges <- GenomicRanges::resize(txps, width=1) + tss <- GenomicRanges::start(tss_ranges) + mcols(txps)$tss <- tss + if (maxgap == 0) { + # group ID is just gene ID + TSS + mcols(txps)$group_id <- paste0(mcols(txps)$gene_id, "-", mcols(txps)$tss) + } else { + # in this case, we will number the TSS groups with 1,2,... + # nearby TSS are grouped together according to maxgap + tss_ranges <- sort(tss_ranges) + txps <- txps[ names(tss_ranges) ] + txps <- txps[ order(mcols(txps)$gene_id) ] + sp_tss <- split(mcols(txps)$tss, mcols(txps)$gene_id) + tss_groups <- lapply(sp_tss, joinNearbyTss, maxgap) + mcols(txps)$group_id <- paste0(mcols(txps)$gene_id, "-", unlist(tss_groups)) + } + txps +} + +joinNearbyTss <- function(tss, maxgap) { + delta <- diff(tss, lag=1) + new_tss_loc <- as.integer(delta > maxgap) + tss_loc <- c(1,cumsum(new_tss_loc) + 1) + tss_loc +} + +#' Plot allelic counts in a gene context using Gviz +#' +#' Plot allelic data (allelic proportions, isoform propostions) +#' in a gene context leveraging the Gviz package. See the allelic +#' vignette for example usage. TPM and count filters are used by +#' default to clean up the plot of features with minimal signal; +#' note that the isoform proportion displayed at the bottom of the +#' plot is among the features that pass the filtering steps. +#' If the function is not responding, it is likely due to issues +#' connecting to UCSC servers to draw the ideogram, in this case +#' set \code{ideogram=FALSE}. +#' +#' @param y a SummarizedExperiment (see \code{swish}) +#' @param gene the name of the gene of interest, requires +#' a column \code{gene_id} in the metadata columns of the +#' rowRanges of y +#' @param db either a TxDb or EnsDb object to use for the gene model +#' @param region GRanges, the region to be displayed in the Gviz plot. +#' if not specified, will be set according to the gene plus 20% +#' of the total gene extent on either side +#' @param symbol alternative to \code{gene}, to specify +#' the gene of interest according to a column \code{symbol} +#' in the metadata columns of the rowRanges of y +#' @param genome UCSC genome code (e.g. \code{"hg38"}, +#' if not specified it will use the \code{GenomeInfoDb::genome()} +#' of the rowRanges of \code{y} +#' @param tpmFilter minimum TPM value (mean over samples) to keep a feature +#' @param isoPropFilter minimum percent of isoform proportion to keep a feature +#' @param countFilter minimum count value (mean over samples) to keep a feature +#' @param pc pseudocount to avoid dividing by zero in allelic proportion calculation +#' @param transcriptAnnotation argument passed to Gviz::GeneRegionTrack +#' (\code{"symbol"}, \code{"gene"}, \code{"transcript"}, etc.) +#' @param labels list, labels for a2 (non-effect) and a1 (effect) alleles +#' @param qvalue logical, whether to inclue qvalue track +#' @param log2FC logical, whether to include log2FC track +#' @param ideogram logical, whether to include ideogram track +#' @param cov character specifying a factor or integer variable to use +#' to facet the allelic proportion plots, should be a column in +#' \code{colData(y)} +#' @param covFacetIsoform logical, if \code{cov} is provided, +#' should it also be used to facet the isoform proportion track, +#' in addition to the allelic proportion track +#' @param allelicCol the colors of the points and lines for allelic proportion +#' @param isoformCol the colors of the points and lines for isoform proportion +#' @param statCol the color of the lollipops for q-value and log2FC +#' @param gridCol the color of the grid in the data tracks +#' @param baselineCol the color of the horizontal baseline for q-value and lo2gFC +#' @param titleCol font color of the side titles (track labels) +#' @param titleAxisCol axis color of the side titles (track labels) +#' @param titleBgCol background color of the side titles (track labels) +#' @param geneBorderCol the color of the borders and font in gene region track +#' @param geneFillCol the color of the fill in the gene region track +#' @param genomeAxisCol line color of the genome axis track +#' @param innerFontCol font color of genome axis track, ideogram, and +#' allelic proportion legend +#' @param ... additional arguments passed to \code{Gviz::plotTracks()} +#' +#' @return nothing, a plot is displayed +#' +#' @export +plotAllelicGene <- function(y, gene, db, + region=NULL, symbol=NULL, genome=NULL, + tpmFilter=1, isoPropFilter=.05, countFilter=10, + pc=1, transcriptAnnotation="symbol", + labels=list(a2="a2",a1="a1"), + qvalue=TRUE, log2FC=TRUE, ideogram=FALSE, + cov=NULL, covFacetIsoform=FALSE, + allelicCol=c("dodgerblue","goldenrod1"), + isoformCol="firebrick", + statCol="black", + gridCol="grey80", + baselineCol="black", + titleCol="black", + titleAxisCol="black", + titleBgCol="white", + geneBorderCol="darkblue", + geneFillCol="darkblue", + genomeAxisCol="black", + innerFontCol="black", ...) { + if (!requireNamespace("Gviz", quietly=TRUE)) { + stop("plotAllelicGene() requires 'Gviz' Bioconductor package") + } + if (!requireNamespace("GenomeInfoDb", quietly=TRUE)) { + stop("plotAllelicGene() requires 'GenomeInfoDb' Bioconductor package") + } + stopifnot("gene_id" %in% names(mcols(y))) + if (!is.null(symbol)) { + stopifnot("symbol" %in% names(mcols(y))) + gene <- mcols(y)$gene_id[match(symbol, mcols(y)$symbol)] + } + stopifnot(is.character(gene)) + stopifnot(is(db, "TxDb") | is(db, "EnsDb")) + stopifnot(length(allelicCol) == 2) + stopifnot(all(c("a2","a1") %in% names(labels))) + if (!is.null(cov)) { + stopifnot(cov %in% names(colData(y))) + } else { + stopifnot(!covFacetIsoform) + } + # pull out the ranges of y to build the Gviz plot + gr <- rowRanges(y) + # standard chr's and UCSC for compatibility w Gviz + gr <- GenomeInfoDb::keepStandardChromosomes( + gr, pruning.mode="coarse") + GenomeInfoDb::seqlevelsStyle(gr) <- "UCSC" + # gene must be in the metadata + stopifnot(gene %in% gr$gene_id) + # pull out the ranges for the gene of interest + gr <- gr[gr$gene_id == gene] + stopifnot(!any(duplicated(names(gr)))) + # use TPM and count filtering to remove lowly expressed features + if (!is.null(tpmFilter) | !is.null(countFilter)) { + keep <- rep(TRUE, length(gr)) + meanTPM <- rowMeans(assay(y[names(gr),,drop=FALSE],"abundance")) + meanIsoProp <- meanTPM / sum(meanTPM) + meanCounts <- rowMeans(assay(y[names(gr),,drop=FALSE],"counts")) + if (!is.null(tpmFilter)) { + keep <- keep & meanTPM >= tpmFilter + } + if (!is.null(tpmFilter)) { + keep <- keep & meanIsoProp >= isoPropFilter + } + if (!is.null(countFilter)) { + keep <- keep & meanCounts >= countFilter + } + stopifnot(sum(keep) > 0) + gr <- gr[keep,] + } + if (qvalue & !"qvalue" %in% names(mcols(gr))) { + stop("expecting qvalue, first run swish()") + } + if (log2FC & !"log2FC" %in% names(mcols(gr))) { + stop("expecting log2FC, first run swish()") + } + regionProvided <- TRUE + if (is.null(region)) { + regionProvided <- FALSE + # the region to be displayed + region <- range(gr) + total_width <- width(region) + } + # plot data at the TSS + gr <- flank(gr, width=1) + strand(gr) <- "*" + # so data plots work, need to be sorted + gr <- sort(gr) + if (qvalue) { + gr$minusLogQ <- -log10(gr$qvalue) + # define upper bounds for the q-value and LFC + qUpper <- 1.2 * max(gr$minusLogQ) + } + if (log2FC) { + lfcUpper <- 1.2 * max(abs(gr$log2FC)) + } + chr <- as.character(seqnames(region)) + ########################################## + ## store allelic data in GRanges object ## + ########################################## + gr_allelic <- gr + mcols(gr_allelic) <- NULL + stopifnot("counts" %in% assayNames(y)) + allelic_counts <- assay(y[names(gr),,drop=FALSE], "counts") + stopifnot(rownames(allelic_counts) == names(gr_allelic)) + n <- ncol(allelic_counts)/2 + stopifnot(all(y$allele == rep(c("a2","a1"),each=n))) + total_counts <- ( + allelic_counts[,1:n,drop=FALSE] + + allelic_counts[,(n+1):(2*n),drop=FALSE]) + allelic_prop <- (allelic_counts+pc) / + cbind(total_counts+2*pc, total_counts+2*pc) + rowMeans(allelic_prop, na.rm=TRUE) + mcols(gr_allelic) <- allelic_prop + ########################################## + ## store isoform data in GRanges object ## + ########################################## + gr_isoform <- gr + mcols(gr_isoform) <- NULL + stopifnot("abundance" %in% assayNames(y)) + allelic_tpm <- assay(y[names(gr),,drop=FALSE], "abundance") + total_tpm <- ( + allelic_tpm[,1:n,drop=FALSE] + + allelic_tpm[,(n+1):(2*n),drop=FALSE]) + gene_tpm <- colSums(total_tpm) + isoform_prop <- t(t(total_tpm) / gene_tpm) + isoUpper <- 1.1 * max(isoform_prop) + mcols(gr_isoform) <- isoform_prop + #################### + ## ideogram track ## + #################### + if (ideogram) { + if (is.null(genome)) { + genome <- GenomeInfoDb::genome(gr)[1] + } + ideo_track <- Gviz::IdeogramTrack(genome=genome, chromosome=chr, + fontcolor=innerFontCol) + } else { + ideo_track <- NULL + } + ################## + ## genome track ## + ################## + genome_track <- Gviz::GenomeAxisTrack(col=genomeAxisCol, + fontcolor=innerFontCol) + # for ensembldb EnsDb + if (is(db, "EnsDb")) { + if (!requireNamespace("ensembldb", quietly=TRUE)) { + stop("db=EnsDb requires 'ensembldb' Bioconductor package") + } + GenomeInfoDb::seqlevelsStyle(db) <- "UCSC" + # warning is also showing up on ensembldb vignette... + suppressWarnings({ + gene_region <- ensembldb::getGeneRegionTrackForGviz( + db, + chromosome=chr, + start=start(region)+10, + end=end(region)-10) + }) + gene_track <- Gviz::GeneRegionTrack( + range=gene_region, + name="gene model", + transcriptAnnotation=transcriptAnnotation, + col=geneBorderCol, col.line=geneBorderCol, + fontcolor.group=geneBorderCol, fill=geneFillCol) + } else { + message("using TxDb, assuming UCSC seqlevelsStyle") + gene_track <- Gviz::GeneRegionTrack( + range=db, + chromosome=chr, + start=start(region)+10, + end=end(region)-10, + name="gene model", + transcriptAnnotation=transcriptAnnotation, + col=geneBorderCol, col.line=geneBorderCol, + fontcolor.group=geneBorderCol, fill=geneFillCol) + } + ################################## + ## put together the data tracks ## + ################################## + if (qvalue) { + qvalue_track <- Gviz::DataTrack( + grSelect(gr, "minusLogQ"), + type=c("p","h","g"), name="-log10 qvalue", + col=statCol, col.grid=gridCol, cex=1.5, lwd=2, + ylim=c(0,qUpper), baseline=0) + } else { + qvalue_track <- NULL + } + if (log2FC) { + lfc_track <- Gviz::DataTrack( + grSelect(gr, "log2FC"), + type=c("p","h","g"), name="log2FC", + col=statCol, col.grid=gridCol, cex=1.5, lwd=2, + baseline=0, + ylim=c(-lfcUpper,lfcUpper)) + } else { + lfc_track <- NULL + } + allele <- y$allele + # optionally relabelling of alleles + if (!(labels$a2 == "a2" & labels$a1 == "a1")) { + levels(allele) <- c(labels$a2, labels$a1) + } + # case where we are not faceting across a covariate + if (is.null(cov)) { + allele_track <- list(Gviz::DataTrack( + gr_allelic, type=c("p","a","g"), name="allelic prop.", + groups=allele, col=allelicCol, col.grid=gridCol, lwd=2, + baseline=0.5, fontcolor.legend=innerFontCol)) + } else { + # faceting case + covariate <- colData(y)[[cov]] + if (!is(covariate, "factor")) { + covariate <- factor(covariate) + } + allele_track <- lapply(levels(covariate), function(i) { + gr_allelic_sub <- gr_allelic + idx <- covariate == i + mcols(gr_allelic_sub) <- mcols(gr_allelic_sub)[,idx] + Gviz::DataTrack( + gr_allelic_sub, type=c("p","a","g"), name=i, + groups=allele[idx], col=allelicCol, col.grid=gridCol, lwd=2, + baseline=0.5, fontcolor.legend=innerFontCol) + }) + } + # case where we are not faceting across a covariate + if (!covFacetIsoform) { + isoform_track <- list(Gviz::DataTrack( + gr_isoform, type=c("p","a","g"), name="isoform prop.", + col=isoformCol, col.grid=gridCol, lwd=2, + baseline=0, ylim=c(0, isoUpper))) + } else { + # faceting case + isoform_track <- lapply(levels(covariate), function(i) { + gr_isoform_sub <- gr_isoform + # subset to one half of the covariate vector + idx <- covariate[1:n] == i + mcols(gr_isoform_sub) <- mcols(gr_isoform_sub)[,idx] + Gviz::DataTrack( + gr_isoform_sub, type=c("p","a","g"), name=i, + col=isoformCol, col.grid=gridCol, lwd=2, + baseline=0, ylim=c(0, isoUpper)) + }) + } + if (!regionProvided) { + eps <- round(.2 * total_width) + gvizFrom <- start(region) - eps + gvizTo <- end(region) + eps + } else { + gvizFrom <- start(region) + gvizTo <- end(region) + } + ############################## + ## finally, plot the tracks ## + ############################## + tracks <- c(list(ideo_track, genome_track, gene_track, + qvalue_track, lfc_track), + allele_track, # already list + isoform_track) # already list + tracks <- tracks[!sapply(tracks, is.null)] + Gviz::plotTracks( + tracks, from=gvizFrom, to=gvizTo, + col.title=titleCol, + col.axis=titleAxisCol, + background.title=titleBgCol, + col.baseline=baselineCol, ...) +} + +grSelect <- function(gr, col) { + out <- gr + mcols(out) <- NULL + mcols(out)[col] <- mcols(gr)[col] + out +} + +#' Plot allelic ratio heatmap +#' +#' Plot allelic ratio heatmap over features and samples +#' using the pheatmap package. The a1/(a2 + a1) ratio +#' is displayed. +#' +#' @param y a SummarizedExperiment (see \code{swish}) +#' @param idx a numeric or logical vector of which features +#' to plot +#' @param breaks breaks passed along to pheatmap +#' @param cluster_cols logical, passed to pheatmap +#' @param main title of the plot +#' @param stripAfterChar for the column names, if specified +#' will strip allelic identifiers after this character, +#' default is hyphen. set to NULL to avoid this action +#' @param ... other arguments passed to pheatmap +#' +#' @return nothing, a plot is displayed +#' +#' @export +plotAllelicHeatmap <- function(y, idx, + breaks=NULL, + cluster_cols=FALSE, + main="Allelic ratio", + stripAfterChar="-", + ...) { + if (!requireNamespace("GenomeInfoDb", quietly=TRUE)) { + stop("plotAllelicHeatmap() requires 'pheatmap' CRAN package") + } + stopifnot(ncol(y) > 1) + if ("mean" %in% assayNames(y)) { + cts <- assay(y, "mean")[idx,,drop=FALSE] + message("using posterior mean for calculating ratio") + } else { + cts <- assay(y, "counts")[idx,,drop=FALSE] + message("using counts, for posterior mean, run computeInfRV") + } + stopifnot("allele" %in% names(colData(y))) + cts_a2 <- cts[,y$allele == "a2",drop=FALSE] + cts_a1 <- cts[,y$allele == "a1",drop=FALSE] + tot <- cts_a2 + cts_a1 + ratio <- cts_a1 / tot + if (!is.null(breaks)) { + delta <- max(abs(ratio - 0.5)) + breaks <- seq(from=0.5 - delta, 0.5 + delta, length.out=101) + } + if (!is.null(stripAfterChar)) { + colnames(ratio) <- sub(paste0(stripAfterChar,".*"), + "",colnames(ratio)) + } + pheatmap::pheatmap(ratio, breaks=breaks, + cluster_cols=cluster_cols, + main=main, ...) +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/helper-compress.R r-bioc-fishpond-2.2.0+ds/R/helper-compress.R --- r-bioc-fishpond-2.0.1+ds/R/helper-compress.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/helper-compress.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,218 @@ +#' Make pseudo-inferential replicates from mean and variance +#' +#' Makes pseudo-inferential replicate counts from +#' \code{mean} and \code{variance} assays. The simulated +#' counts are drawn from a negative binomial distribution, +#' with \code{mu=mean} and \code{size} set using a method +#' of moments estimator for dispersion. +#' +#' Note that these simulated counts only reflect marginal +#' variance (one transcript or gene at a time), +#' and do not capture the covariance of counts across +#' transcripts or genes, unlike imported inferential +#' replicate data. Therefore, \code{makeInfReps} should +#' not be used with \code{summarizeToGene} to create +#' gene-level inferential replicates if inferential +#' replicates were originally created on the transcript +#' level. Instead, import the original inferential +#' replicates. +#' +#' @param y a SummarizedExperiment +#' @param numReps how many inferential replicates +#' @param minDisp the minimal dispersion value, +#' set after method of moments estimation from +#' inferential mean and variance +#' +#' @return a SummarizedExperiment +#' +#' @references +#' +#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., +#' Patro, R., Love, M.I. (2020) +#' Compression of quantification uncertainty for scRNA-seq counts. +#' bioRxiv. +#' \url{https://doi.org/10.1101/2020.07.06.189639} +#' +#' @examples +#' +#' library(SummarizedExperiment) +#' mean <- matrix(1:4,ncol=2) +#' variance <- mean +#' se <- SummarizedExperiment(list(mean=mean, variance=variance)) +#' se <- makeInfReps(se, numReps=50) +#' +#' @export +makeInfReps <- function(y, numReps, minDisp=1e-3) { + stopifnot(is(y, "SummarizedExperiment")) + stopifnot("mean" %in% assayNames(y)) + stopifnot("variance" %in% assayNames(y)) + stopifnot(numReps > 1) + stopifnot(round(numReps) == numReps) + if (any(grepl("infRep", assayNames(y)))) { + stop("infReps already exist, remove these first") + } + # drop sparsity here + m <- as.matrix(assays(y)[["mean"]]) + v <- as.matrix(assays(y)[["variance"]]) + disp <- ifelse(m > 0, pmax(minDisp, (v - m)/m^2), minDisp) + infReps <- list() + for (k in seq_len(numReps)) { + infReps[[k]] <- matrix(rnbinom(n=nrow(y)*ncol(y), mu=m, size=1/disp), + ncol=ncol(y), dimnames=dimnames(m)) + } + names(infReps) <- paste0("infRep", 1:numReps) + assays(y) <- c(assays(y), infReps) + metadata(y)$infRepsScaled <- FALSE + y +} + +#' Function for splitting SummarizedExperiment into separate RDS files +#' +#' The \code{splitSwish} function splits up the \code{y} object +#' along genes and writes a \code{Snakefile} that can be used with +#' Snakemake to distribute running \code{swish} across genes. +#' This workflow is primarily designed for large single cell datasets, +#' and so the default is to not perform length correction +#' within the distributed jobs. +#' See the alevin section of the vignette for an example. See +#' the Snakemake documention for details on how to run and customize +#' a \code{Snakefile}: \url{https://snakemake.readthedocs.io} +#' +#' @param y a SummarizedExperiment +#' @param nsplits integer, how many pieces to break \code{y} into +#' @param prefix character, the path of the RDS files to write out, +#' e.g. \code{prefix="/path/to/swish"} will generate \code{swish.rds} +#' files at this path +#' @param snakefile character, the path of a Snakemake file, e.g. +#' \code{Snakefile}, that should be written out. If \code{NULL}, +#' then no \code{Snakefile} is written out +#' @param overwrite logical, whether the \code{snakefile} and +#' RDS files (\code{swish1.rds}, ...) should overwrite existing files +#' +#' @references +#' +#' Compression and splitting across jobs: +#' +#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., +#' Patro, R., Love, M.I. (2020) +#' Compression of quantification uncertainty for scRNA-seq counts. +#' bioRxiv. +#' \url{https://doi.org/10.1101/2020.07.06.189639} +#' +#' Snakemake: +#' +#' Koster, J., Rahmann, S. (2012) +#' Snakemake - a scalable bioinformatics workflow engine. +#' Bioinformatics. +#' \url{https://doi.org/10.1093/bioinformatics/bts480} +#' +#' @return nothing, files are written out +#' +#' @export +splitSwish <- function(y, nsplits, prefix="swish", + snakefile=NULL, overwrite=FALSE) { + stopifnot(nsplits > 1) + stopifnot(nsplits == round(nsplits)) + stopifnot(nsplits < nrow(y)) + stopifnot(!is.null(rownames(y))) + stopifnot(all(mcols(y)$keep)) + stopifnot(basename(prefix) == "swish") + if (!is.null(snakefile)) { + stopifnot(is(snakefile, "character")) + if (file.exists(snakefile) & !overwrite) + stop("snakefile already exists at specified location, see 'overwrite'") + snake <- scan(system.file("extdata/Snakefile", package="fishpond"), + what="character", sep="\n", blank.lines.skip=FALSE, + quiet=TRUE) + write(snake, file=snakefile) + } + # how many leading 0's + width <- floor(log10(nsplits)) + 1 + nums <- formatC(seq_len(nsplits), width=max(2, width), + format="d", flag="0") + files <- paste0(prefix, nums, ".rds") + if (any(file.exists(files)) & !overwrite) + stop("swish RDS files exist at specified locations, see 'overwrite'") + idx <- sort(rep(seq_len(nsplits), length.out=nrow(y))) + for (i in seq_len(nsplits)) { + saveRDS(y[idx == i,], file=files[i]) + } +} + +#' Helper function for distributing Swish on a subset of data +#' +#' This function is called by the \code{Snakefile} that is generated +#' by \code{\link{splitSwish}}. See alevin example in the vignette. +#' As such, it doesn't need to be run by users in an interactive +#' R session. +#' +#' Note that the default for length correction is FALSE, as +#' opposed to the default in \code{\link{scaleInfReps}} which +#' is TRUE. The default for \code{numReps} here is 20. +#' +#' @param infile path to an RDS file of a SummarizedExperiment +#' @param outfile a CSV file to write out +#' @param numReps how many inferential replicates to generate +#' @param lengthCorrect logical, see \code{\link{scaleInfReps}}, +#' and Swish vignette. As this function is primarily for alevin, +#' the default is \code{FALSE} +#' @param overwrite logical, whether \code{outfile} +#' should overwrite an existing file +#' @param ... arguments passed to \code{\link{swish}} +#' +#' @return nothing, files are written out +#' +#' @export +miniSwish <- function(infile, outfile, numReps=20, + lengthCorrect=FALSE, overwrite=FALSE, ...) { + stopifnot(all(is(c(infile, outfile), "character"))) + stopifnot(file.exists(infile)) + if (file.exists(outfile) & !overwrite) + stop("outfile already exists at specified location, see 'overwrite'") + y <- readRDS(infile) + stopifnot(!is.null(rownames(y))) + stopifnot(all(mcols(y)$keep)) + y <- makeInfReps(y, numReps=numReps) + if (is.null(colData(y)$sizeFactor)) + stop("miniSwish requires pre-estimated sizeFactors stored in colData(...)") + y <- scaleInfReps(y, lengthCorrect=lengthCorrect, sfFun=colData(y)$sizeFactor) + out <- swish(y=y, returnNulls=TRUE, ...) + mat <- cbind(out$stat, out$log2FC, out$nulls) + rownames(mat) <- rownames(y) + write.table(mat, file=outfile, col.names=FALSE, sep=",") +} + +#' Read statistics and nulls from CSV file +#' +#' After running \code{\link{splitSwish}} and the associated +#' \code{Snakefile}, this function can be used to gather and +#' add the results to the original object. See the alevin +#' section of the vignette for an example. +#' +#' @param y a SummarizedExperiment (if NULL, function will +#' output a data.frame) +#' @param infile character, path to the \code{summary.csv} file +#' @param estPi0 logical, see \code{\link{swish}} +#' +#' @return the SummarizedExperiment with metadata columns added, +#' or if \code{y} is NULL, a data.frame of compiled results +#' +#' @export +addStatsFromCSV <- function(y=NULL, infile, estPi0=FALSE) { + res <- read.table(infile, header=FALSE, row.names=1, sep=",") + stat <- res[,1] + log2FC <- res[,2] + nulls <- as.matrix(res[,-c(1:2)]) + pvalue <- qvalue::empPvals(abs(stat), abs(nulls)) + pi0 <- if (estPi0) NULL else 1 + q.res <- qvalue::qvalue(pvalue, pi0=pi0) + locfdr <- q.res$lfdr + qvalue <- q.res$qvalues + df <- data.frame(stat, log2FC, pvalue, locfdr, qvalue, row.names=rownames(res)) + if (is.null(y)) { + return(df) + } else { + stopifnot(all(rownames(y) == rownames(df))) + return(postprocess(y, df)) + } +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/helper-isoform.R r-bioc-fishpond-2.2.0+ds/R/helper-isoform.R --- r-bioc-fishpond-2.0.1+ds/R/helper-isoform.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/helper-isoform.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,75 @@ +#' Create isoform proportions from scaled data +#' +#' Takes output of scaled (and optionally filtered) counts +#' and returns isoform proportions by dividing out the +#' total scaled count for the gene for each sample. +#' The operation is performed on the \code{counts} assay, +#' then creating a new assay called \code{isoProp}, +#' and on all of the inferential replicates, turning them +#' from counts into isoform proportions. Any transcripts +#' (rows) from single isoform genes are removed, and the +#' transcripts will be re-ordered by gene ID. +#' +#' @param y a SummarizedExperiment +#' @param geneCol the name of the gene ID column in the +#' metadata columns for the rows of \code{y} +#' @param quiet display no messages +#' +#' @return a SummarizedExperiment, with single-isoform +#' transcripts removed, and transcripts now ordered by +#' gene +#' +#' @export +isoformProportions <- function(y, geneCol="gene_id", quiet=FALSE) { + if (!interactive()) { + quiet <- TRUE + } + if (is.null(metadata(y)$infRepsScaled)) { + stop("first run scaleInfReps()") + } + if (!is.null(metadata(y)$infRepsProportions)) { + if (metadata(y)$infRepsProportions) stop("inferential replicates already proportions") + } + stopifnot(geneCol %in% names(mcols(y))) + + gene <- mcols(y)[[geneCol]] + stopifnot(all(lengths(gene) == 1)) + mcols(y)$gene <- unlist(gene) + gene.tbl <- table(mcols(y)$gene) + # remove single isoform genes + keep <- mcols(y)$gene %in% names(gene.tbl)[gene.tbl > 1] + stopifnot(sum(keep) > 0) + y <- y[keep,] + y <- y[order(mcols(y)$gene),] + assays(y, withDimnames=FALSE)[["isoProp"]] <- makeIsoProp( + assays(y)[["counts"]], + mcols(y)$gene + ) + + infRepIdx <- grep("infRep",assayNames(y)) + infRepError(infRepIdx) + infReps <- assays(y)[infRepIdx] + nreps <- length(infReps) + for (k in seq_len(nreps)) { + if (!quiet) svMisc::progress(k, max.value=nreps, init=(k==1), gui=FALSE) + infReps[[k]] <- makeIsoProp(infReps[[k]], + mcols(y)$gene) + } + if (!quiet) message("") + + assays(y)[grep("infRep",assayNames(y))] <- infReps + metadata(y)$infRepsProportions <- TRUE + y +} + +# not exported +makeIsoProp <- function(counts, gene) { + totals <- rowsum(counts, gene) + # if a sample has a gene total of 0, replace here with 1 to avoid division by 0 + totals[totals == 0] <- 1 + ngene <- length(unique(gene)) + gene.tbl <- table(gene) + idx <- rep(seq_len(ngene), gene.tbl) + big.totals <- totals[idx,] + counts / big.totals +} diff -Nru r-bioc-fishpond-2.0.1+ds/R/helper.R r-bioc-fishpond-2.2.0+ds/R/helper.R --- r-bioc-fishpond-2.0.1+ds/R/helper.R 2021-12-02 07:00:52.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/helper.R 2022-04-26 18:25:58.000000000 +0000 @@ -158,307 +158,12 @@ } mcols(y)$keep <- keep metadata(y)$preprocessed <- TRUE - if (!"infRepScaled" %in% names(metadata(y))) { + if (!"infRepsScaled" %in% names(metadata(y))) { metadata(y)$infRepsScaled <- FALSE } y } -#' Create isoform proportions from scaled data -#' -#' Takes output of scaled (and optionally filtered) counts -#' and returns isoform proportions by dividing out the -#' total scaled count for the gene for each sample. -#' The operation is performed on the \code{counts} assay, -#' then creating a new assay called \code{isoProp}, -#' and on all of the inferential replicates, turning them -#' from counts into isoform proportions. Any transcripts -#' (rows) from single isoform genes are removed, and the -#' transcripts will be re-ordered by gene ID. -#' -#' @param y a SummarizedExperiment -#' @param geneCol the name of the gene ID column in the -#' metadata columns for the rows of \code{y} -#' @param quiet display no messages -#' -#' @return a SummarizedExperiment, with single-isoform -#' transcripts removed, and transcripts now ordered by -#' gene -#' -#' @export -isoformProportions <- function(y, geneCol="gene_id", quiet=FALSE) { - if (!interactive()) { - quiet <- TRUE - } - if (is.null(metadata(y)$infRepsScaled)) { - stop("first run scaleInfReps()") - } - if (!is.null(metadata(y)$infRepsProportions)) { - if (metadata(y)$infRepsProportions) stop("inferential replicates already proportions") - } - stopifnot(geneCol %in% names(mcols(y))) - - gene <- mcols(y)[[geneCol]] - stopifnot(all(lengths(gene) == 1)) - mcols(y)$gene <- unlist(gene) - gene.tbl <- table(mcols(y)$gene) - # remove single isoform genes - keep <- mcols(y)$gene %in% names(gene.tbl)[gene.tbl > 1] - stopifnot(sum(keep) > 0) - y <- y[keep,] - y <- y[order(mcols(y)$gene),] - assays(y, withDimnames=FALSE)[["isoProp"]] <- makeIsoProp( - assays(y)[["counts"]], - mcols(y)$gene - ) - - infRepIdx <- grep("infRep",assayNames(y)) - infRepError(infRepIdx) - infReps <- assays(y)[infRepIdx] - nreps <- length(infReps) - for (k in seq_len(nreps)) { - if (!quiet) svMisc::progress(k, max.value=nreps, init=(k==1), gui=FALSE) - infReps[[k]] <- makeIsoProp(infReps[[k]], - mcols(y)$gene) - } - if (!quiet) message("") - - assays(y)[grep("infRep",assayNames(y))] <- infReps - metadata(y)$infRepsProportions <- TRUE - y -} - -# not exported -makeIsoProp <- function(counts, gene) { - totals <- rowsum(counts, gene) - # if a sample has a gene total of 0, replace here with 1 to avoid division by 0 - totals[totals == 0] <- 1 - ngene <- length(unique(gene)) - gene.tbl <- table(gene) - idx <- rep(seq_len(ngene), gene.tbl) - big.totals <- totals[idx,] - counts / big.totals -} - -#' Make pseudo-inferential replicates from mean and variance -#' -#' Makes pseudo-inferential replicate counts from -#' \code{mean} and \code{variance} assays. The simulated -#' counts are drawn from a negative binomial distribution, -#' with \code{mu=mean} and \code{size} set using a method -#' of moments estimator for dispersion. -#' -#' Note that these simulated counts only reflect marginal -#' variance (one transcript or gene at a time), -#' and do not capture the covariance of counts across -#' transcripts or genes, unlike imported inferential -#' replicate data. Therefore, \code{makeInfReps} should -#' not be used with \code{summarizeToGene} to create -#' gene-level inferential replicates if inferential -#' replicates were originally created on the transcript -#' level. Instead, import the original inferential -#' replicates. -#' -#' @param y a SummarizedExperiment -#' @param numReps how many inferential replicates -#' @param minDisp the minimal dispersion value, -#' set after method of moments estimation from -#' inferential mean and variance -#' -#' @return a SummarizedExperiment -#' -#' @references -#' -#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., -#' Patro, R., Love, M.I. (2020) -#' Compression of quantification uncertainty for scRNA-seq counts. -#' bioRxiv. -#' \url{https://doi.org/10.1101/2020.07.06.189639} -#' -#' @examples -#' -#' library(SummarizedExperiment) -#' mean <- matrix(1:4,ncol=2) -#' variance <- mean -#' se <- SummarizedExperiment(list(mean=mean, variance=variance)) -#' se <- makeInfReps(se, numReps=50) -#' -#' @export -makeInfReps <- function(y, numReps, minDisp=1e-3) { - stopifnot(is(y, "SummarizedExperiment")) - stopifnot("mean" %in% assayNames(y)) - stopifnot("variance" %in% assayNames(y)) - stopifnot(numReps > 1) - stopifnot(round(numReps) == numReps) - if (any(grepl("infRep", assayNames(y)))) { - stop("infReps already exist, remove these first") - } - # drop sparsity here - m <- as.matrix(assays(y)[["mean"]]) - v <- as.matrix(assays(y)[["variance"]]) - disp <- ifelse(m > 0, pmax(minDisp, (v - m)/m^2), minDisp) - infReps <- list() - for (k in seq_len(numReps)) { - infReps[[k]] <- matrix(rnbinom(n=nrow(y)*ncol(y), mu=m, size=1/disp), - ncol=ncol(y), dimnames=dimnames(m)) - } - names(infReps) <- paste0("infRep", 1:numReps) - assays(y) <- c(assays(y), infReps) - metadata(y)$infRepsScaled <- FALSE - y -} - -#' Function for splitting SummarizedExperiment into separate RDS files -#' -#' The \code{splitSwish} function splits up the \code{y} object -#' along genes and writes a \code{Snakefile} that can be used with -#' Snakemake to distribute running \code{swish} across genes. -#' This workflow is primarily designed for large single cell datasets, -#' and so the default is to not perform length correction -#' within the distributed jobs. -#' See the alevin section of the vignette for an example. See -#' the Snakemake documention for details on how to run and customize -#' a \code{Snakefile}: \url{https://snakemake.readthedocs.io} -#' -#' @param y a SummarizedExperiment -#' @param nsplits integer, how many pieces to break \code{y} into -#' @param prefix character, the path of the RDS files to write out, -#' e.g. \code{prefix="/path/to/swish"} will generate \code{swish.rds} -#' files at this path -#' @param snakefile character, the path of a Snakemake file, e.g. -#' \code{Snakefile}, that should be written out. If \code{NULL}, -#' then no \code{Snakefile} is written out -#' @param overwrite logical, whether the \code{snakefile} and -#' RDS files (\code{swish1.rds}, ...) should overwrite existing files -#' -#' @references -#' -#' Compression and splitting across jobs: -#' -#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., -#' Patro, R., Love, M.I. (2020) -#' Compression of quantification uncertainty for scRNA-seq counts. -#' bioRxiv. -#' \url{https://doi.org/10.1101/2020.07.06.189639} -#' -#' Snakemake: -#' -#' Koster, J., Rahmann, S. (2012) -#' Snakemake - a scalable bioinformatics workflow engine. -#' Bioinformatics. -#' \url{https://doi.org/10.1093/bioinformatics/bts480} -#' -#' @return nothing, files are written out -#' -#' @export -splitSwish <- function(y, nsplits, prefix="swish", - snakefile=NULL, overwrite=FALSE) { - stopifnot(nsplits > 1) - stopifnot(nsplits == round(nsplits)) - stopifnot(nsplits < nrow(y)) - stopifnot(!is.null(rownames(y))) - stopifnot(all(mcols(y)$keep)) - stopifnot(basename(prefix) == "swish") - if (!is.null(snakefile)) { - stopifnot(is(snakefile, "character")) - if (file.exists(snakefile) & !overwrite) - stop("snakefile already exists at specified location, see 'overwrite'") - snake <- scan(system.file("extdata/Snakefile", package="fishpond"), - what="character", sep="\n", blank.lines.skip=FALSE, - quiet=TRUE) - write(snake, file=snakefile) - } - # how many leading 0's - width <- floor(log10(nsplits)) + 1 - nums <- formatC(seq_len(nsplits), width=max(2, width), - format="d", flag="0") - files <- paste0(prefix, nums, ".rds") - if (any(file.exists(files)) & !overwrite) - stop("swish RDS files exist at specified locations, see 'overwrite'") - idx <- sort(rep(seq_len(nsplits), length.out=nrow(y))) - for (i in seq_len(nsplits)) { - saveRDS(y[idx == i,], file=files[i]) - } -} - -#' Helper function for distributing Swish on a subset of data -#' -#' This function is called by the \code{Snakefile} that is generated -#' by \code{\link{splitSwish}}. See alevin example in the vignette. -#' As such, it doesn't need to be run by users in an interactive -#' R session. -#' -#' Note that the default for length correction is FALSE, as -#' opposed to the default in \code{\link{scaleInfReps}} which -#' is TRUE. The default for \code{numReps} here is 20. -#' -#' @param infile path to an RDS file of a SummarizedExperiment -#' @param outfile a CSV file to write out -#' @param numReps how many inferential replicates to generate -#' @param lengthCorrect logical, see \code{\link{scaleInfReps}}, -#' and Swish vignette. As this function is primarily for alevin, -#' the default is \code{FALSE} -#' @param overwrite logical, whether \code{outfile} -#' should overwrite an existing file -#' @param ... arguments passed to \code{\link{swish}} -#' -#' @return nothing, files are written out -#' -#' @export -miniSwish <- function(infile, outfile, numReps=20, - lengthCorrect=FALSE, overwrite=FALSE, ...) { - stopifnot(all(is(c(infile, outfile), "character"))) - stopifnot(file.exists(infile)) - if (file.exists(outfile) & !overwrite) - stop("outfile already exists at specified location, see 'overwrite'") - y <- readRDS(infile) - stopifnot(!is.null(rownames(y))) - stopifnot(all(mcols(y)$keep)) - y <- makeInfReps(y, numReps=numReps) - if (is.null(colData(y)$sizeFactor)) - stop("miniSwish requires pre-estimated sizeFactors stored in colData(...)") - y <- scaleInfReps(y, lengthCorrect=lengthCorrect, sfFun=colData(y)$sizeFactor) - out <- swish(y=y, returnNulls=TRUE, ...) - mat <- cbind(out$stat, out$log2FC, out$nulls) - rownames(mat) <- rownames(y) - write.table(mat, file=outfile, col.names=FALSE, sep=",") -} - -#' Read statistics and nulls from CSV file -#' -#' After running \code{\link{splitSwish}} and the associated -#' \code{Snakefile}, this function can be used to gather and -#' add the results to the original object. See the alevin -#' section of the vignette for an example. -#' -#' @param y a SummarizedExperiment (if NULL, function will -#' output a data.frame) -#' @param infile character, path to the \code{summary.csv} file -#' @param estPi0 logical, see \code{\link{swish}} -#' -#' @return the SummarizedExperiment with metadata columns added, -#' or if \code{y} is NULL, a data.frame of compiled results -#' -#' @export -addStatsFromCSV <- function(y=NULL, infile, estPi0=FALSE) { - res <- read.table(infile, header=FALSE, row.names=1, sep=",") - stat <- res[,1] - log2FC <- res[,2] - nulls <- as.matrix(res[,-c(1:2)]) - pvalue <- qvalue::empPvals(abs(stat), abs(nulls)) - pi0 <- if (estPi0) NULL else 1 - q.res <- qvalue::qvalue(pvalue, pi0=pi0) - locfdr <- q.res$lfdr - qvalue <- q.res$qvalues - df <- data.frame(stat, log2FC, pvalue, locfdr, qvalue, row.names=rownames(res)) - if (is.null(y)) { - return(df) - } else { - stopifnot(all(rownames(y) == rownames(df))) - return(postprocess(y, df)) - } -} - #' Make simulated data for swish for examples/testing #' #' Makes a small swish dataset for examples and testing. @@ -474,6 +179,8 @@ #' @param null logical, whether to make an all null dataset #' @param meanVariance logical, whether to output only mean and #' variance of inferential replicates +#' @param allelic logical, whether to make an allelic sim dataset +#' @param dynamic logical, whether to make a dynamic allelic sim dataset #' #' @return a SummarizedExperiment #' @@ -484,26 +191,69 @@ #' assayNames(y) #' #' @export -makeSimSwishData <- function(m=1000, n=10, numReps=20, null=FALSE, meanVariance=FALSE) { - stopifnot(m>8) +makeSimSwishData <- function(m=1000, n=10, numReps=20, + null=FALSE, meanVariance=FALSE, + allelic=FALSE, dynamic=FALSE) { + stopifnot(m > 8) + if (dynamic) { + allelic <- TRUE + } + if (allelic) { + n <- 2 * n + } stopifnot(n %% 2 == 0) cts <- matrix(rpois(m*n, lambda=80), ncol=n) if (!null) { + grp1 <- 1:(n/2) grp2 <- (n/2+1):n - cts[1:6,grp2] <- rpois(3*n, lambda=120) + # standard sim + if (!dynamic) { + cts[1:6,grp2] <- rpois(3*n, lambda=120) + } else { + # dynamic AI, the LFC varies over a covariate + for (i in 1:6) { + cts[i,grp2] <- rpois(n/2, lambda=seq(53,120,length=n/2)) + } + } cts[7:8,] <- 0 } length <- matrix(1000, nrow=m, ncol=n) abundance <- t(t(cts)/colSums(cts))*1e6 + if (allelic) { + # change LFC for feature 2 + revIdx <- c(grp2,grp1) + cts[2,] <- cts[2,revIdx] + # make feature 3 null + cts[3,] <- rpois(n, lambda=80) + # make interesting abundance dynamics + abundance[2,] <- 0.5 * abundance[2,] + abundance[3,] <- 0.1 * abundance[3,] + } infReps <- lapply(seq_len(numReps), function(i) { m <- matrix(rpois(m*n, lambda=80), ncol=n) if (!null) { - # these row numbers are fixed for the demo dataset - m[1:6,grp2] <- rpois(3*n, lambda=120) - m[3:4,] <- round(m[3:4,] * runif(2*n,.5,1.5)) - m[5:6,grp2] <- round(pmax(m[5:6,grp2] + runif(n,-120,80),0)) + # standard sim + if (!dynamic) { + # these row numbers are fixed for the demo dataset + m[1:6,grp2] <- rpois(3*n, lambda=120) + m[3:4,] <- round(m[3:4,] * runif(2*n,.5,1.5)) + m[5:6,grp2] <- round(pmax(m[5:6,grp2] + runif(n,-120,80),0)) + } else { + # dynamic AI, the LFC varies over a covariate + for (i in 1:6) { + m[i,grp2] <- rpois(n/2, lambda=seq(53,120,length=n/2)) + } + m[3:4,] <- round(m[3:4,] * runif(2*n,.5,1.5)) + m[5:6,grp2] <- round(pmax(m[5:6,grp2] + runif(n,-120,80),0)) + } m[7:8,] <- 0 } + if (allelic) { + # change LFC for feature 2 + m[2,] <- m[2,revIdx] + # make feature 3 null + m[3,] <- rpois(n, lambda=80) + } m }) names(infReps) <- paste0("infRep", seq_len(numReps)) @@ -521,8 +271,23 @@ rownames(se) <- paste0("gene-",seq_len(nrow(se))) colnames(se) <- paste0("s",seq_len(n)) metadata(se) <- list(countsFromAbundance="no") - colData(se) <- DataFrame(condition=gl(2,n/2), - row.names=colnames(se)) + if (allelic) { + als <- c("a2","a1") + allele <- factor(rep(als, each=n/2), levels=als) + sample <- factor(paste0("sample",rep(1:(n/2),2))) + samp <- factor(paste0("s",rep(1:(n/2),2))) + colnames(se) <- paste0(samp,"-",allele) + coldata <- DataFrame(allele=allele, + sample=sample, + row.names=colnames(se)) + if (dynamic) { + coldata$time <- rep(round(seq(0,1,length=n/2),2),2) + } + } else { + coldata <- DataFrame(condition=gl(2,n/2), + row.names=colnames(se)) + } + colData(se) <- coldata se } @@ -577,155 +342,6 @@ y } -#' Import allelic counts as a SummarizedExperiment -#' -#' Read in Salmon quantification of allelic counts from a -#' diploid transcriptome. Assumes that diploid transcripts -#' are marked with the following suffix: an underscore and -#' a consistent symbol for each of the two alleles, -#' e.g. \code{ENST123_M} and \code{ENST123_P}, -#' or \code{ENST123_alt} and \code{ENST123_ref}. -#' There must be exactly two alleles for each transcript, -#' and the \code{--keep-duplicates} option should be used in -#' Salmon indexing to avoid removing transcripts with identical sequence. -#' The output object has half the number of transcripts, -#' with the two alleles either stored in a \code{"wide"} object, -#' or as re-named \code{"assays"}. Note carefully that the symbol -#' provided to \code{a1} is used as the effect allele, -#' and \code{a2} is used as the non-effect allele -#' (see the \code{format} argument description and Value -#' description below). -#' -#' Requires the tximeta package. -#' \code{skipMeta=TRUE} is used, as it is assumed -#' the diploid transcriptome does not match any reference -#' transcript collection. This may change in future iterations -#' of the function, depending on developments in upstream -#' software. -#' -#' @param coldata a data.frame as used in \code{tximeta} -#' @param a1 the symbol for the effect allele -#' @param a2 the symbol for the non-effect allele -#' @param format either \code{"wide"} or \code{"assays"} for whether -#' to combine the allelic counts as columns (wide) or put the allelic -#' count information in different assay slots (assays). -#' For wide output, the data for the non-effect allele (a2) comes first, -#' then the effect allele (a1), e.g. \code{[a2 | a1]}. The \code{ref} level -#' of the factor variable \code{se$allele} will be \code{"a2"} -#' (so by default comparisons will be: a1 vs a2). -#' For assays output, all of the original matrices are renamed with a prefix, -#' either \code{a1-} or \code{a2-}. -#' @param tx2gene optional, a data.frame with first column indicating -#' transcripts, second column indicating genes (or any other transcript -#' grouping). Either this should include the \code{a1} and \code{a2} -#' suffix for the transcripts and genes, or those will be added internally, -#' if it is detected that the first transcript does not have these suffices. -#' For example if \code{_alt} or \code{_ref}, or \code{_M} or \code{_P} -#' (as indicated by the \code{a1} and \code{a2} arguments) are not present -#' in the table, the table rows will be duplicated with those suffices -#' added on behalf of the user. -#' If not provided, the output object will be transcript-level. -#' Note: do not attempt to set the \code{txOut} argument, it will -#' conflict with internal calls to downstream functions. -#' Note: if the a1/a2 suffices are not at the end of the transcript name -#' in the quantification files, e.g. \code{ENST123_M|}, -#' then \code{ignoreAfterBar=TRUE} can be used to match regardless of -#' the string following \code{|} in the quantification files. -#' @param ... any arguments to pass to tximeta -#' -#' @return a SummarizedExperiment, with allele counts (and other data) -#' combined into a wide matrix \code{[a2 | a1]}, or as assays (a1, then a2). -#' The original strings associated with a1 and a2 are stored in the -#' metadata of the object, in the \code{alleles} list element. -#' Note the \code{ref} level of \code{se$allele} will be \code{"a2"}, -#' such that comparisons by default will be a1 vs a2 (effect vs non-effect). -#' -#' @export -importAllelicCounts <- function(coldata, a1, a2, - format=c("wide","assays"), - tx2gene=NULL, ...) { - format <- match.arg(format) - if (!requireNamespace("tximeta", quietly=TRUE)) { - stop("this function requires installing the Bioconductor package 'tximeta'") - } - - a1match <- paste0("_",a1,"$") - a2match <- paste0("_",a2,"$") - - txOut <- is.null(tx2gene) # output transcripts if no tx2gene provided - if (!txOut) { - stopifnot(ncol(tx2gene) == 2) - # see if tx2gene already has tagged the txps and genes - if (grepl(a1match, tx2gene[1,1]) | grepl(a2match, tx2gene[1,1])) { - # ensure same number of a1 and a2 alleles in txps and genes - sum1txp <- sum(grepl(a1match, tx2gene[,1])) - sum2txp <- sum(grepl(a2match, tx2gene[,1])) - sum1gene <- sum(grepl(a1match, tx2gene[,2])) - sum2gene <- sum(grepl(a2match, tx2gene[,2])) - stopifnot(sum1txp > 0 & sum2txp > 0 & sum1gene > 0 & sum2gene > 0) - stopifnot(sum1txp == sum2txp) - stopifnot(sum1gene == sum2gene) - } else { - a2a1_vec <- rep(c(a2, a1), each=nrow(tx2gene)) - tx2gene <- data.frame( - tx=paste0(rep(tx2gene[,1], 2), "_", a2a1_vec), - gene=paste0(rep(tx2gene[,2], 2), "_", a2a1_vec) - ) - } - } - - se <- tximeta::tximeta(coldata, skipMeta=TRUE, tx2gene=tx2gene, txOut=txOut, ...) - - # remove any characters after "|" - rownames(se) <- sub("\\|.*", "", rownames(se)) - - ntxp <- nrow(se)/2 - n <- ncol(se) - - # gather transcript names for a1 and a2 alleles - txp_nms_a1 <- grep(a1match, rownames(se), value=TRUE) - stopifnot(length(txp_nms_a1) == ntxp) - txp_nms_a2 <- sub(paste0("_",a1),paste0("_",a2),txp_nms_a1) - stopifnot(all(txp_nms_a2 %in% rownames(se))) - stopifnot(length(txp_nms_a1) == length(txp_nms_a2)) - txp_nms <- sub(paste0("_",a1),"",txp_nms_a1) - - if (format == "wide") { - coldata_wide <- data.frame( - allele=factor(rep(c("a2","a1"), each=n), levels=c("a2","a1")) - ) - # add any other covariate data - for (v in setdiff(names(coldata), c("files","names"))) { - coldata_wide[[v]] <- coldata[[v]] - } - rownames(coldata_wide) <- paste0(colnames(se), "-", coldata_wide$allele) - - assays_wide <- lapply(assays(se), function(a) { - a_wide <- cbind(a[txp_nms_a2,], a[txp_nms_a1,]) - rownames(a_wide) <- txp_nms - colnames(a_wide) <- rownames(coldata_wide) - a_wide - }) - # make a new SE - wide <- SummarizedExperiment(assays=assays_wide, - colData=coldata_wide) - metadata(wide) <- c(metadata(se), list(alleles=c(a1=a1, a2=a2))) - return(wide) - } else if (format == "assays") { - se_a1 <- se[txp_nms_a1,] - se_a2 <- se[txp_nms_a2,] - rownames(se_a1) <- txp_nms - rownames(se_a2) <- txp_nms - # rename the assays - assayNames(se_a1) <- paste0("a1-", assayNames(se_a1)) - assayNames(se_a2) <- paste0("a2-", assayNames(se_a2)) - # add the a2 matrices to the a1 SE object - assays(se_a1) <- c(assays(se_a1), assays(se_a2)) - metadata(se_a1) <- c(metadata(se_a1), list(alleles=c(a1=a1, a2=a2))) - return(se_a1) - } -} - #' Obtain a trace of inferential replicates for a sample #' #' Simple helper function to obtain a trace (e.g. MCMC trace) @@ -752,7 +368,7 @@ #' #' @export getTrace <- function(y, idx, samp_idx) { - stopifnot(length(idx) == 1 | samp_idx == 1) + stopifnot(length(idx) == 1 | length(samp_idx) == 1) stopifnot(is(idx, "character") | is(idx, "numeric")) stopifnot(is(samp_idx, "character") | is(samp_idx, "numeric")) infRepIdx <- grep("infRep",assayNames(y)) diff -Nru r-bioc-fishpond-2.0.1+ds/R/loadFry.R r-bioc-fishpond-2.2.0+ds/R/loadFry.R --- r-bioc-fishpond-2.0.1+ds/R/loadFry.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/loadFry.R 1970-01-01 00:00:00.000000000 +0000 @@ -1,318 +0,0 @@ -#' Load in data from alevin-fry USA mode -#' -#' Enables easy loading of sparse data matrices provided by alevin-fry USA mode. -#' Alevin-fry - \url{https://www.biorxiv.org/content/10.1101/2021.06.29.450377v1} -#' -#' @param fryDir path to the output directory returned by -#' alevin-fry quant command. This directory should contain a -#' \code{metainfo.json}, and an alevin folder which contains -#' \code{quants_mat.mtx}, \code{quants_mat_cols.txt} and -#' \code{quants_mat_rows.txt} -#' @param outputFormat can be \emph{either} be a list that defines the -#' desired format of the output \code{SingleCellExperiment} object -#' \emph{or} a string that represents one of the pre-defined output -#' formats, which are "scRNA", "snRNA", "scVelo" and "velocity". -#' See details for the explainations of the pre-defined formats and -#' how to define custom format. -#' @param nonzero whether to filter cells with non-zero expression -#' value across all genes (default \code{FALSE}). -#' If \code{TRUE}, this will filter based on all assays. -#' If a string vector of assay names, it will filter based -#' on the matching assays in the vector. -#' @param quiet logical whether to display no messages -#' -#' @section Details about \code{loadFry}: -#' This function consumes the result folder returned by running -#' alevin-fry quant in unspliced, spliced, ambiguous (USA) -#' quantification mode, and returns a \code{SingleCellExperiement} object -#' that contains a final count for each gene within each cell. In -#' USA mode, alevin-fry quant returns a count matrix contains three -#' types of count for each feature (gene) within each sample (cell -#' or nucleus), which represent the spliced mRNA count of the gene (S), -#' the unspliced mRNA count of the gene (U), and the count of UMIs whose -#' splicing status is ambiguous for the gene (A). For each assay -#' defined by \code{outputFormat}, these three counts of a gene -#' within a cell will be summed to get the final count of the gene -#' according to the rule defined in the \code{outputFormat}. The -#' returned object will contains the desired assays defined by -#' \code{outputFormat}, with rownames as the barcode of samples and -#' colnames as the feature names. -#' -#' -#' @section Details about the output format: -#' The \code{outputFormat} argument takes \emph{either} be a list that defines -#' the desired format of the output -#' \code{SingleCellExperiment} object \emph{or} a string that represents one of -#' the pre-defined output format. -#' -#' Currently the pre-defined formats -#' of the output \code{SingleCellExperiment} object are: -#' \describe{ -#' \item{"scRNA":}{This format is recommended for single cell experiments. -#' It returns a \code{counts} assay that contains the S+A count of each gene in each cell.} -#' \item{"snRNA":}{This format is recommended for single nucleus experiments. -#' It returns a \code{counts} assay that contains the U+S+A count of each gene in each cell.} -#' \item{"raw":}{This format put the three kinds of counts into three separate assays, -#' which are \code{unspliced}, \code{spliced} and \code{ambiguous}.} -#' \item{"velocity":}{This format contains two assays. -#' The \code{spliced} assay contains the S+A count of each gene in each cell. -#' The \code{unspliced} assay contains the U counts of each gene in each cell.} -#' \item{"scVelo":}{This format is for direct entry into velociraptor R package or -#' other scVelo downstream analysis pipeline for velocity -#' analysis in R with Bioconductor. It adds the expected -#' "S"-pliced assay and removes errors for size factors being -#' non-positive.} -#' } -#' -#' A custom output format can be defined using a list. Each element in the list -#' defines an assay in the output \code{SingleCellExperiment} object. -#' The name of an element in the list will be the name of the corresponding -#' assay in the output object. Each element in the list should be defined as -#' a vector that takes at least one of the three kinds of count, which are U, S and A. -#' See the provided toy example for defining a custom output format. -#' -#' @section Details about \code{load_fry_raw}: -#' This function processes alevin-fry's quantification result -#' contained within the input folder.This function returns a list -#' that consists of the gene count matrix, the gene names list, the -#' barcode list, and some metadata, such as the number of genes in -#' the experiment and whether alevin-fry was executed in USA -#' mode. In the returned list, the all-in-one count matrix, -#' \code{count_mat}, returned from the USA mode of alevin-fry -#' consists of the spliced count of genes defined in -#' \code{gene.names} for all barcodes defined in \code{barcodes}, -#' followed by the unspliced count of genes in the same order for -#' all cells, then followed by the ambiguous count of genes in the -#' same order for all cells. -#' -#' @return A \code{SingleCellExperiment} object that contains one -#' or more assays. Each assay consists of a gene by cell count matrix. -#' The row names are feature names, and the column names are cell -#' barcodes -#' -#' @concept preprocessing -#' -#' @examples -#' -#' # Get path for minimal example avelin-fry output dir -#' testdat <- fishpond:::readExampleFryData("fry-usa-basic") -#' -#' # This is exactly how the velocity format defined internally. -#' custom_velocity_format <- list("spliced"=c("S","A"), "unspliced"=c("U")) -#' -#' # Load alevin-fry gene quantification in velocity format -#' sce <- loadFry(fryDir=testdat$parent_dir, outputFormat=custom_velocity_format) -#' SummarizedExperiment::assayNames(sce) -#' -#' # Load the same data but use pre-defined, velociraptor R pckage desired format -#' scvelo_format <- "scVelo" -#' -#' scev <- loadFry(fryDir=testdat$parent_dir, outputFormat=scvelo_format, nonzero=TRUE) -#' SummarizedExperiment::assayNames(scev) -#' -#' @name loadFry -NULL - -#' @export -#' @rdname loadFry -#' @importFrom jsonlite fromJSON -#' @importFrom Matrix readMM -#' @importFrom utils read.table -load_fry_raw <- function(fryDir, quiet = FALSE) { - # Check `fryDir` is legit - if (!quiet) { - message("locating quant file") - } - quant.file <- file.path(fryDir, "alevin", "quants_mat.mtx") - if (!file.exists(quant.file)) { - stop("The `fryDir` directory provided does not look like a directory generated from alevin-fry:\n", - sprintf("Missing quant file: %s", quant.file) - ) - } - - # Since alevin-fry 0.4.1, meta_info.json is changed to quant.json, we check both - # read in metadata - qfile <- file.path(fryDir, "quant.json") - if (!file.exists(qfile)) { - qfile <- file.path(fryDir, "meta_info.json") - } - - # read in metadata - meta.info <- fromJSON(qfile) - ng <- meta.info$num_genes - usa.mode <- meta.info$usa_mode - - if (!quiet) { - message("Reading meta data") - message(paste0("USA mode: ", usa.mode)) - } - - # if usa mode, each gene gets 3 rows, so the actual number of genes is ng/3 - if (usa.mode) { - if (ng %% 3 != 0) { - stop("The number of quantified targets is not a multiple of 3") - } - ng <- as.integer(ng/3) - } - - # read in count matrix, gene names, and barcodes - count.mat <- as(readMM(file = quant.file), "dgCMatrix") - afg <- read.table(file.path(fryDir, "alevin", "quants_mat_cols.txt"), - strip.white = TRUE, header = FALSE, nrows = ng, - col.names = c("gene_ids")) - rownames(afg) <- afg$gene_ids - afc <- read.table(file.path(fryDir, "alevin", "quants_mat_rows.txt"), - strip.white = TRUE, header = FALSE, - col.names = c("barcodes")) - rownames(afc) <- afc$barcodes - - if (!quiet) { - message(paste("Processing", ng, "genes", "and", nrow(count.mat), "barcodes")) - } - - list(count.mat = count.mat, gene.names = afg, barcodes = afc, meta.data = list(num.genes = ng, usa.mode = usa.mode)) -} - -#' @export -#' @rdname loadFry -#' @importFrom Matrix t colSums -#' @importFrom SingleCellExperiment SingleCellExperiment -loadFry <- function(fryDir, - outputFormat = "scRNA", - nonzero = FALSE, - quiet = FALSE) { - - # load in fry result - fry.raw <- load_fry_raw(fryDir, quiet) - meta.data <- fry.raw$meta.data - - - # if in usa.mode, sum up counts in different status according to which.counts - if (meta.data$usa.mode) { - # preparation - predefined.format <- list("scRNA" = list("counts" = c("S", "A")), - "snRNA" = list("counts" = c("U", "S", "A")), - "velocity" = list("spliced" = c("S", "A"), "unspliced" = c("U")), - "scVelo" = list("counts" = c("S", "A"), "spliced" = c("S", "A"), "unspliced" = c("U")), - "raw" = list("spliced" = c("S"), "unspliced" = c("U"), "ambiguous" = c("A")) - ) - valid.counts <- c("U", "S", "A") - - # check outputFormat - if (is.character(outputFormat)) { - # Check whether outputFormat is a predefined format - if (! (outputFormat %in% names(predefined.format))) { - stop("Provided outputFormat string is invalid. Please check the function description -for the list of predifined format") - } - - if (!quiet) { - message("Using pre-defined output format: ", outputFormat) - } - - # get the assays - output.assays <- predefined.format[[outputFormat]] - - } else if (is.list(outputFormat)) { - # check whether user-defined assays are all - for (assay.name in names(outputFormat)) { - if (is.null(outputFormat[[assay.name]])) { - stop(paste0("The provided assay '", assay.name, "' is empty. Please remove it")) - } - else if (!all(outputFormat[[assay.name]] %in% valid.counts)) { - stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay") - } - } - if (!all(unlist(outputFormat) %in% valid.counts)) { - stop("Please use U, S and A ONLY to indicate which counts will be considered to build a assay") - } - - output.assays <- outputFormat - - if (!quiet) { - message("Using user-defined output assays") - } - } - - # If we are here, the output.assays is valid. - # then we check the assay names in nonzero - if (is.logical(nonzero)) { - if (nonzero) { - nonzero <- names(output.assays) - } else { - if (is.character(outputFormat) && outputFormat == "scVelo") { - nonzero <- c("counts") - } else { - nonzero <- c() - } - } - } else if (is.character(nonzero)) { - if (length(nonzero) > 0) { - for (idx in seq_along(nonzero)) { - if (!nonzero[idx] %in% names(output.assays)) { - warning(paste0("In the provided nonzero vector, '", - nonzero[idx], - "' is not one of the output assays, ignored")) - nonzero <- nonzero[-idx] - } - } - } - } else { - warning("Invalid nonzero, ignored") - nonzero <- c() - } - - # assembly - alist <- vector(mode = "list", length = length(output.assays)) - names(alist) <- names(output.assays) - ng <- meta.data$num.genes - rd <- list("S" = seq(1, ng), "U" = seq(ng + 1, 2 * ng), - "A" = seq(2 * ng + 1, 3 * ng)) - # fill in each assay - for (assay.name in names(output.assays)) { - which.counts <- output.assays[[assay.name]] - alist[[assay.name]] <- fry.raw$count.mat[, rd[[which.counts[1]]], drop = FALSE] - if (length(which.counts) > 1) { - # build assay - for (wc in which.counts[-1]) { - alist[[assay.name]] <- alist[[assay.name]] + fry.raw$count.mat[, rd[[wc]], drop = FALSE] - } - } - alist[[assay.name]] <- t(alist[[assay.name]]) - if (!quiet) { - message(paste(c(paste0("Building the '", - assay.name, - "' assay, which contains"), - which.counts), - collapse = " ")) - } - } - } else { - if (!quiet) { - message("Not in USA mode, ignore argument outputFormat") - } - - # define output matrix - alist <- list(counts = t(fry.raw$count.mat)) - } - - if (!quiet) { - message("Constructing output SingleCellExperiment object") - } - - # create SingleCellExperiment object - sce <- SingleCellExperiment(alist, - colData = fry.raw$barcodes, - rowData = fry.raw$gene.names) - - # filter all zero cells in zero, one or multiple assays - for (assay.name in nonzero) { - sce <- sce[, colSums(assay(sce, assay.name)) > 0] - } - - if (!quiet) { - message("Done") - } - - sce -} diff -Nru r-bioc-fishpond-2.0.1+ds/R/plots.R r-bioc-fishpond-2.2.0+ds/R/plots.R --- r-bioc-fishpond-2.0.1+ds/R/plots.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/plots.R 2022-04-26 18:25:58.000000000 +0000 @@ -49,6 +49,11 @@ #' be drawn thin (the default switches from 0 [not thin] #' to 1 [thinner] at n=150 cells, and from 1 [thinner] #' to 2 [thinnest] at n=400 cells) +#' @param shiftX when \code{x} is continuous and \code{cov} +#' is provided, the amount to shift the values on the x-axis +#' to improve visibility of the point and line ranges +#' (will be subtracted from the first level of \code{cov} +#' and added to the second level of \code{cov}) #' #' @return nothing, a plot is displayed #' @@ -77,37 +82,36 @@ q=qnorm(.975), applySF=FALSE, reorder, - thin) { - - hasInfReps <- any(grepl("infRep", assayNames(y))) - # logical switch for if variance assay is present - # if not, just show the count or mean - showVar <- TRUE - if (!hasInfReps) { - showVar <- "variance" %in% assayNames(y) - if (useMean & !("mean" %in% assayNames(y))) { - message("using 'counts' assay, as 'mean' is missing, see argument 'useMean'") - useMean <- FALSE - } - } + thin, + shiftX) { + + # define key variables stopifnot(x %in% names(colData(y))) condition <- colData(y)[[x]] # whether x is a factor variable or numeric (e.g. pseudotime) xfac <- is(condition, "factor") if (!xfac) stopifnot(is(condition, "numeric")) + + # boxplot depends on whether there are inf reps (and other factors) + hasInfReps <- any(grepl("infRep", assayNames(y))) + # boxplot if inf reps are present and x factorial + drawBoxplot <- hasInfReps & xfac stopifnot(length(colsDrk) == length(colsLgt)) if (xfac) { + # define colors for boxplot ncond <- nlevels(condition) stopifnot(ncond <= length(colsDrk)) colsDrk <- colsDrk[seq_len(ncond)] colsLgt <- colsLgt[seq_len(ncond)] - # for numeric 'x' we will not make boxplots... - } else { - if (hasInfReps) { - hasInfReps <- FALSE - if (!"variance" %in% assayNames(y)) - stop("'variance' assay is missing, use computeInfRV() first") + } + if (!drawBoxplot) { + if (useMean & !("mean" %in% assayNames(y))) { + message("using 'counts' assay, as 'mean' is missing, see argument 'useMean'") + useMean <- FALSE } + # make sure variance assay is present + if (!"variance" %in% assayNames(y)) + stop("'variance' assay is missing, use computeInfRV() first") } if (missing(xaxis)) { if (xfac) { @@ -172,7 +176,7 @@ } } if (reorder) { - if (hasInfReps) { + if (drawBoxplot) { infReps <- assays(y[idx,])[grep("infRep",assayNames(y))] value <- colMeans(unlist(infReps)) } else { @@ -222,8 +226,10 @@ col.hglt <- colsLgt[covariate] } } - ### boxplot ### - if (hasInfReps) { + ############# + ## boxplot ## + ############# + if (drawBoxplot) { infReps <- assays(y[idx,])[grep("infRep",assayNames(y))] cts <- unlist(infReps)[,o] ymax <- max(cts) @@ -235,21 +241,26 @@ } boxplot2(cts, col=col, col.hglt=col.hglt, ylim=ylim, xlab=xlabel, ylab=ylab, main=main) - ### point and line plot by 'x' levels or numeric 'x' ### + ################################################ + ## point and line plot by factor or numeric x ## + ################################################ } else { which.assay <- if (useMean) "mean" else "counts" cts <- assays(y)[[which.assay]][idx,o] - if (showVar) { - sds <- sqrt(assays(y)[["variance"]][idx,o]) - } + sds <- sqrt(assays(y)[["variance"]][idx,o]) if (applySF & !is.null(y$sizeFactor)) { cts <- cts / y$sizeFactor[o] - if (showVar) { - sds <- sds / y$sizeFactor[o] - } + sds <- sds / y$sizeFactor[o] ylab <- "scaled counts" } - ymax <- if (showVar) max(cts + q*sds) else max(cts) + # shifting x values + if (!missing(shiftX)) { + stopifnot(!is.null(cov)) + covLvl1 <- covariate == levels(covariate)[1] + condition[covLvl1] <- condition[covLvl1] - shiftX + condition[!covLvl1] <- condition[!covLvl1] + shiftX + } + ymax <- max(cts + q*sds) ymin <- 0 if (xfac & !is.null(cov)) { ymin <- -0.02 * ymax @@ -270,16 +281,14 @@ } seg.lwd <- if (thin == 0) 2 else if (thin == 1) 1 else 3 seg.col <- if (thin < 2) col else col.hglt - if (showVar) { - if (xfac) { - segments(seq_along(cts), pmax(cts - q*sds, 0), - seq_along(cts), cts + q*sds, - col=seg.col, lwd=seg.lwd) - } else { - segments(condition, pmax(cts - q*sds, 0), - condition, cts + q*sds, - col=seg.col, lwd=seg.lwd) - } + if (xfac) { + segments(seq_along(cts), pmax(cts - q*sds, 0), + seq_along(cts), cts + q*sds, + col=seg.col, lwd=seg.lwd) + } else { + segments(condition, pmax(cts - q*sds, 0), + condition, cts + q*sds, + col=seg.col, lwd=seg.lwd) } pts.pch <- if (thin == 0) 22 else 15 pts.lwd <- if (thin == 0) 1. else 1 @@ -339,7 +348,7 @@ segments(s-w/2, qs[,5], s+w/2, qs[,5], col=col) } -#' MA plot +#' MA plot - log fold change over average counts #' #' @param y a SummarizedExperiment (see \code{swish}) #' @param alpha the FDR threshold for coloring points diff -Nru r-bioc-fishpond-2.0.1+ds/R/readEDS.R r-bioc-fishpond-2.2.0+ds/R/readEDS.R --- r-bioc-fishpond-2.0.1+ds/R/readEDS.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/readEDS.R 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -#' readEDS - a utility function for quickly reading in Alevin's EDS format -#' -#' @param numOfGenes number of genes -#' @param numOfOriginalCells number of cells -#' @param countMatFilename pointer to the EDS file, \code{quants_mat.gz} -#' @param tierImport whether the \code{countMatFilename} refers to a quants tier file -#' -#' @return a genes x cells sparse matrix, of the class \code{dgCMatrix} -#' -#' @useDynLib fishpond -#' @export -readEDS <- function(numOfGenes, numOfOriginalCells, countMatFilename, tierImport=FALSE) { - getSparseMatrix(numOfGenes, numOfOriginalCells, path.expand(countMatFilename), tierImport) -} diff -Nru r-bioc-fishpond-2.0.1+ds/R/salmon-EC.R r-bioc-fishpond-2.2.0+ds/R/salmon-EC.R --- r-bioc-fishpond-2.0.1+ds/R/salmon-EC.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/salmon-EC.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,193 @@ +#' Construct a sparse matrix of transcript compatibility counts from salmon +#' output +#' +#' Constructs a count matrix with equivalence class identifiers +#' in the rows. The count matrix is generated from one or multiple +#' `eq_classes.txt` files that have been created by running salmon with the +#' --dumpEq flag. Salmon - \url{https://doi.org/10.1038/nmeth.4197} +#' +#' @rdname salmonEC +#' +#' @param paths `Charachter` or `character vector`, path specifying the +#' location of the `eq_classes.txt` files generated with salmon. +#' @param tx2gene A `dataframe` linking transcript identifiers to their +#' corresponding gene identifiers. Transcript identifiers must be in a column +#' `isoform_id`. Corresponding gene identifiers must be in a column `gene_id`. +#' @param multigene `Logical`, should equivalence classes that are compatible +#' with multiple genes be retained? Default is `FALSE`, removing such ambiguous +#' equivalence classes. +#' @param ignoreTxVersion logical, whether to split the isoform id on the '.' +#' character to remove version information to facilitate matching with the +#' isoform id in `tx2gene` (default FALSE). +#' @param ignoreAfterBar logical, whether to split the isoform id on the '|' +#' character to facilitate matching with the isoform id in `tx2gene` +#' (default FALSE). +#' @param quiet `Logical`, set `TRUE` to avoid displaying messages. +#' +#' @author Jeroen Gilis +#' +#' @return A list with two elements. The first element `counts` is a sparse +#' count matrix with equivalence class identifiers in the rows. The second +#' element `tx2gene_matched` allows for linking those identifiers to their +#' respective transcripts and genes. +#' +#' @section Details: +#' The resulting count matrix uses equivalence class identifiers as rownames. +#' These can be linked to respective transcripts and genes using the +#' `tx2gene_matched` element of the output. Specifically, if the equivalence +#' class identifier reads 1|2|8, then the equivalence class is compatible with +#' the transcripts and their respective genes in rows 1, 2 and 8 of +#' `tx2gene_matched`. +#' +#' @importFrom Matrix sparseMatrix sparseVector +#' @export +salmonEC <- function(paths, tx2gene, multigene = FALSE, ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, quiet = FALSE){ + + if (!requireNamespace("data.table", quietly=TRUE)) { + stop("alevinEC() requires CRAN package data.table") + } + + if(!all(file.exists(paths))){ + stop("The following paths do not exist: ", + paste(paths[which(!file.exists(paths))], "\n") + ) + } + + if(!isTRUE(all.equal(colnames(tx2gene), + c("isoform_id","gene_id")))){ + stop("tx2gene does not contain columns gene_id and isoform_id") + } + + # get line number where the TCCs start (same for each file, get from first file) + startread <- data.table::fread(paths[1], nrows=1, sep = " ", + quote = "", header = FALSE)$V1 + 2 + + # order tx2gene like transcripts in ECC files + tx_lookup <- data.table::fread(paths[1], skip = 2, nrows = startread-2, + sep = " ", quote = "", header = FALSE)$V1 + + if (ignoreTxVersion) { + tx2gene$isoform_id <- sub("\\..*", "", tx2gene$isoform_id) + tx_lookup <- sub("\\..*", "", tx_lookup) + } else if (ignoreAfterBar) { + tx2gene$isoform_id <- sub("\\|.*", "", tx2gene$isoform_id) + tx_lookup <- sub("\\|.*", "", tx_lookup) + } + + if (!any(tx_lookup %in% tx2gene$isoform_id)) { + txFromFile <- paste0("Example IDs (file): [", + paste(head(tx_lookup,3),collapse=", "), + ", ...]") + txFromTable <- paste0("Example IDs (tx2gene): [", + paste(head(tx2gene$isoform_id,3), + collapse=", "),", ...]") + stop(paste0(" + None of the transcripts in the quantification files are present + in the first column of tx2gene. Check to see that you are using + the same annotation for both.\n\n",txFromFile,"\n\n",txFromTable, + "\n\n This can sometimes (not always) be fixed using 'ignoreTxVersion' or 'ignoreAfterBar'.\n\n")) + } + + # matching of tx_lookup and tx2gene + tx2gene <- tx2gene[match(tx_lookup, tx2gene$isoform_id),] + + ntxmissing <- sum(!tx_lookup %in% tx2gene$isoform_id) + if (ntxmissing > 0) message("transcripts missing from tx2gene: ", ntxmissing) + + # For each sample/cell generate a list of 2: + # (1) names of the equivalence classes and (2) corresponding counts + unique_ecs <- c() + sample_list <- vector("list", length = length(paths)) + + for (i in seq_along(paths)) { + if (!quiet) svMisc::progress(i, max.value=length(paths), + init=(i==1), gui=FALSE) + sample_list[[i]] <- readEq(file = paths[i], + geneSet = as.character(tx2gene[,"gene_id"]), + startread = startread, + multigene = multigene) + unique_ecs <- c(unique_ecs, setdiff(sample_list[[i]][[1]],unique_ecs)) + } + + # Constructing sparseMatrix output + # message("Constructing sparseMatrix output \n") + vec_list <- vector("list", length = length(sample_list)) + for (i in seq_along(sample_list)) { + # if (!quiet) svMisc::progress(i, max.value=length(paths), + # init=(i==1), gui=FALSE) + # wrangle elements sample_list to use as input for sparseMatrix + ## wrangle rows + EC_names <- sample_list[[i]][[1]] + RowIdx <- match(EC_names, unique_ecs) + + ## wrangle entries + X <- sample_list[[i]][[2]] + + vec_list[[i]] <- sparseVector( + i = RowIdx, + x = X, + length = length(unique_ecs) + ) + } + ECC_mat <- do.call(sv.cbind, vec_list) + rownames(ECC_mat) <- unique_ecs + + rownames(tx2gene) <- seq_len(nrow(tx2gene)) + + return(list(counts = ECC_mat, tx2gene_matched = tx2gene)) +} + +# Helper function that reads eq_classes.txt files and converts them to lists +# with two elements: +# (i) the names of the equivalence classes +# (ii) the count for each equivalence class +# Additionally allows for remove equivalence classes that are compatible with +# multiple genes. +readEq <- function(file, geneSet, startread, multigene){ + + ec_df <- data.table::fread(file, skip=startread, sep = " ", quote = "", + header = FALSE) + eccs <- strsplit(ec_df$V1,"\t",fixed=TRUE) + + # remove first and last -> keep ECCs and not their corresponding TXs and counts + eccs_hlp <- lapply(eccs, function(x) x[-c(1,length(x))]) + + if(!multigene){ # remove ECs corresponding to multiple genes + hlp <- unlist(lapply(eccs_hlp, function(element) { + gene_cur <- geneSet[as.integer(element)+1] + # if annotation not complete -> remove EC with any NA genes + if(any(is.na(gene_cur))){ + return(TRUE) + } else{ + return(length(unique(gene_cur))!=1) + } + })) + eccs_hlp[hlp] <- NULL + eccs[hlp] <- NULL + } + + #ec_txs <- unlist(lapply(eccs_hlp, stringi::stri_c, collapse = "|")) + ec_txs <- unlist(lapply(eccs_hlp, paste, collapse = "|")) + ec_counts <- as.integer(vapply(eccs, data.table::last, + FUN.VALUE = character(1))) + + return(list(ec_txs, ec_counts)) +} + +# Helper function to convert a list of sparseVectors to a sparseMatrix +# obtained from https://stackoverflow.com/questions/8843700, +# response by petrelharp +sv.cbind <- function(...) { + input <- lapply(list(...), as, "dsparseVector") + thelength <- unique(sapply(input,length)) + stopifnot(length(thelength)==1) + return(sparseMatrix( + x = unlist(lapply(input,slot,"x")), + i = unlist(lapply(input,slot,"i")), + p = c(0, cumsum(sapply(input,function(x){length(x@x)}))), + dims = c(thelength, length(input)) + )) +} + + diff -Nru r-bioc-fishpond-2.0.1+ds/R/swish.R r-bioc-fishpond-2.2.0+ds/R/swish.R --- r-bioc-fishpond-2.0.1+ds/R/swish.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/R/swish.R 2022-04-26 18:25:58.000000000 +0000 @@ -1,58 +1,6 @@ -#' Downstream methods for Salmon and Alevin expression data -#' -#' This package provides statistical methods and other tools for -#' working with Salmon and Alevin quantification of RNA-seq data. -#' In particular, it contains the Swish non-parametric method for -#' detecting differential transcript expression (DTE). Swish can -#' also be used to detect differential gene expresion (DGE). -#' -#' The main functions are: -#' \itemize{ -#' \item \code{\link{scaleInfReps}} - scaling transcript or gene expression data -#' \item \code{\link{labelKeep}} - labelling which features have sufficient counts -#' \item \code{\link{swish}} - perform non-parametric differential analysis -#' \item Plots, e.g., \code{\link{plotMASwish}}, \code{\link{plotInfReps}} -#' \item \code{\link{isoformProportions}} - convert counts to isoform proportions -#' \item \code{\link{makeInfReps}} - create pseudo-inferential replicates -#' \item \code{\link{splitSwish}} - split Swish analysis across jobs with Snakemake -#' } -#' -#' All software-related questions should be posted to the Bioconductor Support Site: -#' -#' \url{https://support.bioconductor.org} -#' -#' The code can be viewed at the GitHub repository, -#' which also lists the contributor code of conduct: -#' -#' \url{https://github.com/mikelove/fishpond} -#' -#' @references -#' -#' Swish method: -#' -#' Zhu, A., Srivastava, A., Ibrahim, J.G., Patro, R., Love, M.I. (2019) -#' Nonparametric expression analysis using inferential replicate counts. -#' Nucleic Acids Research. -#' \url{https://doi.org/10.1093/nar/gkz622} -#' -#' Compression, makeInfReps and splitSwish: -#' -#' Van Buren, S., Sarkar, H., Srivastava, A., Rashid, N.U., -#' Patro, R., Love, M.I. (2020) -#' Compression of quantification uncertainty for scRNA-seq counts. -#' bioRxiv. -#' \url{https://doi.org/10.1101/2020.07.06.189639} -#' -#' @author Anqi Zhu, Avi Srivastava, Joseph G. Ibrahim, Rob Patro, Michael I. Love -#' -#' @docType package -#' @name fishpond-package -#' @aliases fishpond-package -#' @keywords package -NULL - -#' swish: SAMseq With Inferential Samples Helps +#' Swish method: differential expression accounting for inferential uncertainty #' +#' The Swish method, or "SAMseq With Inferential Samples Helps". #' Performs non-parametric inference on rows of \code{y} for #' various experimental designs. See References for details. #' @@ -85,15 +33,17 @@ #' 'Note on factor levels') #' @param cov the name of the covariate for adjustment. #' If provided a stratified Wilcoxon in performed. -#' Cannot be used with \code{pair} (unless using \code{cor}) +#' Cannot be used with \code{pair}, unless using +#' either \code{interaction} or \code{cor} #' @param pair the name of the pair variable, which should be the #' number of the pair. Can be an integer or factor. #' If specified, a signed rank test is used #' to build the statistic. All samples across \code{x} must be -#' pairs if this is specified. Cannot be used with \code{cov} -#' (unless using \code{cor}) +#' pairs if this is specified. Cannot be used with \code{cov}, +#' unless using either \code{interaction} or \code{cor} #' @param interaction logical, whether to perform a test of an interaction -#' between \code{x} and \code{cov}. See Details. +#' between \code{x} and \code{cov}. Can use \code{pair} or not. +#' See Details. #' @param cor character, whether to compute correlation of \code{x} #' with the log counts, and signifance testing on the correlation #' as a test statistic. @@ -171,18 +121,6 @@ #' plotInfReps(y, 1, "condition") #' plotInfReps(y, 3, "condition") #' plotInfReps(y, 5, "condition") -#' -#' @importFrom graphics plot points segments rect abline title axis legend -#' @importFrom stats median quantile qnorm rpois runif rnbinom var cor -#' @importFrom utils head tail capture.output read.table write.table read.csv -#' @importFrom methods is as -#' @importFrom SummarizedExperiment SummarizedExperiment -#' assayNames assayNames<- assay assay<- assays assays<- -#' colData colData<- mcols mcols<- -#' @importFrom S4Vectors DataFrame metadata metadata<- -#' @importFrom gtools permutations -#' @importFrom Matrix rowSums -#' @importFrom matrixStats rowRanks rowMedians rowVars rowQuantiles #' #' @export swish <- function(y, x, cov=NULL, pair=NULL, @@ -228,7 +166,7 @@ } if (qvaluePkg == "samr") { if (!requireNamespace("samr", quietly=TRUE)) { - stop("first install the 'samr' package") + stop("first install the 'samr' CRAN package") } } diff -Nru r-bioc-fishpond-2.0.1+ds/README.md r-bioc-fishpond-2.2.0+ds/README.md --- r-bioc-fishpond-2.0.1+ds/README.md 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/README.md 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,77 @@ +# fishpond + +[![R build status](https://github.com/mikelove/fishpond/actions/workflows/check-bioc.yml/badge.svg)](https://github.com/mikelove/fishpond/actions/workflows/check-bioc.yml) + +## Fishpond: downstream methods and tools for expression data + +Fishpond contains a method, `swish()`, for differential transcript and +gene expression analysis of RNA-seq data using inferential replicates. +Also the package contains utilities for working with *Salmon*, +*alevin*, and *alevin-fry* quantification data, including +`loadFry()`. + +## Quick start + +The following paradigm is used for running a Swish analysis: + +``` +y <- tximeta(coldata) # reads in counts and inf reps +y <- scaleInfReps(y) # scales counts +y <- labelKeep(y) # labels features to keep +set.seed(1) +y <- swish(y, x="condition") # simplest Swish case +``` + +## How does Swish work + +Swish accounts for inferential uncertainty in expression estimates +by averaging test statistics over a number of *inferential replicate* +datasets, either posterior samples or bootstrap samples. This is +inspired by a method called +[SAMseq](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4605138/), +hence we named our method *Swish*, for "SAMseq With Inferential +Samples Helps". Averaging over inferential replicates produces a +different test statistic than what one would obtain using only point +estimates for expression level. + +For example, one of the tests possible with `swish()` is a correlation +test of expression level over a condition variable. We can visualize +the distribution of inferential replicates with `plotInfReps()`: + +![](man/figures/plotInfReps.png) + +The test statistic is formed by averaging over these sets of data: + +![](man/figures/swish.gif) + +p-values and q-values are computed through permutation of samples (see +vignette for details on permutation schemes). The Swish method is +described in the following publication: + +> Zhu, A., Srivastava, A., Ibrahim, J.G., Patro, R., Love, M.I. +> "Nonparametric expression analysis using inferential replicate counts" +> *Nucleic Acids Research* (2019) 47(18):e105 +> [PMC6765120](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6765120/) + +## Software issues and support + +You can browse the code or submit an Issue at the following link: + + + +For software support please use the following link and tag with +`swish` or `fishpond`: + + + +## Installation + +This package can be installed via Bioconductor: + +``` +BiocManager::install("fishpond") +``` + +## Funding + +This work was funded by NIH NHGRI R01-HG009937. diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_alevinEC.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_alevinEC.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_alevinEC.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_alevinEC.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,138 @@ +context("alevinEC") +library(data.table) +library(fishpond) + +test_that("Importing transcript compatibility counts from alevin output works",{ + + if (packageVersion("tximportData") >= "1.23.4") { + + # import test data + dir <- system.file("extdata", package="tximportData") + files <- c(file.path(dir,"alevin/mouse1_unst_50/alevin/bfh.txt"), + file.path(dir,"alevin/mouse1_LPS2_50/alevin/bfh.txt")) + file.exists(files) + + tx2gene <- data.table::fread(file.path(dir, "tx2gene_alevin.tsv"), + header = FALSE) + colnames(tx2gene) <- c("isoform_id", "gene_id") + + slow <- FALSE + + # use alevinEC ~10 seconds + EC_mat <- alevinEC(paths = files, + tx2gene = tx2gene, + multigene = FALSE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = TRUE) + + # test output type + expect_true(validObject(EC_mat)) + expect_true(is(EC_mat$counts, "dgCMatrix")) + expect_true(is(EC_mat$counts, "Matrix")) + expect_true(is(EC_mat$tx2gene_matched, "data.frame")) + + # test output dimensions + expect_true(ncol(EC_mat$counts) == 100) # 2 files with 50 cells + + # test barcode names / colnames + expect_true(is(colnames(EC_mat$counts), "character")) + barcodes1 <- data.table::fread(file.path(dir,"alevin/mouse1_unst_50/alevin/quants_mat_rows.txt"), + header = FALSE)[[1]] + barcodes2 <- data.table::fread(file.path(dir,"alevin/mouse1_LPS2_50/alevin/quants_mat_rows.txt"), + header = FALSE)[[1]] + expect_true(all(colnames(EC_mat$counts) == c(barcodes1, barcodes2))) + + # test rownames type + expect_true(is(rownames(EC_mat$counts), "character")) + + if (slow) { + # test quiet argument + expect_silent(alevinEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + # test multigene argument + EC_mat_multi <- alevinEC(paths = files, + tx2gene = tx2gene, + multigene = TRUE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = TRUE) + + ## test output type + expect_true(validObject(EC_mat_multi)) + expect_true(is(EC_mat_multi$counts, "dgCMatrix")) + expect_true(is(EC_mat_multi$counts, "Matrix")) + expect_true(is(EC_mat_multi$tx2gene_matched, "data.frame")) + + ## test output dimensions + expect_true(ncol(EC_mat_multi$counts) == 100) # 2 files with 50 cells + expect_true(nrow(EC_mat_multi$counts) >= nrow(EC_mat$counts)) # multigene=FALSE removes ECs + + ## test barcode names + expect_true(all(colnames(EC_mat_multi$counts) == c(barcodes1, barcodes2))) + + ## test rownames type + expect_true(is(rownames(EC_mat_multi$counts), "character")) + } + + # faulty path specification + files2 <- c(file.path(dir,"alevin/mouse1_unst_50/alevin/wrong.txt"), + file.path(dir,"alevin/mouse1_LPS2_50/alevin/wrangAgain.txt")) + + expect_error(alevinEC(paths = files2, + tx2gene = tx2gene, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + # faulty tx2gene colnames + colnames(tx2gene) <- c("transcripts", "genes") + expect_error(alevinEC(paths = files, + tx2gene = tx2gene, + multigene = FALSE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = TRUE)) + + tx2gene <- as.data.frame.list(tx2gene) + colnames(tx2gene) <- NULL + expect_error(alevinEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + colnames(tx2gene) <- c("isoform_id", "gene_id") + + if (slow) { + # test incomplete tx2gene annotation + expect_message(EC_mat2 <- alevinEC(paths = files, + tx2gene = tx2gene[1:50000,], + multigene = FALSE, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + quiet = TRUE)) + + expect_true(nrow(EC_mat$counts) >= nrow(EC_mat2$counts)) # incomplete annotation + } + + # test faulty tx2gene list (e.g. when not ignoring tx version) + tx2gene_faulty <- tx2gene + tx2gene_faulty$isoform_id <- sub("\\..*", "", tx2gene_faulty$isoform_id) + expect_error(EC_mat_3 <- alevinEC(paths = files, + tx2gene = tx2gene_faulty, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + } + +}) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_allelic.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_allelic.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_allelic.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_allelic.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,47 @@ +context("ase") +library(SummarizedExperiment) +library(fishpond) + +test_that("fishpond code for ASE works", { + + set.seed(1) + + # test reading in wide format for ASE + if (FALSE) { + dir <- "../../../../ase_quants" + names <- list.files(dir, pattern="sample") + files <- file.path(dir, names, "quant.sf") + coldata <- data.frame(files, names, condition=factor(c("A","B"))) + suppressPackageStartupMessages(library(SummarizedExperiment)) + library(tximeta) + se <- importAllelicCounts(coldata, a1="P", a2="M", format="wide") + colData(se) + assayNames(se) + metadata(se)$alleles + + # check TSS aggregation + library(ensembldb) + library(AnnotationHub) + ah <- AnnotationHub() + query(ah, c("Drosophila melanogaster", "release-100")) + Gtf <- ah["AH80008"] + DbFile <- ensDbFromAH(Gtf) + edb <- EnsDb(DbFile) + t2t <- makeTx2Tss(edb) + + se <- importAllelicCounts(coldata, + a1="P", a2="M", + format="wide", + tx2gene=t2t, + ignoreAfterBar=TRUE) + gr <- rowRanges(se) + gr[2,] + t2t[ gr$tx_id[[2]] ] + + se <- importAllelicCounts(coldata, a1="P", a2="M", format="assays") + colData(se) + assayNames(se) + metadata(se)$alleles + } + +}) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_ase.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_ase.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_ase.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_ase.R 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ -context("ase") -library(SummarizedExperiment) -library(fishpond) - -test_that("swish for ASE works", { - - set.seed(1) - - # looking at how the null distribution varies by sample size - par(mfrow=c(3,1)) - for (np in c(8,10,12)) { - y <- makeSimSwishData(n=np*2) - # no scaling - y <- labelKeep(y) - y$pair <- rep(1:np,2) - y <- swish(y, x="condition", pair="pair") - z <- mcols(y)$stat - hist(z[abs(z)<20], breaks=100, xlim=c(-30,30)) - } - - # test reading in wide format for ASE - if (FALSE) { - # this needs to be added as test data somewhere... - dir <- "../../../ase_quants" - names <- list.files(dir) - files <- file.path(dir, names, "quant.sf") - coldata <- data.frame(files, names, condition=factor(c("A","A","B","B"))) - suppressPackageStartupMessages(library(SummarizedExperiment)) - library(tximeta) - se <- importAllelicCounts(coldata, a1="P", a2="M", format="wide") - colData(se) - assayNames(se) - metadata(se)$alleles - se <- importAllelicCounts(coldata, a1="P", a2="M", format="assays") - colData(se) - assayNames(se) - metadata(se)$alleles - } - -}) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_compress.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_compress.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_compress.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_compress.R 2022-04-26 18:25:58.000000000 +0000 @@ -4,8 +4,9 @@ test_that("compressing uncertainty works", { + m <- 100 set.seed(1) - y <- makeSimSwishData(m=200, n=100) + y <- makeSimSwishData(m=m, n=100) y$batch <- factor(rep(3:1,c(30,40,30))) plotInfReps(y, 5, x="condition", reorder=TRUE) dev.off() @@ -13,17 +14,14 @@ dev.off() set.seed(1) - y <- makeSimSwishData(m=200, n=100, meanVariance=TRUE) + y <- makeSimSwishData(m=m, n=100, meanVariance=TRUE) y$batch <- factor(rep(3:1,c(30,40,30))) plotInfReps(y, idx=5, x="condition", reorder=TRUE) dev.off() + plotInfReps(y, idx=5, x="condition", reorder=TRUE, useMean=FALSE) + dev.off() plotInfReps(y, idx=5, x="condition", cov="batch", reorder=TRUE) dev.off() - - y0 <- y - assays(y0) <- assays(y0)[1:3] - plotInfReps(y0, idx=5, x="condition", useMean=FALSE) - dev.off() y2 <- y y2 <- makeInfReps(y2, numReps=50) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_correlation.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_correlation.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_correlation.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_correlation.R 2022-04-26 18:25:58.000000000 +0000 @@ -7,7 +7,8 @@ set.seed(1) n <- 20 - y <- makeSimSwishData(m=500, n=n, null=TRUE) + m <- 250 + y <- makeSimSwishData(m=m, n=n, null=TRUE) nms <- c("counts",paste0("infRep",1:20)) lambda1 <- exp(seq(4, 5, length.out=n)) lambda2 <- rev(lambda1) @@ -38,7 +39,7 @@ set.seed(5) n <- 20 - y <- makeSimSwishData(m=500, n=n, null=TRUE) + y <- makeSimSwishData(m=m, n=n, null=TRUE) nms <- c("counts",paste0("infRep",1:20)) lambda1 <- exp(seq(4, 5, length.out=n/2)) lambda2 <- rev(lambda1) @@ -69,15 +70,17 @@ #plot(-log10(mcols(y)$pvalue[1:30])) mcols(y)[1:4,] - #plotInfReps(y, 1, x="cov", cov="condition", - # legend=TRUE, legendPos="top") - + plotInfReps(y, 1, x="cov", cov="condition", + shiftX=.0005, + legend=TRUE, legendPos="top") + dev.off() + ### up-down-up pattern ### set.seed(1) n <- 40 - y <- makeSimSwishData(m=500, n=n, null=TRUE) + y <- makeSimSwishData(m=m, n=n, null=TRUE) nms <- c("counts",paste0("infRep",1:20)) cov <- rep(1:(n/4),each=2) lambda1 <- (cov - 1)*(cov - 10)*(cov - 10) + 100 @@ -108,7 +111,9 @@ #plot(-log10(mcols(y)$pvalue[1:30])) mcols(y)[1:4,] - #plotInfReps(y, 1, x="cov", cov="condition", - # legend=TRUE, legendPos="top") + plotInfReps(y, 1, x="cov", cov="condition", + shiftX=.05, + legend=TRUE, legendPos="bottom") + dev.off() }) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_readEDS.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_readEDS.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_readEDS.R 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_readEDS.R 2022-04-26 18:25:58.000000000 +0000 @@ -2,29 +2,36 @@ library(fishpond) test_that("Reading in Alevin EDS format works", { - dir <- system.file("extdata/alevin/neurons_900_v014/alevin", package="tximportData") - files <- file.path(dir,"quants_mat.gz") - file.exists(files) - barcode.file <- file.path(dir, "quants_mat_rows.txt") - gene.file <- file.path(dir, "quants_mat_cols.txt") - tier.file <- file.path(dir,"quants_tier_mat.gz") - cell.names <- readLines(barcode.file) - gene.names <- readLines(gene.file) - num.cells <- length(cell.names) - num.genes <- length(gene.names) + ## dir0 <- system.file("extdata", package="tximportData") + ## samps <- list.files(file.path(dir, "alevin")) + ## dir <- file.path(dir,"alevin",samps[3],"alevin") + ## file.exists(file.path(dir, "quants_mat.gz")) + + ## barcode.file <- file.path(dir, "quants_mat_rows.txt") + ## gene.file <- file.path(dir, "quants_mat_cols.txt") + ## tier.file <- file.path(dir,"quants_tier_mat.gz") + ## cell.names <- readLines(barcode.file) + ## gene.names <- readLines(gene.file) + ## num.cells <- length(cell.names) + ## num.genes <- length(gene.names) - # reading in quants - mat <- readEDS(numOfGenes=num.genes, numOfOriginalCells=num.cells, countMatFilename=files) + ## # reading in quants + ## mat <- readEDS(numOfGenes=num.genes, + ## numOfOriginalCells=num.cells, + ## countMatFilename=files) - expect_equal(nrow(mat), num.genes) - expect_equal(ncol(mat), num.cells) - cts <- mat@x - # max count is < 1 million for this dataset - expect_lte(max(cts), 1e6) - expect_gte(min(cts), 0) + ## expect_equal(nrow(mat), num.genes) + ## expect_equal(ncol(mat), num.cells) + ## cts <- mat@x + ## # max count is < 1 million for this dataset + ## expect_lte(max(cts), 1e6) + ## expect_gte(min(cts), 0) - # attempt reading in tier file - files <- tier.file - tier <- readEDS(numOfGenes=num.genes, numOfOriginalCells=num.cells, countMatFilename=files, tierImport=TRUE) + ## # attempt reading in tier file + ## files <- tier.file + ## tier <- readEDS(numOfGenes=num.genes, + ## numOfOriginalCells=num.cells, + ## countMatFilename=files, + ## tierImport=TRUE) }) diff -Nru r-bioc-fishpond-2.0.1+ds/tests/testthat/test_salmonEC.R r-bioc-fishpond-2.2.0+ds/tests/testthat/test_salmonEC.R --- r-bioc-fishpond-2.0.1+ds/tests/testthat/test_salmonEC.R 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/tests/testthat/test_salmonEC.R 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,93 @@ +context("salmonEC") +library(data.table) +library(fishpond) + +test_that("Importing transcript compatibility counts from salmon output works",{ + + + if (packageVersion("tximportData") >= "1.23.4") { + + # import test data + dir <- system.file("extdata", package="tximportData") + files <- c(file.path(dir,"salmon_ec/SRR7311351/aux_info/eq_classes.txt"), + file.path(dir,"salmon_ec/SRR7311386/aux_info/eq_classes.txt")) + file.exists(files) + + tx2gene <- read.csv2(file.path(dir, "salmon_ec/tx2gene_tasic.csv"), + header = TRUE, row.names = NULL) + tx2gene <- tx2gene[,c(2,3)] + colnames(tx2gene) <- c("isoform_id", "gene_id") + + slow <- FALSE + + # use salmonEC ~9 seconds + EC_mat <- salmonEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = TRUE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = FALSE) + + # test output type + expect_true(validObject(EC_mat)) + expect_true(length(EC_mat) == 2) # list of two + expect_true(is(EC_mat$counts, "dgCMatrix")) + expect_true(is(EC_mat$counts, "Matrix")) + expect_true(is(EC_mat$tx2gene_matched, "data.frame")) + + # test output dimensions + expect_true(ncol(EC_mat$counts) == 2) # 2 cells + + # test colnames + expect_true(is.null(colnames(EC_mat$counts))) + + # test rownames type + expect_true(is(rownames(EC_mat$counts), "character")) + + if (slow) { + # test quiet argument + expect_silent(salmonEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = TRUE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + # test multigene argument + EC_mat_multi <- salmonEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = TRUE, + ignoreAfterBar = FALSE, + multigene = TRUE, + quiet = TRUE) + + # test output type + expect_true(validObject(EC_mat_multi)) + expect_true(length(EC_mat_multi) == 2) # list of two + expect_true(is(EC_mat_multi$counts, "dgCMatrix")) + expect_true(is(EC_mat_multi$tx2gene_matched, "data.frame")) + + expect_true(nrow(EC_mat_multi$counts) >= nrow(EC_mat$counts)) # multigene=FALSE removes ECs + + # test patial but correct tx2gene list + expect_message(EC_mat_2 <- salmonEC(paths = files, + tx2gene = tx2gene[1:50000,], + ignoreTxVersion = TRUE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + expect_true(validObject(EC_mat_2)) + expect_true(nrow(EC_mat$counts) >= nrow(EC_mat_2$counts)) # missing transcripts + } + + # test faulty tx2gene list (e.g. when not ignoring tx version) + expect_error(EC_mat_3 <- salmonEC(paths = files, + tx2gene = tx2gene, + ignoreTxVersion = FALSE, + ignoreAfterBar = FALSE, + multigene = FALSE, + quiet = TRUE)) + + } +}) diff -Nru r-bioc-fishpond-2.0.1+ds/vignettes/allelic.Rmd r-bioc-fishpond-2.2.0+ds/vignettes/allelic.Rmd --- r-bioc-fishpond-2.0.1+ds/vignettes/allelic.Rmd 1970-01-01 00:00:00.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/vignettes/allelic.Rmd 2022-04-26 18:25:58.000000000 +0000 @@ -0,0 +1,401 @@ +--- +title: "SEESAW - Allelic expression analysis with Salmon and Swish" +date: "`r format(Sys.Date(), '%m/%d/%Y')`" +output: + rmarkdown::html_document: + highlight: tango + toc: true + toc_float: true +bibliography: library.bib +vignette: | + %\VignetteIndexEntry{2. SEESAW - Allelic expression analysis with Salmon and Swish} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r setup, echo=FALSE, results="hide"} +knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, dev="png", + message=FALSE, error=FALSE, warning=FALSE) +``` + +# Introduction + +In this vignette, we describe usage of a suite of tools, **SEESAW**, +Statistical Estimation of allelic Expression using Salmon and +Swish. + +Running SEESAW involves generation of a diploid transcriptome +(e.g. using [g2gtools](http://churchill-lab.github.io/g2gtools/), +construction of a diploid Salmon index (specifying +`--keepDuplicates`), followed by Salmon quantification with a number of +[bootstrap inferential replicates](https://salmon.readthedocs.io/en/latest/salmon.html#numbootstraps) +(we recommend 30 bootstrap replicates). +These three steps (diploid reference preparation, indexing, quantification +with bootstraps) provide the input data for the following statistical +analyses in R/Bioconductor. The steps shown in this vignette leverage +Bioconductor infrastructure including *SummarizedExperiment* for +storage of input data and results, *tximport* for data import, and +*GRanges* and *Gviz* for plotting. + +In short the SEESAW steps are as listed, and diagrammed below: + +1. g2gtools (diploid reference preparation) +2. Salmon indexing with `--keepDuplicates` +3. Salmon quantification with bootstraps +4. `makeTx2Tss()` aggregates data to TSS-level (optional) +5. `importAllelicCounts()` creates a *SummarizedExperiment* +6. Swish analysis: `labelKeep()` and `swish()` (skip scaling) +7. Plotting + +```{r echo=FALSE} +knitr::include_graphics("images/SEESAW.png") +``` + +SEESAW allows for testing *global allelic imbalance* across all +samples (pairwise testing within each individual), as well as +*differential or dynamic allelic imbalance* (pairwise allelic fold +changes estimated within individual, followed by testing across or +along an additional covariate). Each of these allelic imbalance (AI) +analyses takes into account the potentially heterogeneous amount of +inferential uncertainty per sample, per feature (transcript, +transcript-group, or gene), and per allele. + +Below we demonstrate an analysis where transcripts are grouped by their +transcription start site (TSS), although gene-level or +transcript-level analysis is also possible. New plotting functions +added to *fishpond* facilitate visualization of allelic and isoform +changes at different resolutions, alongside gene models. In the first +example, we perform global AI testing, and in the second example we +perform dynamic AI testing, in both cases on simulated data associated +with human genes. + +# Linking transcripts to TSS + +We begin assuming steps 1-3 have been completed. We can use the +`makeTx2Tss` function to generate a *GRanges* object `t2g` that +connects transcripts to transcript groups. + + +```{r} +suppressPackageStartupMessages(library(ensembldb)) +library(EnsDb.Hsapiens.v86) +library(fishpond) +edb <- EnsDb.Hsapiens.v86 +t2g <- makeTx2Tss(edb) # GRanges object +mcols(t2g)[,c("tx_id","group_id")] +``` + +Alternatively for gene-level analysis, one could either prepare a +`t2g` data.frame with `tx_id` and `gene_id` columns, or a `t2g` +*GRanges* object with a column `group_id` that is equal to `gene_id`. + +# Importing allelic counts + +Here we will use simulated data, but we can import allelic counts with +the `importAllelicCounts()` function. It is best to read over the +manual page for this function. For TSS-level analysis, the `t2g` +*GRanges* generated above should be passed to the `tx2gene` +argument. This will summarize transcript-level counts to the TSS +level, and will attach `rowRanges` that provide the genomic locations +of the grouped transcripts. + +# Filtering features that have no information + +Because we use `--keepDuplicates` in the step when we build the Salmon +index, there will be a number of features in which there is no +information about the allelic expression in the reads. We can find +these features in bootstrap data by examining when the inferential +replicates are nearly identical for the two alleles, as this is how +the EM will split the reads. Removing these features avoids downstream +problems during differential testing. Code for this filtering follows: + +```{r eval=FALSE} +n <- ncol(y)/2 +rep1a1 <- assay(y, "infRep1")[,y$allele == "a1"] +rep1a2 <- assay(y, "infRep1")[,y$allele == "a2"] +mcols(y)$someInfo <- rowSums(abs(rep1a1 - rep1a2) < 1) < n +y <- y[ mcols(y)$someInfo, ] +``` + +# Testing for allelic imbalance across samples + +We begin by generating a simulated data object that resembles what one +would obtain with `importAllelicCounts()`. The import function +arranges the `a2` (non-effect) allelic counts first, followed by the +`a1` (effect) allelic counts. Allelic ratios are calculated at +`a1/a2`. + +```{r} +suppressPackageStartupMessages(library(SummarizedExperiment)) +``` + +```{r} +set.seed(1) +y <- makeSimSwishData(allelic=TRUE) +colData(y) +levels(y$allele) # a1/a2 allelic fold changes +``` + +A hidden code chunk is used to add ranges from the *EnsDb* to the +simulated dataset. For a real dataset, the ranges would be added +either by `importAllelicCounts` (if using `tx2gene`) or could be added +manually for transcript- or gene-level analysis, using the +`rowRanges<-` setter function. The ranges are only needed for the +`plotAllelicGene` plotting function below. + +``` + +``` + +```{r echo=FALSE} +# hidden chunk to add ranges to the `se` +tss <- t2g[!duplicated(t2g$group_id)] +tss$symbol <- mapIds(edb, tss$gene_id, "SYMBOL", "GENEID") +names(tss) <- paste0(tss$symbol, "-", tss$tss) +mcols(tss) <- mcols(tss)[,c("tx_id","gene_id","tss","group_id","symbol")] +# slow... +#tx_id <- CharacterList(split(t2g$tx_id,t2g$group_id)) +#tss$tx_id <- tx_id[names(tss)] +tab <- table(tss$gene_id) +tss$ntss <- as.numeric(tab[tss$gene_id]) +tss <- tss[tss$ntss > 1 & tss$ntss < 5 & seqnames(tss) == "1"] +tss <- tss[order(tss$gene_id),] +tss <- tss[43:1042] +# swap 2nd and 3rd isoform of first gene +tss[2:3] <- tss[3:2] +rowRanges(y) <- tss +``` + +We can already plot a heatmap of allelic ratios, before performing +statistical testing. We can see in the first gene, *ADSS*, there +appear to be two groups of transcripts with opposing allelic fold +change. SEESAW makes use of *pheatmap* for plotting a heatmap of +allelic ratios. + +```{r fig.dim=c(7,3.5)} +y <- computeInfRV(y) # for posterior mean, variance +gene <- rowRanges(y)$gene_id[1] +idx <- mcols(y)$gene_id == gene +plotAllelicHeatmap(y, idx=idx) +``` + +The following two functions perform a Swish analysis, comparing the +allelic counts within sample, while accounting for uncertainty in the +assignment of the reads. The underlying test statistic is a Wilcoxon +signed-rank statistic. + +```{r} +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample") +``` + +# Plotting results + +We can return to the heatmap, and now add q-values, etc. For details +on adding metadata to a *pheatmap* plot object, see `?pheatmap`. + +```{r fig.dim=c(8,4)} +dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +plotAllelicHeatmap(y, idx=idx, annotation_row=dat) +``` + +In order to visualize the inferential uncertainty, we can make use of +`plotInfReps()`: + +```{r fig.dim=c(5,5)} +par(mfrow=c(2,1), mar=c(1,4.1,2,2)) +plotInfReps(y, idx=1, x="allele", cov="sample", xaxis=FALSE, xlab="") +plotInfReps(y, idx=2, x="allele", cov="sample", xaxis=FALSE, xlab="") +``` + +SEESAW provides `plotAllelicGene()` in order to build visualization of +Swish test statistics, allelic proportions, and isoform proportions, +in a genomic context, making use of *Gviz*. The first three arguments +are the *SummarizedExperiment* object, the name of a gene (should +match `gene_id` column), and a *TxDb* or *EnsDb* to use for plotting +the gene model at the top. The statistics and proportions are then +plotted at the first position of the feature (`start` for `+` features +and `end` for `-` features). + +```{r fig.dim=c(8,7)} +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb) +``` + +You can also specify the gene using `symbol`: + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, symbol="ADSS", db=edb) +``` + +In the allelic proportion and isoform proportion tracks, a line is +drawn through the mean proportion for a2 and a1 allele, and for the +isoform proportion, across samples, at the start site for each +transcript group. The line is meant only to help visualize the mean +value as it may change across transcript groups, but the line has no +meaning in the ranges in between features. That is, unlike continuous +genomic features (methylation or accessibility), there is no meaning +to the allelic proportion or isoform proportion outside of measured +start sites of transcription. + +We can further customize the plot, for example, +changing the labels displayed on the gene model, and changing the +labels for the alleles. An ideogram can be added with `ideogram=TRUE`, +although this requires connecting to an external FTP site. + +See `importAllelicGene()` manual page for more details. + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, gene, edb, + transcriptAnnotation="transcript", + labels=list(a2="maternal",a1="paternal")) +``` + +We can also customize the display of the alleles in the +`plotInfReps()` plots, by adding a new factor, while carefully noting +the existing and new allele labels, to make sure the annotation is +correct: + +```{r fig.dim=c(5,4)} +y$allele_new <- y$allele +# note a2 is non-effect, a1 is effect: +levels(y$allele) +# replace a2 then a1: +levels(y$allele_new) <- c("maternal","paternal") +plotInfReps(y, idx=1, x="allele_new", + legend=TRUE, legendPos="bottom") +``` + +# Testing for dynamic allelic imbalance + +Above, we tested for global AI, where the allelic fold change is +consistent across all samples. We can also test for differential or +dynamic AI, by adding specification of a `cov` (covariate) which can +be either a two-group factor, or a continuous variable. For continuous +variable, the user should specify a correlation test, either +`cor="pearson"` or `"spearman"`. + +```{r} +set.seed(1) +y <- makeSimSwishData(dynamic=TRUE) +colData(y) +``` + +Again, a hidden code chunk adds ranges to our simulation data. + +``` + +``` + +```{r echo=FALSE} +rowRanges(y) <- tss +``` + +In the following, we test for changes in allelic imbalance within +sample that correlate with a covariate `time`. + +```{r} +y <- labelKeep(y) +y <- swish(y, x="allele", pair="sample", cov="time", cor="pearson") +``` + +Note the first two features have small q-values and opposite test +statistic; here the test statistic is the average Pearson correlation +of the allelic log fold change with the `time` variable, averaging +over bootstrap replicates. + +```{r} +mcols(y)[1:2,c("stat","qvalue")] +``` + +For plotting inferential replicates over a continuous variable, we +must first compute summary statistics of inferential mean and +variance: + +```{r} +y <- computeInfRV(y) +``` + +Now we can examine the allelic counts across the `time` variable: + +```{r fig.dim=c(7,7)} +par(mfrow=c(2,1), mar=c(2.5,4,2,2)) +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01, xaxis=FALSE, xlab="", main="") +par(mar=c(4.5,4,0,2)) +plotInfReps(y, idx=2, x="time", cov="allele", shiftX=.01, main="") +``` + +With a little more code, we can add a `lowess` line for each series: + +```{r fig.dim=c(7,5)} +plotInfReps(y, idx=1, x="time", cov="allele", shiftX=.01) +dat <- data.frame( + time = y$time[1:10], + a2 = assay(y, "mean")[1,y$allele=="a2"], + a1 = assay(y, "mean")[1,y$allele=="a1"]) +lines(lowess(dat[,c(1,2)]), col="dodgerblue") +lines(lowess(dat[,c(1,3)]), col="goldenrod4") +``` + +Visualizing the allelic proportion in a heatmap helps to see +relationships with the `time` variable, while also showing data from +multiple features at once: + +```{r fig.dim=c(8,4)} +idx <- c(1:4) +row_dat <- data.frame(minusLogQ=-log10(mcols(y)$qvalue[idx]), + row.names=rownames(y)[idx]) +col_dat <- data.frame(time=y$time[1:10], + row.names=paste0("s",1:10)) +plotAllelicHeatmap(y, idx=idx, + annotation_row=row_dat, + annotation_col=col_dat) +``` + +Finally, by binning the `time` covariate into a few groups, we can +again draw the allelic and isoform proportions in the genomic context, +now facetting across time. + +First we create the binned covariate using `cut`, and rename the +labels for nicer labels in our plot: + +```{r} +y$time_bins <- cut(y$time,breaks=c(0,.25,.75,1), + include.lowest=TRUE, labels=FALSE) +y$time_bins <- paste0("time-",y$time_bins) +table(y$time_bins[ y$allele == "a2" ]) +``` + +We can then make our facetted allelic proportion plot: + +```{r fig.dim=c(8,7)} +gene <- rowRanges(y)$gene_id[1] +plotAllelicGene(y, gene, edb, cov="time_bins", + qvalue=FALSE, log2FC=FALSE) +``` + +If we also want to visualize how isoform proportions may be changing, +we can add `covFacetIsoform=TRUE`, which additionally facets the +isoform proportion plot by the covariate: + +```{r fig.dim=c(8,7)} +plotAllelicGene(y, gene, edb, cov="time_bins", + covFacetIsoform=TRUE, + qvalue=FALSE, log2FC=FALSE) +``` + +For further questions about the SEESAW steps, please post to one of +these locations: + +* Bioconductor support site and use + the tag `fishpond` or `swish` +* GitHub Issue + +# Session info + +```{r} +sessionInfo() +``` Binary files /tmp/tmp2sjhyil9/ls2zZw_ys0/r-bioc-fishpond-2.0.1+ds/vignettes/images/SEESAW.png and /tmp/tmp2sjhyil9/EGIFf3Vtyl/r-bioc-fishpond-2.2.0+ds/vignettes/images/SEESAW.png differ diff -Nru r-bioc-fishpond-2.0.1+ds/vignettes/swish.Rmd r-bioc-fishpond-2.2.0+ds/vignettes/swish.Rmd --- r-bioc-fishpond-2.0.1+ds/vignettes/swish.Rmd 2021-10-26 19:25:38.000000000 +0000 +++ r-bioc-fishpond-2.2.0+ds/vignettes/swish.Rmd 2022-04-26 18:25:58.000000000 +0000 @@ -1,7 +1,6 @@ --- -title: "DTE and DGE with inferential replicates" +title: "Swish: differential expression accounting for inferential uncertainty" date: "`r format(Sys.Date(), '%m/%d/%Y')`" -author: "Anqi Zhu, Avi Srivastava, Joseph Ibrahim, Rob Patro, Michael Love" output: rmarkdown::html_document: highlight: tango @@ -9,7 +8,7 @@ toc_float: true bibliography: library.bib vignette: | - %\VignetteIndexEntry{DTE and DGE with inferential replicates} + %\VignetteIndexEntry{1. Swish: DE analysis accounting for inferential uncertainty} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -41,9 +40,6 @@ interaction test for matched samples, to see if a condition effect changes in magnitude across two groups of samples. -**Acknowledgments:** We have benefited in the development of *Swish* -from the feedback of Hirak Sarkar and Scott Van Buren. - # Quick start The following lines of code will perform a basic transcript-level @@ -55,18 +51,19 @@ # 'coldata.csv': sample information table coldata <- read.csv("coldata.csv") library(tximeta) -y <- tximeta(coldata) # reads in counts -library(swish) +y <- tximeta(coldata) # reads in counts and inf reps +library(fishpond) y <- scaleInfReps(y) # scales counts -y <- labelKeep(y) # labels genes to keep +y <- labelKeep(y) # labels features to keep set.seed(1) y <- swish(y, x="condition") # simplest Swish case ``` **Note:** Inferential replicates, either from Gibbs sampling or -bootstrapping of reads, are required for the *swish* method. You can -set `--numGibbsSamples 20` when running Salmon to generate Gibbs -samples. +bootstrapping of reads, are required for the *swish* method. +When running Salmon, you can set `--numGibbsSamples 30` to generate +Gibbs samples, or `--numBootstraps 30` to generate bootstrap samples +(we typically recommend 20-30 inferential replicates). The results can be found in `mcols(y)`. For example, one can calculate the number of genes passing a 5% FDR threshold: @@ -226,11 +223,11 @@ We will rename our *SummarizedExperiment* `y` for the statistical analysis. For speed of the vignette, we subset to the transcripts on -chromosome 1. +chromosome 4. ```{r} y <- se -y <- y[seqnames(y) == "chr1",] +y <- y[seqnames(y) == "chr4",] ``` To demonstrate a two group comparison, we subset to the "naive" and @@ -247,7 +244,7 @@ that log2 fold changes are reported as the non-reference level over the reference level. By default, R will choose a *reference level* for factors based on alphabetical order, unless `levels` are -explicitly set. It is recommended to set the factors levels, as in +explicitly set. It is recommended to set the factors levels, as in the above code chunk, with the reference level coming first in the character vector, so that log2 fold changes correspond to the comparison of interest. @@ -263,23 +260,23 @@ permutations, to obtain identical results, one needs to set a random seed before running `swish()`, as we do below. -The default number of permutations in `swish` is -`nperms=100`. However, for paired datasets as this one, you may have -fewer maximum permutations. In this case, there are 64 possible ways -to switch the condition labels for six pairs of samples. We can set -the `nperms` manually (or if we had just used the default value, -`swish` would set `nperms` to the maximum value possible and notify -the user that it had done so). - ```{r results="hide", message=FALSE} library(fishpond) y <- scaleInfReps(y) y <- labelKeep(y) y <- y[mcols(y)$keep,] set.seed(1) -y <- swish(y, x="condition", pair="line", nperms=64) +y <- swish(y, x="condition", pair="line") ``` +The default number of permutations for computing p-values is +`nperms=100`. However, for paired datasets as this one, you may have +fewer maximum permutations. In this case, there are 64 possible ways +to switch the condition labels for six pairs of samples. We can set +the `nperms` manually to 64, or if we had just used the default value, +`swish` would set `nperms` to the maximum value possible and notify +the user that it had done so. + A note about `labelKeep`: by default we keep features with `minN=3` samples with a minimal count of 10. For scRNA-seq data with de-duplicated UMI counts, we recommend to lower the count, e.g. a @@ -410,12 +407,12 @@ We can also run swish at the gene level. First we summarize all of the data to the gene level, using the `summarizeToGene` function from *tximeta*. Again, we rename the object for statistical analysis, and -then we subset to the genes on chromosome 1 for the demonstration. +then we subset to the genes on chromosome 4 for the demonstration. ```{r} gse <- summarizeToGene(se) gy <- gse -gy <- gy[seqnames(gy) == "chr1",] +gy <- gy[seqnames(gy) == "chr4",] ``` Two demonstrate a two group comparison, we subset to the "naive" and @@ -434,7 +431,7 @@ gy <- labelKeep(gy) gy <- gy[mcols(gy)$keep,] set.seed(1) -gy <- swish(gy, x="condition", pair="line", nperms=64) +gy <- swish(gy, x="condition", pair="line") ``` As before, the number of genes in a 1% FDR set: @@ -525,8 +522,7 @@ ```{r} # run on the transcript-level dataset iso <- isoformProportions(y) -# `nperms=64` here is atypical, usually not specified, see note above -iso <- swish(iso, x="condition", pair="line", nperms=64) +iso <- swish(iso, x="condition", pair="line") ``` ```{r eval=FALSE, echo=FALSE} @@ -681,7 +677,7 @@ plotInfReps(y2, idx[2], x="ifng", cov="salmonella") ``` -# Correlation with continuous variables +# Correlation test *Swish* now has methods to compute correlations (`"spearman"` or `"pearson"`) of a continuous variable `x` with the @@ -737,7 +733,7 @@ `tximeta` to add the gene range information and other metadata if working with human, mouse, or fruit fly. -```{r} +```{r eval=FALSE} dir <- system.file("extdata", package="tximportData") files <- file.path(dir,"alevin/neurons_900_v014/alevin/quants_mat.gz") neurons <- tximeta(files, type="alevin", @@ -753,7 +749,7 @@ [Orchestrating Single-Cell Analysis with Bioconductor](https://osca.bioconductor.org/) [@Amezquita2020]. -```{r} +```{r eval=FALSE} library(SingleCellExperiment) sce <- as(neurons, "SingleCellExperiment") sce$cluster <- factor(paste0("cl",sample(1:6,ncol(sce),TRUE))) @@ -764,8 +760,8 @@ reorder=FALSE) ``` -`plotInfReps` has a number of options and convenience arguments. One -can: +`plotInfReps` has a number of options and convenience arguments for +visualizing single cell data. One can: * add a `legend`, * `reorder` the cells by expression value within cluster, @@ -784,7 +780,7 @@ example with $n \in [1,150)$ or $[150,400)$, more visual elements per cell will be included: -```{r echo=FALSE} +```{r echo=FALSE, eval=FALSE} par(mfrow=c(2,1), mar=c(2,4,2,1)) plotInfReps(sce[,1:50], "ENSMUSG00000072235.6", x="cluster") plotInfReps(sce[,1:150], "ENSMUSG00000072235.6", x="cluster") @@ -897,21 +893,25 @@ ## Analysis types supported by Swish -There are currently five types of analysis supported by `swish`: +There are currently seven types of analysis supported by `swish`: * Two group analysis * Two groups with two or more batches * Two group paired or matched samples * Two condition x two group paired samples, interaction test * Two condition x two group samples, not paired, interaction test +* Correlation test of expression with continuous x +* Correlation test of LFC for paired samples with continuous covariate This vignette demonstrated the third in this list, but the others can be run by either not specifying any additional covariates, or by specifying a batch variable with the argument `cov` instead of `pair`. The two interaction tests can be run by specifying `interaction=TRUE` -and providing `x`, `cov`, and optionally `pair`. +and providing `x`, `cov`, and optionally `pair`. The last two tests +can be run using the `cor` argument +(see [Correlation test](#correlation-test) section in this vignette). -## Accounting for estimated / continuous batch effects +## Accounting for continuous variables We have two recommended approaches for using `swish` in combination with estimated batch effects, e.g. factors of unwanted variation @@ -936,7 +936,7 @@ matrices, and these contain both sampling and additional inferential variance, we are able to track how scaling the counts affects the variance, and this informs the test statistic. - + For the second approach, one can directly scale the inferential counts using *limma* and `removeBatchEffect`. @@ -1003,6 +1003,10 @@ y <- swish(y, x="condition") ``` +While we have assessed this approach with a small number of +estimated nuisance covariates, note that with many covariates this +approach may lead to inflation of test statistics. + ## Structure of tximeta output While `tximeta` is the safest way to provide the correct input to @@ -1063,7 +1067,7 @@ ```{r} y3 <- se -y3 <- y3[seqnames(y3) == "chr1",] +y3 <- y3[seqnames(y3) == "chr4",] y3 <- y3[,y3$condition %in% c("naive","IFNg")] y3 <- labelKeep(y3) y3 <- y3[mcols(y3)$keep,]