Introduction

Joint RNA and ATAC analysis using 10x multiomic technology. This workflow is adapted from the Signac guide.

Source code: TBA.

There are datasets in two batches, April and August.

library("irlba")
## Loading required package: Matrix
library("Signac")
## Signac built for for SeuratObject v4 was just loaded with SeuratObject
## v5; disabling v5 assays and validation routines, and ensuring assays
## work in strict v3/v4 compatibility mode
library("Seurat")
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library("hdf5r")
library("EnsDb.Hsapiens.v86")
## Loading required package: ensembldb
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:hdf5r':
## 
##     values
## The following objects are masked from 'package:Matrix':
## 
##     expand, unname
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomeInfoDb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
## 
##     filter
library("BSgenome.Hsapiens.UCSC.hg38")
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: BiocIO
## Loading required package: rtracklayer
## 
## Attaching package: 'rtracklayer'
## The following object is masked from 'package:BiocIO':
## 
##     FileForFormat
library("glmGamPoi")
library("SeuratDisk") #devtools::install_github("mojaveazure/seurat-disk")
## Registered S3 method overwritten by 'SeuratDisk':
##   method            from  
##   as.sparse.H5Group Seurat
library("qlcMatrix") #devtools::install_github("cysouw/qlcMatrix")
## Loading required package: slam
## Loading required package: sparsesvd
## 
## Attaching package: 'qlcMatrix'
## The following objects are masked from 'package:Biobase':
## 
##     rowMax, rowMin

Load data

Load the RNA and ATAC data.

Annotate genes by organism.

counts <- Read10X_h5("FLW_april/outs/filtered_feature_bc_matrix.h5")
## Genome matrix has multiple modalities, returning a list of matrices for this genome
fragpath <- "FLW_april/outs/atac_fragments.tsv.gz"

Annotate genes by organism.

#Mycoplasma pneumoniae chromosome
MPstart="gene-AXA72_00005"
MPend="gene-AXA72_03910"
MPgenes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == MPstart) : which(rownames(counts$`Gene Expression`) == MPend)]
MPstart="dnaA"
MPend="gene-EXC43_RS04465"
MPplas1genes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == MPstart) : which(rownames(counts$`Gene Expression`) == MPend)]
MPstart="dnaN"
MPend="gene-EXC43_RS04315"
MPplas2genes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == MPstart) : which(rownames(counts$`Gene Expression`) == MPend)]
MPgenes_full <- unique(c(MPgenes,MPplas1genes,MPplas2genes))
str(MPgenes_full)
##  chr [1:1566] "gene-AXA72_00005" "gene-AXA72_00010" "gyrB" ...
table(colSums(as.data.frame(counts$`Gene Expression`[rownames(counts$`Gene Expression`) %in% MPgenes_full,])))
## 
##     0     1 
## 12287     3
#lambda
Lstart="nu1"
Lend="orf206b"
Lgenes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == Lstart) : which(rownames(counts$`Gene Expression`) == Lend)]
str(Lgenes)
##  chr [1:92] "nu1" "A" "W" "B" "C" "nu3" "D" "E" "Fi" "Fii" "Z" "U" "V" "G" ...
table(colSums(as.data.frame(counts$`Gene Expression`[rownames(counts$`Gene Expression`) %in% Lgenes,])))
## 
##     0 
## 12290
#HHV5
H5start="RL10"
H5end="RL8A"
H5genes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == H5start) : which(rownames(counts$`Gene Expression`) == H5end)]
str(H5genes)
##  chr [1:180] "RL10" "RL11" "RL12" "RL13" "UL1" "UL4" "UL5" "UL6" "UL7" ...
table(colSums(as.data.frame(counts$`Gene Expression`[rownames(counts$`Gene Expression`) %in% H5genes,])))
## 
##     0 
## 12290
#Human gammaherpesvirus 4 (EBV)
Estart="BNRF1"
Eend="LMP-1"
Egenes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == Estart) : which(rownames(counts$`Gene Expression`) == Eend)]
str(Egenes)
##  chr [1:77] "BNRF1" "BORF1" "BORF2" "BRRF1" "BRRF2" "BSRF1" "BTRF1" "BVRF1" ...
table(colSums(as.data.frame(counts$`Gene Expression`[rownames(counts$`Gene Expression`) %in% Egenes,])))
## 
##     0 
## 12290
#Human gene expression
HUstart="MIR1302-2HG"
HUend="LINC00266-4P"
HUgenes <- rownames(counts$`Gene Expression`)[which(rownames(counts$`Gene Expression`) == HUstart) : which(rownames(counts$`Gene Expression`) == HUend)]
str(HUgenes)
##  chr [1:36572] "MIR1302-2HG" "OR4F5" "AL627309.4" "AP006222.2" "AL732372.1" ...
mycolsums <- colSums(as.data.frame(counts$`Gene Expression`[rownames(counts$`Gene Expression`) %in% HUgenes,]))
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 3.3 GiB
str(mycolsums)
##  Named num [1:12290] 1733 2753 7394 1390 15216 ...
##  - attr(*, "names")= chr [1:12290] "AAACAGCCACATTAAC-1" "AAACATGCAATAATCC-1" "AAACATGCACCGGTAT-1" "AAACATGCATAATGTC-1" ...
summary(mycolsums)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      49    1549    2384    4479    4048  854493

Get annotations

annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)

## Warning in .merge_two_Seqinfo_objects(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
seqlevels(annotation) <- paste0('chr', seqlevels(annotation))
annotation
## GRanges object with 3021151 ranges and 5 metadata columns:
##                   seqnames        ranges strand |           tx_id   gene_name
##                      <Rle>     <IRanges>  <Rle> |     <character> <character>
##   ENSE00001489430     chrX 276322-276394      + | ENST00000399012      PLCXD1
##   ENSE00001536003     chrX 276324-276394      + | ENST00000484611      PLCXD1
##   ENSE00002160563     chrX 276353-276394      + | ENST00000430923      PLCXD1
##   ENSE00001750899     chrX 281055-281121      + | ENST00000445062      PLCXD1
##   ENSE00001489388     chrX 281192-281684      + | ENST00000381657      PLCXD1
##               ...      ...           ...    ... .             ...         ...
##   ENST00000361739    chrMT     7586-8269      + | ENST00000361739      MT-CO2
##   ENST00000361789    chrMT   14747-15887      + | ENST00000361789      MT-CYB
##   ENST00000361851    chrMT     8366-8572      + | ENST00000361851     MT-ATP8
##   ENST00000361899    chrMT     8527-9207      + | ENST00000361899     MT-ATP6
##   ENST00000362079    chrMT     9207-9990      + | ENST00000362079      MT-CO3
##                           gene_id   gene_biotype     type
##                       <character>    <character> <factor>
##   ENSE00001489430 ENSG00000182378 protein_coding     exon
##   ENSE00001536003 ENSG00000182378 protein_coding     exon
##   ENSE00002160563 ENSG00000182378 protein_coding     exon
##   ENSE00001750899 ENSG00000182378 protein_coding     exon
##   ENSE00001489388 ENSG00000182378 protein_coding     exon
##               ...             ...            ...      ...
##   ENST00000361739 ENSG00000198712 protein_coding      cds
##   ENST00000361789 ENSG00000198727 protein_coding      cds
##   ENST00000361851 ENSG00000228253 protein_coding      cds
##   ENST00000361899 ENSG00000198899 protein_coding      cds
##   ENST00000362079 ENSG00000198938 protein_coding      cds
##   -------
##   seqinfo: 25 sequences (1 circular) from GRCh38 genome

Create a Seurat object

It will contain RNA and ATAC data.

pbmc <- CreateSeuratObject( counts = counts$`Gene Expression`, assay = "RNA")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
head(pbmc)
##                       orig.ident nCount_RNA nFeature_RNA
## AAACAGCCACATTAAC-1 SeuratProject       1733         1134
## AAACATGCAATAATCC-1 SeuratProject       2753         1398
## AAACATGCACCGGTAT-1 SeuratProject       7394         2904
## AAACATGCATAATGTC-1 SeuratProject       1390          942
## AAACATGCATACTCCT-1 SeuratProject      15216         4875
## AAACATGCATGAGCAG-1 SeuratProject       1930         1032
## AAACATGCATTATGGT-1 SeuratProject       1465          849
## AAACATGCATTCCTCG-1 SeuratProject       3956         1811
## AAACATGCATTGTGTG-1 SeuratProject       2055         1024
## AAACCAACAATTAAGG-1 SeuratProject       1860         1116
pbmc[["ATAC"]] <- CreateChromatinAssay(counts = counts$Peaks,sep = c(":", "-"),
  fragments = fragpath,annotation = annotation)
## Computing hash
pbmc
## An object of class Seurat 
## 164481 features across 12290 samples within 2 assays 
## Active assay: RNA (38544 features, 0 variable features)
##  2 layers present: counts, data
##  1 other assay present: ATAC

ATAC Quality control

Compute per-cell quality metrics using DNA accessibility data.

DefaultAssay(pbmc) <- "ATAC"

pbmc <- NucleosomeSignal(pbmc)
pbmc <- TSSEnrichment(pbmc)
## Extracting TSS positions
## Extracting fragments at TSSs
## 
## Computing TSS enrichment score
VlnPlot(
  object = pbmc,
  features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
  ncol = 4,
  pt.size = 0
)

Remove cells that are outliers for these metrics, as well as cells with low or unusually high counts for either the RNA or ATAC assay.

pbmc <- subset(
  x = pbmc,
  subset = nCount_ATAC < 100000 &
    nCount_RNA < 25000 &
    nCount_ATAC > 1000 &
    nCount_RNA > 1000 &
    nucleosome_signal < 2 &
    TSS.enrichment > 1
)
pbmc
## An object of class Seurat 
## 164481 features across 10462 samples within 2 assays 
## Active assay: ATAC (125937 features, 0 variable features)
##  2 layers present: counts, data
##  1 other assay present: RNA

Peak calling

Using MACS2. We will remove peaks on nonstandard chromosomes and in genomic blacklist regions. Then quantify counts in each peak and create a new assay using the MACS2 peak set and add it to the Seurat object

peaks <- CallPeaks(pbmc)

peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")
peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE)

macs2_counts <- FeatureMatrix(
  fragments = Fragments(pbmc),
  features = peaks,
  cells = colnames(pbmc)
)
## Extracting reads overlapping genomic regions
pbmc[["peaks"]] <- CreateChromatinAssay(
  counts = macs2_counts,
  fragments = fragpath,
  annotation = annotation
)
## Computing hash

Gene expression data processing

We can normalize the gene expression data using SCTransform, and reduce the dimensionality using PCA.

DefaultAssay(pbmc) <- "RNA"
pbmc <- SCTransform(pbmc)
## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 24389 by 10462
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 187 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 24389 genes
## Computing corrected count matrix for 24389 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 30.70272 secs
## Determine variable features
## Centering data matrix
## Place corrected count matrix in counts slot
## Set default assay to SCT
pbmc <- RunPCA(pbmc)
## PC_ 1 
## Positive:  VCAN, ZEB2, DPYD, PLXDC2, SLC8A1, NEAT1, LRMDA, CD36, PLCB1, AOAH 
##     PID1, LYST, DMXL2, FCN1, CSF3R, LRRK2, HDAC9, ARHGAP26, MCTP1, VMP1 
##     IRAK3, RBM47, TBXAS1, TMTC2, RTN1, WDFY3, PICALM, LYN, ZSWIM6, LYZ 
## Negative:  BACH2, SKAP1, LEF1, CAMK4, B2M, BCL11B, ANK3, INPP4B, BANK1, PRKCH 
##     BCL2, ARHGAP15, CD247, MAML2, NELL2, AFF3, THEMIS, TSHZ2, IL7R, TC2N 
##     RPS27, RORA, CD96, RPL13A, PCED1B, KLF12, PDE7A, ETS1, HLA-B, SYNE2 
## PC_ 2 
## Positive:  BANK1, AFF3, RALGPS2, IGHM, IGKC, EBF1, CD74, MS4A1, FCRL1, OSBPL10 
##     COL19A1, PAX5, SOX5, HLA-DRA, CCSER1, BLK, BACH2, CDK14, LINC00926, COBLL1 
##     ARHGAP24, KHDRBS2, CD79A, ADAM28, AUTS2, AP002075.1, PLEKHG1, GPM6A, STEAP1B, IGLC2 
## Negative:  PRKCH, LEF1, CAMK4, BCL11B, CD247, INPP4B, THEMIS, SKAP1, IL7R, PDE3B 
##     NELL2, FYB1, TC2N, RORA, PRKCA, TNIK, MAML2, TSHZ2, NCALD, ITK 
##     TXK, PLCL1, AAK1, STAT4, SYNE2, PITPNC1, CD96, SERINC5, DPYD, LINC01934 
## PC_ 3 
## Positive:  LEF1, MAML2, CAMK4, FOXP1, TSHZ2, BACH2, FHIT, VCAN, NELL2, INPP4B 
##     PRKCA, ANK3, PLCL1, DPYD, ARHGAP15, PDE3B, BCL11B, SERINC5, BCL2, AL589693.1 
##     PDE7A, CSGALNACT1, MLLT3, DOCK10, TCF7, AC139720.1, IL6ST, IL7R, THEMIS, IGF1R 
## Negative:  AC105402.3, GNLY, NKG7, CCL5, PRF1, TGFBR3, KLRD1, KLRF1, PDGFD, ACTB 
##     TMSB4X, B2M, BNC2, FCGR3A, HLA-C, GZMA, C1orf21, GZMH, FGFBP2, NCALD 
##     PPP2R2B, MYBL1, CTSW, GZMB, IL2RB, NCAM1, SPON2, TCF7L2, ADGRG1, DTHD1 
## PC_ 4 
## Positive:  GNLY, NKG7, BNC2, TGFBR3, PDGFD, KLRD1, KLRF1, PRF1, NCALD, TOX 
##     PPP2R2B, MCTP2, C1orf21, NCAM1, SYNE1, SAMD3, PYHIN1, MYBL1, CCL5, IKZF2 
##     SPON2, CD247, IL2RB, LINGO2, VCAN, FGFBP2, LINC00299, PRKCH, GZMA, AL392086.3 
## Negative:  AC105402.3, RPS27, RPL13A, EEF1A1, RPS12, RPS3A, RPL32, RPL19, RPL13, RPL28 
##     RPLP2, RPL23A, RPS8, RPS18, RPS25, RPL41, TCF7L2, TPT1, RPL10, RPL31 
##     RPS15A, RPL21, RPS23, RPS11, RPS20, RPS14, RPL18A, RPS15, RPL30, RPL23 
## PC_ 5 
## Positive:  TCF4, LINC01374, AC023590.1, LINC01478, FAM160A1, RHEX, EPHB1, CUX2, ZFAT, BX284613.2 
##     TCF7L2, UGCG, PDE4B, LINC00996, COL24A1, CCDC50, PTPRS, SCN9A, PLXNA4, RUNX2 
##     NRP1, NEGR1, DACH1, SCN1A-AS1, SLC41A2, FLT3, CLEC4C, RBMS3, SLC35F3, COL26A1 
## Negative:  VCAN, BANK1, PLCB1, IGHM, ARHGAP24, IGKC, DPYD, RALGPS2, CSF3R, CD36 
##     AC105402.3, DYSF, EBF1, VCAN-AS1, ACSL1, MEGF9, FCRL1, MS4A1, ARHGAP26, PDE4D 
##     COL19A1, RPS27, PAX5, NRG1, LRRK2, SLC2A3, BACH2, RPL13A, OSBPL10, CREB5

DNA accessibility data processing

Here we process the DNA accessibility assay the same way we would process a scATAC-seq dataset, by performing latent semantic indexing (LSI).

DefaultAssay(pbmc) <- "peaks"
pbmc <- FindTopFeatures(pbmc, min.cutoff = 5)
pbmc <- RunTFIDF(pbmc)
## Performing TF-IDF normalization
## Warning in RunTFIDF.default(object = GetAssayData(object = object, slot =
## "counts"), : Some features contain 0 total counts
pbmc <- RunSVD(pbmc)
## Running SVD
## Scaling cell embeddings

Annotating cell types

To annotate cell types in the dataset we can transfer cell labels from an existing PBMC reference dataset using tools in the Seurat package. See the Seurat reference mapping vignette for more information.

We’ll use an annotated PBMC reference dataset from Hao et al. (2020), available for download.

Note that the SeuratDisk package is required to load the reference dataset. Installation instructions for SeuratDisk can be found here.

Load reference data.

options(timeout=10000)

if (! file.exists("pbmc_multimodal.h5seurat")) {
download.file("https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat",
  destfile="pbmc_multimodal.h5seurat")
}

reference <- LoadH5Seurat("pbmc_multimodal.h5seurat", assays = list("SCT" = "counts"), reductions = 'spca')
## Validating h5Seurat file
## Initializing SCT with counts
## Adding variable feature information for SCT
## Adding miscellaneous information for SCT
## Adding reduction spca
## Adding cell embeddings for spca
## Adding feature loadings for spca
## Adding miscellaneous information for spca
## Adding graph wknn
## Adding graph wsnn
## Adding command information
## Adding cell-level metadata
reference <- UpdateSeuratObject(reference)
## Validating object structure
## Updating object slots
## Ensuring keys are in the proper structure
## Updating matrix keys for DimReduc 'spca'
## Ensuring keys are in the proper structure
## Ensuring feature names don't have underscores or pipes
## Updating slots in SCT
## Updating slots in wknn
## Cannot find wknn in the object, setting default assay of wknn to SCT
## Updating slots in wsnn
## Cannot find wsnn in the object, setting default assay of wsnn to SCT
## Updating slots in spca
## Validating object structure for SCTAssay 'SCT'
## Validating object structure for Graph 'wknn'
## Validating object structure for Graph 'wsnn'
## Validating object structure for DimReduc 'spca'
## Object representation is consistent with the most current Seurat version
DefaultAssay(pbmc) <- "SCT"

# transfer cell type labels from reference to query
transfer_anchors <- FindTransferAnchors(
  reference = reference,
  query = pbmc,
  normalization.method = "SCT",
  reference.reduction = "spca",
  recompute.residuals = FALSE,
  dims = 1:50
)
## Projecting cell embeddings
## Counts matrix provided is not sparse; vreating v5 assay in Seurat object
## Finding neighborhoods
## Finding anchors
##  Found 9413 anchors
predictions <- TransferData(
  anchorset = transfer_anchors,
  refdata = reference$celltype.l2,
  weight.reduction = pbmc[['pca']],
  dims = 1:50
)
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
pbmc <- AddMetaData(
  object = pbmc,
  metadata = predictions
)

# set the cell identities to the cell type predictions
Idents(pbmc) <- "predicted.id"

Joint UMAP visualization

Using the weighted nearest neighbor methods in Seurat v4, we can compute a joint neighbor graph that represent both the gene expression and DNA accessibility measurements.

# build a joint neighbor graph using both assays
pbmc <- FindMultiModalNeighbors(
  object = pbmc,
  reduction.list = list("pca", "lsi"),
  dims.list = list(1:50, 2:40),
  modality.weight.name = "RNA.weight",
  verbose = TRUE
)
## Calculating cell-specific modality weights
## Finding 20 nearest neighbors for each modality.
## Calculating kernel bandwidths
## Warning in FindMultiModalNeighbors(object = pbmc, reduction.list = list("pca",
## : The number of provided modality.weight.name is not equal to the number of
## modalities. SCT.weight peaks.weight are used to store the modality weights
## Finding multimodal neighbors
## Constructing multimodal KNN graph
## Constructing multimodal SNN graph
# build a joint UMAP visualization
pbmc <- RunUMAP(
  object = pbmc,
  nn.name = "weighted.nn",
  assay = "RNA",
  verbose = TRUE
)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:39:47 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 13:39:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 13:39:51 Initializing from normalized Laplacian + noise (using RSpectra)
## 13:39:51 Commencing optimization for 200 epochs, with 326568 positive edges
## 13:39:55 Optimization finished
DimPlot(pbmc, label = TRUE, repel = TRUE, reduction = "umap") + NoLegend()

Linking peaks to genes

For each gene, we can find the set of peaks that may regulate the gene by by computing the correlation between gene expression and accessibility at nearby peaks, and correcting for bias due to GC content, overall accessibility, and peak size. See the Signac paper for a full description of the method we use to link peaks to genes.

Running this step on the whole genome can be time consuming, so here we demonstrate peak-gene links for a subset of genes as an example. The same function can be used to find links for all genes by omitting the genes.use parameter.

DefaultAssay(pbmc) <- "peaks"

# first compute the GC content for each peak
pbmc <- RegionStats(pbmc, genome = BSgenome.Hsapiens.UCSC.hg38)

# link peaks to genes
pbmc <- LinkPeaks(
  object = pbmc,
  peak.assay = "peaks",
  expression.assay = "SCT"
)
## Testing 13132 genes and 120708 peaks
## Found gene coordinates for 11040 genes
## Warning in .merge_two_Seqinfo_objects(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': chrM
##   - in 'y': chrY, chrMT
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).
## 'as(<dgCMatrix>, "dgTMatrix")' is deprecated.
## Use 'as(., "TsparseMatrix")' instead.
## See help("Deprecated") and help("Matrix-deprecated").

We can visualize these links using the CoveragePlot() function, or alternatively we could use the CoverageBrowser() function in an interactive analysis.

idents.plot <- c("B naive", "B intermediate", "B memory",
                 "CD14 Mono", "CD16 Mono", "CD8 TEM", "CD8 Naive")

p1 <- CoveragePlot(
  object = pbmc,
  region = "MS4A1",
  features = "MS4A1",
  expression.assay = "SCT",
  idents = idents.plot,
  extend.upstream = 500,
  extend.downstream = 10000
)

p2 <- CoveragePlot(
  object = pbmc,
  region = "LYZ",
  features = "LYZ",
  expression.assay = "SCT",
  idents = idents.plot,
  extend.upstream = 8000,
  extend.downstream = 5000
)

patchwork::wrap_plots(p1, p2, ncol = 1)

Gene activity - looking for pathogen gene expression

gene.activities <- GeneActivity(pbmc)
## Extracting gene coordinates
## Extracting reads overlapping genomic regions
# add the gene activity matrix to the Seurat object as a new assay and normalize it
pbmc[['RNA']] <- CreateAssayObject(counts = gene.activities)
## Warning: Different cells and/or features from existing assay RNA
pbmc <- NormalizeData(
  object = pbmc,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(pbmc$nCount_RNA)
)

Now we can visualize the activities of canonical marker genes.

DefaultAssay(pbmc) <- 'RNA'

FeaturePlot(
  object = pbmc,
  features = c('MS4A1', 'CD3D', 'LEF1', 'NKG7', 'TREM1', 'LYZ'),
  pt.size = 0.1,
  max.cutoff = 'q95',
  ncol = 3
)

Now we can visualize the activities of pathogen genes.

CP014267.1 = Mycoplasma pneumoniae chromosome

NZ_LR214945.1 = Mycoplasmoides pneumoniae strain NCTC10119 plasmid 1

NZ_LR214946.1 = Mycoplasmoides pneumoniae strain NCTC10119 plasmid 2

NC_001416.1 = lambda phage

NC_006273.2 = Human herpesvirus 5

NC_007605.1 = Human gammaherpesvirus 4

FeaturePlot(
  object = pbmc,
  features = c('MS4A1', 'CD3D', 'LEF1', 'NKG7', 'TREM1', 'LYZ'),
  pt.size = 0.1,
  max.cutoff = 'q95',
  ncol = 3
)

Save data

save.image("scmulti.Rdata")

Session information

Show versions of packages and key dependancies.

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] qlcMatrix_0.9.7                   sparsesvd_0.2-2                  
##  [3] slam_0.1-50                       SeuratDisk_0.0.0.9021            
##  [5] glmGamPoi_1.14.0                  BSgenome.Hsapiens.UCSC.hg38_1.4.5
##  [7] BSgenome_1.70.1                   rtracklayer_1.62.0               
##  [9] BiocIO_1.12.0                     Biostrings_2.70.1                
## [11] XVector_0.42.0                    EnsDb.Hsapiens.v86_2.99.0        
## [13] ensembldb_2.26.0                  AnnotationFilter_1.26.0          
## [15] GenomicFeatures_1.54.1            AnnotationDbi_1.64.1             
## [17] Biobase_2.62.0                    GenomicRanges_1.54.1             
## [19] GenomeInfoDb_1.38.5               IRanges_2.36.0                   
## [21] S4Vectors_0.40.2                  BiocGenerics_0.48.1              
## [23] hdf5r_1.3.9                       Seurat_5.0.1                     
## [25] SeuratObject_5.0.1                sp_2.1-3                         
## [27] Signac_1.12.0                     irlba_2.3.5.1                    
## [29] Matrix_1.6-5                     
## 
## loaded via a namespace (and not attached):
##   [1] ProtGenerics_1.34.0         matrixStats_1.2.0          
##   [3] spatstat.sparse_3.0-3       bitops_1.0-7               
##   [5] httr_1.4.7                  RColorBrewer_1.1-3         
##   [7] docopt_0.7.1                tools_4.3.2                
##   [9] sctransform_0.4.1           backports_1.4.1            
##  [11] utf8_1.2.4                  R6_2.5.1                   
##  [13] lazyeval_0.2.2              uwot_0.1.16                
##  [15] withr_3.0.0                 prettyunits_1.2.0          
##  [17] gridExtra_2.3               progressr_0.14.0           
##  [19] cli_3.6.2                   spatstat.explore_3.2-5     
##  [21] fastDummies_1.7.3           labeling_0.4.3             
##  [23] sass_0.4.8                  spatstat.data_3.0-4        
##  [25] ggridges_0.5.6              pbapply_1.7-2              
##  [27] Rsamtools_2.18.0            foreign_0.8-86             
##  [29] dichromat_2.0-0.1           parallelly_1.36.0          
##  [31] rstudioapi_0.15.0           RSQLite_2.3.5              
##  [33] generics_0.1.3              ica_1.0-3                  
##  [35] spatstat.random_3.2-2       dplyr_1.1.4                
##  [37] fansi_1.0.6                 abind_1.4-5                
##  [39] lifecycle_1.0.4             yaml_2.3.8                 
##  [41] SummarizedExperiment_1.32.0 SparseArray_1.2.3          
##  [43] BiocFileCache_2.10.1        Rtsne_0.17                 
##  [45] grid_4.3.2                  blob_1.2.4                 
##  [47] promises_1.2.1              crayon_1.5.2               
##  [49] miniUI_0.1.1.1              lattice_0.22-5             
##  [51] cowplot_1.1.3               KEGGREST_1.42.0            
##  [53] pillar_1.9.0                knitr_1.45                 
##  [55] rjson_0.2.21                future.apply_1.11.1        
##  [57] codetools_0.2-19            fastmatch_1.1-4            
##  [59] leiden_0.4.3.1              glue_1.7.0                 
##  [61] data.table_1.15.0           vctrs_0.6.5                
##  [63] png_0.1-8                   spam_2.10-0                
##  [65] gtable_0.3.4                cachem_1.0.8               
##  [67] xfun_0.41                   S4Arrays_1.2.0             
##  [69] mime_0.12                   survival_3.5-7             
##  [71] RcppRoll_0.3.0              ellipsis_0.3.2             
##  [73] fitdistrplus_1.1-11         ROCR_1.0-11                
##  [75] nlme_3.1-164                bit64_4.0.5                
##  [77] progress_1.2.3              filelock_1.0.3             
##  [79] RcppAnnoy_0.0.22            bslib_0.6.1                
##  [81] KernSmooth_2.23-22          rpart_4.1.23               
##  [83] colorspace_2.1-0            DBI_1.2.1                  
##  [85] Hmisc_5.1-1                 nnet_7.3-19                
##  [87] tidyselect_1.2.0            bit_4.0.5                  
##  [89] compiler_4.3.2              curl_5.2.0                 
##  [91] htmlTable_2.4.2             xml2_1.3.6                 
##  [93] DelayedArray_0.28.0         plotly_4.10.4              
##  [95] checkmate_2.3.1             scales_1.3.0               
##  [97] lmtest_0.9-40               rappdirs_0.3.3             
##  [99] stringr_1.5.1               digest_0.6.34              
## [101] goftest_1.2-3               spatstat.utils_3.0-4       
## [103] rmarkdown_2.25              htmltools_0.5.7            
## [105] pkgconfig_2.0.3             base64enc_0.1-3            
## [107] sparseMatrixStats_1.14.0    MatrixGenerics_1.14.0      
## [109] highr_0.10                  dbplyr_2.4.0               
## [111] fastmap_1.1.1               rlang_1.1.3                
## [113] htmlwidgets_1.6.4           DelayedMatrixStats_1.24.0  
## [115] shiny_1.8.0                 farver_2.1.1               
## [117] jquerylib_0.1.4             zoo_1.8-12                 
## [119] jsonlite_1.8.8              BiocParallel_1.36.0        
## [121] VariantAnnotation_1.48.1    RCurl_1.98-1.14            
## [123] magrittr_2.0.3              Formula_1.2-5              
## [125] GenomeInfoDbData_1.2.11     dotCall64_1.1-1            
## [127] patchwork_1.2.0             munsell_0.5.0              
## [129] Rcpp_1.0.12                 reticulate_1.34.0          
## [131] stringi_1.8.3               zlibbioc_1.48.0            
## [133] MASS_7.3-60.0.1             plyr_1.8.9                 
## [135] parallel_4.3.2              listenv_0.9.1              
## [137] ggrepel_0.9.5               deldir_2.0-2               
## [139] splines_4.3.2               tensor_1.5                 
## [141] hms_1.1.3                   igraph_2.0.1.1             
## [143] spatstat.geom_3.2-8         RcppHNSW_0.5.0             
## [145] reshape2_1.4.4              biomaRt_2.58.0             
## [147] XML_3.99-0.16.1             evaluate_0.23              
## [149] biovizBase_1.50.0           tweenr_2.0.2               
## [151] httpuv_1.6.14               RANN_2.6.1                 
## [153] tidyr_1.3.1                 purrr_1.0.2                
## [155] polyclip_1.10-6             future_1.33.1              
## [157] scattermore_1.2             ggplot2_3.4.4              
## [159] ggforce_0.4.1               xtable_1.8-4               
## [161] restfulr_0.0.15             RSpectra_0.16-1            
## [163] later_1.3.2                 viridisLite_0.4.2          
## [165] tibble_3.2.1                memoise_2.0.1.9000         
## [167] GenomicAlignments_1.38.1    cluster_2.1.6              
## [169] globals_0.16.2