There are 3 samples.
1-17032023-GEX
2-09032023-GEX
3-13032023-GEX
I had to make a new custom reference genome with the HIV sequence.
suppressPackageStartupMessages({
library("plyr")
library("Seurat")
library("hdf5r")
library("SingleCellExperiment")
library("parallel")
library("stringi")
library("beeswarm")
library("muscat")
library("DESeq2")
library("mitch")
})
Load the h5 matrices.
counts1 <- Read10X_h5("res_1-17032023-GEX/outs/filtered_feature_bc_matrix.h5")
counts2 <- Read10X_h5("res_2-09032023-GEX/outs/filtered_feature_bc_matrix.h5")
counts3 <- Read10X_h5("res_3-13032023-GEX/outs/filtered_feature_bc_matrix.h5")
Examine human and HIV reads.
HIV is located in the top 2 rows. One fwd strand and one negative strand.
dim(counts1)
## [1] 36603 15757
dim(counts2)
## [1] 36603 11250
dim(counts3)
## [1] 36603 15133
summary(colSums(counts1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 513 6010 8986 11743 15951 154572
summary(colSums(counts2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 503 5736 8758 11493 15742 104864
summary(colSums(counts3))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 501 6090 9762 11451 15394 85917
sum(colSums(counts1))
## [1] 185028635
sum(colSums(counts2))
## [1] 129301813
sum(colSums(counts3))
## [1] 173284461
colnames(counts1) <- gsub("-1","",colnames(counts1))
colnames(counts2) <- gsub("-1","",colnames(counts2))
colnames(counts3) <- gsub("-1","",colnames(counts3))
counts1[1:10,1:5]
## 10 x 5 sparse Matrix of class "dgCMatrix"
## AAACCCAAGAACAAGG AAACCCAAGAATTTGG AAACCCAAGAGTTGCG AAACCCAAGATAGCAT
## gene-HIV1-1 . . . .
## gene-HIV1-2 . . . .
## MIR1302-2HG . . . .
## FAM138A . . . .
## OR4F5 . . . .
## AL627309.1 . . . .
## AL627309.3 . . . .
## AL627309.2 . . . .
## AL627309.5 . . . .
## AL627309.4 . . . .
## AAACCCAAGATTAGAC
## gene-HIV1-1 .
## gene-HIV1-2 .
## MIR1302-2HG .
## FAM138A .
## OR4F5 .
## AL627309.1 .
## AL627309.3 .
## AL627309.2 .
## AL627309.5 .
## AL627309.4 .
head(colSums(counts1))
## AAACCCAAGAACAAGG AAACCCAAGAATTTGG AAACCCAAGAGTTGCG AAACCCAAGATAGCAT
## 7975 7148 6412 5827
## AAACCCAAGATTAGAC AAACCCAAGCTAGTTC
## 6161 18524
Write the cell barcodes to a file for the CITE-Seq-count program.
countbc1 <- colnames(counts1)
writeLines(countbc1,con="cell_barcodes1.txt")
countbc2 <- colnames(counts2)
writeLines(countbc2,con="cell_barcodes2.txt")
countbc3 <- colnames(counts3)
writeLines(countbc3,con="cell_barcodes3.txt")
Now read HTO data.
It is strange that HTO cell barcodes do not appear to be common with the main RNA-seq counts.
hto1 <- Read10X("hto_count/1-17032023-HTO/read_count/", gene.column=1)
hto2 <- Read10X("hto_count/2-09032023-HTO/read_count/", gene.column=1)
hto3 <- Read10X("hto_count/3-13032023-HTO/read_count/", gene.column=1)
dim(hto1)
## [1] 6 15733
hto1[,1:20]
## 6 x 20 sparse Matrix of class "dgCMatrix"
## [[ suppressing 20 column names 'GTGCTGGAGAATGTTG', 'GGCACGTAGTTACGTC', 'CTTCAATGTCGAGCAA' ... ]]
##
## HTO1-GTCAACTCTTTAGCG 84 55 79 58 125 79 102 3926 58 4632 51 2348
## HTO2-TGATGGCCTATTGGG . . . . . . . . . . . .
## HTO3-TTCCGCCTCTCTTTG 2168 25 2005 21 10 2221 12 9 352 108 618 30
## HTO4-AGTAAGTTCAGCGTA 207 5775 163 192 177 83 130 135 97 406 81 126
## HTO5-AAGTATCGTTTCGCA 2359 45 151 60 2784 53 3163 53 52 114 21 128
## unmapped 154 224 112 30 112 133 92 212 27 234 21 141
##
## HTO1-GTCAACTCTTTAGCG 60 4132 66 3036 77 2482 46 64
## HTO2-TGATGGCCTATTGGG . . . . . . . .
## HTO3-TTCCGCCTCTCTTTG 169 33 1625 97 22 25 1507 19
## HTO4-AGTAAGTTCAGCGTA 165 145 203 203 3695 119 77 2534
## HTO5-AAGTATCGTTTCGCA 149 40 73 69 47 93 72 114
## unmapped 24 207 72 137 147 120 44 120
dim(counts1)
## [1] 36603 15757
dim(hto1)
## [1] 6 15733
str(which(colnames(counts1) %in% colnames(hto1)))
## int [1:15733] 1 2 3 4 5 6 7 8 9 10 ...
str(which(colnames(counts2) %in% colnames(hto2)))
## int [1:11241] 1 2 3 4 5 6 7 8 9 10 ...
str(which(colnames(counts3) %in% colnames(hto3)))
## int [1:15119] 1 2 3 4 5 6 7 8 9 10 ...
Previously there appears only to be 10 cell barcodes in common. To fix this, I obtained the cell barcode whitelist after running cellranger. I then used this whitelist for CITE-Seq-count with a tolerance of 2 sequence mismatches.
summary(colSums(hto1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2113 3418 5288 6370 66649
hto1 <- hto1[,which(colSums(hto1)>=100)]
hto2 <- hto2[,which(colSums(hto2)>=100)]
hto3 <- hto3[,which(colSums(hto3)>=100)]
# look at the proportion unmapped
summary(apply(hto1,2,function(x) {x[6]/sum(x) } ) )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.03021 0.03598 0.03624 0.04171 0.10673
summary(apply(hto2,2,function(x) {x[6]/sum(x) } ) )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004274 0.033925 0.038232 0.038896 0.043283 0.151877
summary(apply(hto3,2,function(x) {x[6]/sum(x) } ) )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02888 0.03369 0.03392 0.03803 0.12423
For each cell barcode, calculate the ratio of top BC to 2nd BC.
getratio <- function(mx){
res <- lapply(1:ncol(mx), function(i) {
cnt <- mx[,i]
top1 <- cnt[order(-cnt)][1]+1
top2 <- cnt[order(-cnt)][2]+1
top1/top2
})
return(unlist(res))
}
ratio1 <- getratio(hto1)
summary(unlist(ratio1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 7.496 16.333 16.193 23.765 51.717
length(which(ratio1>3))
## [1] 13211
hto1 <- hto1[,which(ratio1>3)]
ratio2 <- getratio(hto2)
summary(unlist(ratio2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.92 17.59 16.01 23.39 40.35
length(which(ratio2>3))
## [1] 9740
hto2 <- hto2[,which(ratio2>3)]
ratio3 <- getratio(hto3)
summary(unlist(ratio3))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.029 15.477 16.151 25.482 55.842
length(which(ratio3>3))
## [1] 12652
hto3 <- hto3[,which(ratio3>3)]
itx1 <- intersect(colnames(counts1),colnames(hto1))
dim(counts1)
## [1] 36603 15757
counts1 <- counts1[,itx1]
dim(counts1)
## [1] 36603 13211
hto1 <- hto1[,itx1]
dim(hto1)
## [1] 6 13211
itx2 <- intersect(colnames(counts2),colnames(hto2))
dim(counts2)
## [1] 36603 11250
counts2 <- counts2[,itx2]
dim(counts2)
## [1] 36603 9740
hto2 <- hto2[,itx2]
dim(hto2)
## [1] 6 9740
itx3 <- intersect(colnames(counts3),colnames(hto3))
dim(counts3)
## [1] 36603 15133
counts3 <- counts3[,itx3]
dim(counts3)
## [1] 36603 12652
hto3 <- hto3[,itx3]
dim(hto3)
## [1] 6 12652
Sample | Plate | HTO |
---|---|---|
CC0003 | 1 | 1 |
AH0018 | 1 | 3 |
PM008 | 1 | 4 |
PM017 | 1 | 5 |
PM0032 | 2 | 1 |
PM0028 | 2 | 2 |
AH0005 | 2 | 3 |
AH0015 | 2 | 4 |
PM0027 | 3 | 2 |
PM0020 | 3 | 3 |
PM001 | 3 | 4 |
CC0016 | 3 | 5 |
table(apply(hto1,2,function(x) { order(-x) } )[1,] )
##
## 1 3 4 5
## 3272 2889 3468 3582
idx1 <- apply(hto1,2,function(x) { order(-x) } )[1,]
c1h1 <- counts1[,which(idx1==1)] # CC0003
c1h3 <- counts1[,which(idx1==3)] # AH0018
c1h4 <- counts1[,which(idx1==4)] # PM008
c1h5 <- counts1[,which(idx1==5)] # PM017
table(apply(hto2,2,function(x) { order(-x) } )[1,] )
##
## 1 2 3 4
## 2069 2534 2233 2904
idx2 <- apply(hto2,2,function(x) { order(-x) } )[1,]
c2h1 <- counts2[,which(idx2==1)] # PM0032
c2h2 <- counts2[,which(idx2==2)] # PM0028
c2h3 <- counts2[,which(idx2==3)] # AH0005
c2h4 <- counts2[,which(idx2==4)] # AH0015
table(apply(hto3,2,function(x) { order(-x) } )[1,] )
##
## 2 3 4 5
## 3005 2765 3437 3445
idx3 <- apply(hto3,2,function(x) { order(-x) } )[1,]
c3h2 <- counts3[,which(idx3==2)] # PM0027
c3h3 <- counts3[,which(idx3==3)] # PM0020
c3h4 <- counts3[,which(idx3==4)] # PM001
c3h5 <- counts3[,which(idx3==5)] # CC0016
colnames(c1h1) <- paste("CC0003",colnames(c1h1))
colnames(c1h3) <- paste("AH0018",colnames(c1h3))
colnames(c1h4) <- paste("PM008",colnames(c1h4))
colnames(c1h5) <- paste("PM017",colnames(c1h5))
colnames(c2h1) <- paste("PM0032",colnames(c2h1))
colnames(c2h2) <- paste("PM0028",colnames(c2h2))
colnames(c2h3) <- paste("AH0005",colnames(c2h3))
colnames(c2h4) <- paste("AH0015",colnames(c2h4))
colnames(c3h2) <- paste("PM0027",colnames(c3h2))
colnames(c3h3) <- paste("PM0020",colnames(c3h3))
colnames(c3h4) <- paste("PM001",colnames(c3h4))
colnames(c3h5) <- paste("CC0016",colnames(c3h5))
comb <- cbind(c1h1,c1h3,c1h4,c1h5,c2h1,c2h2,c2h3,c2h4,c3h2,c3h3,c3h4,c3h5)
Convert data to summarised experiment object and save.
sce <- SingleCellExperiment(list(counts=comb))
sce
## class: SingleCellExperiment
## dim: 36603 35603
## metadata(0):
## assays(1): counts
## rownames(36603): gene-HIV1-1 gene-HIV1-2 ... AC007325.4 AC007325.2
## rowData names(0):
## colnames(35603): CC0003 AAACCCAAGAATTTGG CC0003 AAACCCAAGCTAGTTC ...
## CC0016 TTTGTTGCATTACGGT CC0016 TTTGTTGGTTATGTCG
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
saveRDS(sce,"combined_counts.rds")
save.image("scanalyse1.Rdata")