Single cell transcriptomes of nk mono cells with HIV 2023

Introduction

There are 3 samples.

1-17032023-GEX
2-09032023-GEX
3-13032023-GEX

I had to make a new custom reference genome with the HIV sequence.

suppressPackageStartupMessages({
  library("plyr")
  library("Seurat")
  library("hdf5r")
  library("SingleCellExperiment")
  library("parallel")
  library("stringi")
  library("beeswarm")
  library("muscat")
  library("DESeq2")
  library("mitch")
})

Load data

Load the h5 matrices.

counts1 <- Read10X_h5("res_1-17032023-GEX/outs/filtered_feature_bc_matrix.h5")
counts2 <- Read10X_h5("res_2-09032023-GEX/outs/filtered_feature_bc_matrix.h5")
counts3 <- Read10X_h5("res_3-13032023-GEX/outs/filtered_feature_bc_matrix.h5")

Examine human and HIV reads.

HIV is located in the top 2 rows. One fwd strand and one negative strand.

dim(counts1)

## [1] 36603 15757

dim(counts2)

## [1] 36603 11250

dim(counts3)

## [1] 36603 15133

summary(colSums(counts1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     513    6010    8986   11743   15951  154572

summary(colSums(counts2))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     503    5736    8758   11493   15742  104864

summary(colSums(counts3))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     501    6090    9762   11451   15394   85917

sum(colSums(counts1))

## [1] 185028635

sum(colSums(counts2))

## [1] 129301813

sum(colSums(counts3))

## [1] 173284461

colnames(counts1) <- gsub("-1","",colnames(counts1))
colnames(counts2) <- gsub("-1","",colnames(counts2))
colnames(counts3) <- gsub("-1","",colnames(counts3))

counts1[1:10,1:5]

## 10 x 5 sparse Matrix of class "dgCMatrix"
##             AAACCCAAGAACAAGG AAACCCAAGAATTTGG AAACCCAAGAGTTGCG AAACCCAAGATAGCAT
## gene-HIV1-1                .                .                .                .
## gene-HIV1-2                .                .                .                .
## MIR1302-2HG                .                .                .                .
## FAM138A                    .                .                .                .
## OR4F5                      .                .                .                .
## AL627309.1                 .                .                .                .
## AL627309.3                 .                .                .                .
## AL627309.2                 .                .                .                .
## AL627309.5                 .                .                .                .
## AL627309.4                 .                .                .                .
##             AAACCCAAGATTAGAC
## gene-HIV1-1                .
## gene-HIV1-2                .
## MIR1302-2HG                .
## FAM138A                    .
## OR4F5                      .
## AL627309.1                 .
## AL627309.3                 .
## AL627309.2                 .
## AL627309.5                 .
## AL627309.4                 .

head(colSums(counts1))

## AAACCCAAGAACAAGG AAACCCAAGAATTTGG AAACCCAAGAGTTGCG AAACCCAAGATAGCAT 
##             7975             7148             6412             5827 
## AAACCCAAGATTAGAC AAACCCAAGCTAGTTC 
##             6161            18524

Write the cell barcodes to a file for the CITE-Seq-count program.

countbc1 <- colnames(counts1)
writeLines(countbc1,con="cell_barcodes1.txt")

countbc2 <- colnames(counts2)
writeLines(countbc2,con="cell_barcodes2.txt")

countbc3 <- colnames(counts3)
writeLines(countbc3,con="cell_barcodes3.txt")

Now read HTO data.

It is strange that HTO cell barcodes do not appear to be common with the main RNA-seq counts.

hto1 <- Read10X("hto_count/1-17032023-HTO/read_count/", gene.column=1)
hto2 <- Read10X("hto_count/2-09032023-HTO/read_count/", gene.column=1)
hto3 <- Read10X("hto_count/3-13032023-HTO/read_count/", gene.column=1)

dim(hto1)

## [1]     6 15733

hto1[,1:20]

## 6 x 20 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 20 column names 'GTGCTGGAGAATGTTG', 'GGCACGTAGTTACGTC', 'CTTCAATGTCGAGCAA' ... ]]

##                                                                              
## HTO1-GTCAACTCTTTAGCG   84   55   79  58  125   79  102 3926  58 4632  51 2348
## HTO2-TGATGGCCTATTGGG    .    .    .   .    .    .    .    .   .    .   .    .
## HTO3-TTCCGCCTCTCTTTG 2168   25 2005  21   10 2221   12    9 352  108 618   30
## HTO4-AGTAAGTTCAGCGTA  207 5775  163 192  177   83  130  135  97  406  81  126
## HTO5-AAGTATCGTTTCGCA 2359   45  151  60 2784   53 3163   53  52  114  21  128
## unmapped              154  224  112  30  112  133   92  212  27  234  21  141
##                                                            
## HTO1-GTCAACTCTTTAGCG  60 4132   66 3036   77 2482   46   64
## HTO2-TGATGGCCTATTGGG   .    .    .    .    .    .    .    .
## HTO3-TTCCGCCTCTCTTTG 169   33 1625   97   22   25 1507   19
## HTO4-AGTAAGTTCAGCGTA 165  145  203  203 3695  119   77 2534
## HTO5-AAGTATCGTTTCGCA 149   40   73   69   47   93   72  114
## unmapped              24  207   72  137  147  120   44  120

dim(counts1)

## [1] 36603 15757

dim(hto1)

## [1]     6 15733

str(which(colnames(counts1) %in% colnames(hto1)))

##  int [1:15733] 1 2 3 4 5 6 7 8 9 10 ...

str(which(colnames(counts2) %in% colnames(hto2)))

##  int [1:11241] 1 2 3 4 5 6 7 8 9 10 ...

str(which(colnames(counts3) %in% colnames(hto3)))

##  int [1:15119] 1 2 3 4 5 6 7 8 9 10 ...

Previously there appears only to be 10 cell barcodes in common. To fix this, I obtained the cell barcode whitelist after running cellranger. I then used this whitelist for CITE-Seq-count with a tolerance of 2 sequence mismatches.

Remove low HTO counts

summary(colSums(hto1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2113    3418    5288    6370   66649

hto1 <- hto1[,which(colSums(hto1)>=100)]
hto2 <- hto2[,which(colSums(hto2)>=100)]
hto3 <- hto3[,which(colSums(hto3)>=100)]

# look at the proportion unmapped
summary(apply(hto1,2,function(x) {x[6]/sum(x) } ) )

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.03021 0.03598 0.03624 0.04171 0.10673

summary(apply(hto2,2,function(x) {x[6]/sum(x) } ) )

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.004274 0.033925 0.038232 0.038896 0.043283 0.151877

summary(apply(hto3,2,function(x) {x[6]/sum(x) } ) )

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02888 0.03369 0.03392 0.03803 0.12423

Identify ambiguous HTO counts

For each cell barcode, calculate the ratio of top BC to 2nd BC.

getratio <- function(mx){
  res <- lapply(1:ncol(mx), function(i) {
    cnt <- mx[,i]
    top1 <- cnt[order(-cnt)][1]+1
    top2 <- cnt[order(-cnt)][2]+1
    top1/top2
  })
  return(unlist(res))
}

ratio1 <- getratio(hto1)
summary(unlist(ratio1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.496  16.333  16.193  23.765  51.717

length(which(ratio1>3))

## [1] 13211

hto1 <- hto1[,which(ratio1>3)]

ratio2 <- getratio(hto2)
summary(unlist(ratio2))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.92   17.59   16.01   23.39   40.35

length(which(ratio2>3))

## [1] 9740

hto2 <- hto2[,which(ratio2>3)]

ratio3 <- getratio(hto3)
summary(unlist(ratio3))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.029  15.477  16.151  25.482  55.842

length(which(ratio3>3))

## [1] 12652

hto3 <- hto3[,which(ratio3>3)]

Match

itx1 <- intersect(colnames(counts1),colnames(hto1))
dim(counts1)

## [1] 36603 15757

counts1 <- counts1[,itx1]
dim(counts1)

## [1] 36603 13211

hto1 <- hto1[,itx1]
dim(hto1)

## [1]     6 13211

itx2 <- intersect(colnames(counts2),colnames(hto2))
dim(counts2)

## [1] 36603 11250

counts2 <- counts2[,itx2]
dim(counts2)

## [1] 36603  9740

hto2 <- hto2[,itx2]
dim(hto2)

## [1]    6 9740

itx3 <- intersect(colnames(counts3),colnames(hto3))
dim(counts3)

## [1] 36603 15133

counts3 <- counts3[,itx3]
dim(counts3)

## [1] 36603 12652

hto3 <- hto3[,itx3]
dim(hto3)

## [1]     6 12652

Get demux result

Sample	Plate	HTO
CC0003	1	1
AH0018	1	3
PM008	1	4
PM017	1	5
PM0032	2	1
PM0028	2	2
AH0005	2	3
AH0015	2	4
PM0027	3	2
PM0020	3	3
PM001	3	4
CC0016	3	5

table(apply(hto1,2,function(x) { order(-x) } )[1,] )

## 
##    1    3    4    5 
## 3272 2889 3468 3582

idx1 <- apply(hto1,2,function(x) { order(-x) } )[1,]
c1h1 <- counts1[,which(idx1==1)] # CC0003
c1h3 <- counts1[,which(idx1==3)] # AH0018
c1h4 <- counts1[,which(idx1==4)] # PM008
c1h5 <- counts1[,which(idx1==5)] # PM017

table(apply(hto2,2,function(x) { order(-x) } )[1,] )

## 
##    1    2    3    4 
## 2069 2534 2233 2904

idx2 <- apply(hto2,2,function(x) { order(-x) } )[1,]
c2h1 <- counts2[,which(idx2==1)] # PM0032
c2h2 <- counts2[,which(idx2==2)] # PM0028
c2h3 <- counts2[,which(idx2==3)] # AH0005
c2h4 <- counts2[,which(idx2==4)] # AH0015

table(apply(hto3,2,function(x) { order(-x) } )[1,] )

## 
##    2    3    4    5 
## 3005 2765 3437 3445

idx3 <- apply(hto3,2,function(x) { order(-x) } )[1,]
c3h2 <- counts3[,which(idx3==2)] # PM0027
c3h3 <- counts3[,which(idx3==3)] # PM0020
c3h4 <- counts3[,which(idx3==4)] # PM001
c3h5 <- counts3[,which(idx3==5)] # CC0016

colnames(c1h1) <- paste("CC0003",colnames(c1h1))
colnames(c1h3) <- paste("AH0018",colnames(c1h3))
colnames(c1h4) <- paste("PM008",colnames(c1h4))
colnames(c1h5) <- paste("PM017",colnames(c1h5))

colnames(c2h1) <- paste("PM0032",colnames(c2h1))
colnames(c2h2) <- paste("PM0028",colnames(c2h2))
colnames(c2h3) <- paste("AH0005",colnames(c2h3))
colnames(c2h4) <- paste("AH0015",colnames(c2h4))

colnames(c3h2) <- paste("PM0027",colnames(c3h2))
colnames(c3h3) <- paste("PM0020",colnames(c3h3))
colnames(c3h4) <- paste("PM001",colnames(c3h4))
colnames(c3h5) <- paste("CC0016",colnames(c3h5))

comb <- cbind(c1h1,c1h3,c1h4,c1h5,c2h1,c2h2,c2h3,c2h4,c3h2,c3h3,c3h4,c3h5)

Convert data to summarised experiment object and save.

sce <- SingleCellExperiment(list(counts=comb))

sce

## class: SingleCellExperiment 
## dim: 36603 35603 
## metadata(0):
## assays(1): counts
## rownames(36603): gene-HIV1-1 gene-HIV1-2 ... AC007325.4 AC007325.2
## rowData names(0):
## colnames(35603): CC0003 AAACCCAAGAATTTGG CC0003 AAACCCAAGCTAGTTC ...
##   CC0016 TTTGTTGCATTACGGT CC0016 TTTGTTGGTTATGTCG
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):

saveRDS(sce,"combined_counts.rds")

save.image("scanalyse1.Rdata")