Introduction

Description of the samples TODO.

suppressPackageStartupMessages({
  library("plyr")
  library("Seurat")
  library("hdf5r")
  library("SingleCellExperiment")
  library("parallel")
  library("stringi")
  library("beeswarm")
  library("muscat")
  library("DESeq2")
  library("mitch")
})

Load data

Load the h5 matrices.

CSP1_counts ./1-CSP/outs/filtered_feature_bc_matrix.h5

GEX1_counts ./1-GEX/outs/filtered_feature_bc_matrix.h5

CSP2_counts ./2-CSP/outs/filtered_feature_bc_matrix.h5

GEX2_counts ./2-GEX/outs/filtered_feature_bc_matrix.h5

M239_N239_Alv_counts ./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5

N239Alv_react ./N239Alv-react/outs/filtered_feature_bc_matrix.h5

P239_Alv_and_MDM ./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5

CSP1_counts <- Read10X_h5("./1-CSP/outs/filtered_feature_bc_matrix.h5")
colnames(CSP1_counts) <- gsub("-1","",colnames(CSP1_counts))
dim(CSP1_counts)
## [1] 36603  1887
ncol(CSP1_counts)
## [1] 1887
summary(colSums(CSP1_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     390    1021    1530    1624    2118    5422
CSP1_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT', 'AAACCCAGTTTCCCAC' ... ]]
##                                                                          
## gene-HIV1-1 . . . . . . . .  . . 1  2 . . . . . . . . . . . . . . . . . .
## gene-HIV1-2 . . . 9 2 . . . 12 . 6 12 . 6 . . . 7 . . . 8 . . . 2 1 . . .
## MIR1302-2HG . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## FAM138A     . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## OR4F5       . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.1  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.3  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.2  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.5  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.4  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
GEX1_counts <- Read10X_h5("./1-GEX/outs/filtered_feature_bc_matrix.h5")
colnames(GEX1_counts) <- gsub("-1","",colnames(GEX1_counts))
ncol(GEX1_counts)
## [1] 11943
summary(colSums(GEX1_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    5514   29095   29074   46098  166793
GEX1_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCAAGAGGGCGA', 'AAACCCAAGGACAACC', 'AAACCCAAGGTAGTCG' ... ]]
##                                                                              
## gene-HIV1-1 . . . 1 . . . .  22 . . 1 . . . .  . . .  . . .  1 . .  4 . . . .
## gene-HIV1-2 . . . 8 . . 3 1 145 1 2 2 . . 1 6 42 1 3 12 . 5 11 1 7 26 . . 1 2
## MIR1302-2HG . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## FAM138A     . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## OR4F5       . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## AL627309.1  . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## AL627309.3  . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## AL627309.2  . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## AL627309.5  . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
## AL627309.4  . . . . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . . .
CSP2_counts <- Read10X_h5("./2-CSP/outs/filtered_feature_bc_matrix.h5")
colnames(CSP2_counts) <- gsub("-1","",colnames(CSP2_counts))
ncol(CSP2_counts)
## [1] 6266
summary(colSums(CSP2_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   26.00   33.00   54.44   60.00  372.00
CSP2_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCAAGAGGGCGA', 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT' ... ]]
##                                                                        
## gene-HIV1-1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## gene-HIV1-2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## MIR1302-2HG . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## FAM138A     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## OR4F5       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.1  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.3  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.2  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.5  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.4  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
GEX2_counts <- Read10X_h5("./2-GEX/outs/filtered_feature_bc_matrix.h5")
colnames(GEX2_counts) <- gsub("-1","",colnames(GEX2_counts))
ncol(GEX2_counts)
## [1] 2715
summary(colSums(GEX2_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     711   12815   52245   55394   87348  259921
GEX2_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT', 'AAACCCAGTGACATCT' ... ]]
##                                                                               
## gene-HIV1-1 . . . . . .  10  . . . . . .   3 .  2 .  12  14 .  12 . . . . .  2
## gene-HIV1-2 . . . 2 4 . 220 98 2 . . . . 205 . 37 . 215 255 . 152 . . . . . 17
## MIR1302-2HG . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## FAM138A     . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## OR4F5       . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## AL627309.1  . . . . . .   .  . . . . . .   . .  . .   .   . .   . 2 . . . .  .
## AL627309.3  . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## AL627309.2  . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## AL627309.5  . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
## AL627309.4  . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .  .
##                     
## gene-HIV1-1 .   .  .
## gene-HIV1-2 1 164 21
## MIR1302-2HG .   .  .
## FAM138A     .   .  .
## OR4F5       .   .  .
## AL627309.1  .   .  .
## AL627309.3  .   .  .
## AL627309.2  .   .  .
## AL627309.5  .   .  .
## AL627309.4  .   .  .
M239_N239_Alv_counts <- Read10X_h5("./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5")
colnames(M239_N239_Alv_counts) <- gsub("-1","",colnames(M239_N239_Alv_counts))
ncol(M239_N239_Alv_counts)
## [1] 10096
summary(colSums(M239_N239_Alv_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500   17449   29574   29758   40389  134943
M239_N239_Alv_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCACACTGAATC', 'AAACCCACAGTCAGAG', 'AAACCCAGTACCCAGC' ... ]]
##                                                                               
## gene-HIV1-1  14 . . .  87 . . . . . .  158 1 . . . . . . . . . . . . . . . . .
## gene-HIV1-2 135 . 1 . 215 1 1 1 . . 2 1619 3 1 . . 1 . . . 1 1 . 4 4 1 2 . 5 .
## MIR1302-2HG   . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## FAM138A       . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## OR4F5         . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## AL627309.1    . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## AL627309.3    . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## AL627309.2    . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
## AL627309.5    . . . 1   . . . . . . .    . . 1 . . . . . . . . . . . . . . . .
## AL627309.4    . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . . .
N239Alv_react_counts <- Read10X_h5("./N239Alv-react/outs/filtered_feature_bc_matrix.h5")
colnames(N239Alv_react_counts) <- gsub("-1","",colnames(N239Alv_react_counts))
ncol(N239Alv_react_counts)
## [1] 3098
summary(colSums(N239Alv_react_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     542   25090   33558   32742   41771   93107
N239Alv_react_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCATCTCGCTCA', 'AAACGAACAGTGAGCA', 'AAACGAATCACTGGTA' ... ]]
##                                                                        
## gene-HIV1-1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## gene-HIV1-2 . . . . 1 . . . . . . 2 . . . . . . . . 1 . . . . . . . . .
## MIR1302-2HG . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## FAM138A     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## OR4F5       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.1  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.3  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.2  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.5  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.4  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
P239_Alv_MDM_counts <- Read10X_h5("./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5")
colnames(P239_Alv_MDM_counts) <- gsub("-1","",colnames(P239_Alv_MDM_counts))
ncol(P239_Alv_MDM_counts)
## [1] 9004
summary(colSums(P239_Alv_MDM_counts))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     506   15076   22383   23960   31715  120277
P239_Alv_MDM_counts[1:10,1:30]
## 10 x 30 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 30 column names 'AAACCCACACAATGAA', 'AAACCCACACGCCACA', 'AAACCCACACGCGTGT' ... ]]
##                                                                               
## gene-HIV1-1 . . . . . . . . .  20 .  . 1  8 .  36 . . .  2  1 . . . 46 . . . 1
## gene-HIV1-2 . . . 3 1 . 1 . 4 233 . 66 2 11 8 145 1 4 . 63 16 . . 1 92 4 . 2 3
## MIR1302-2HG . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## FAM138A     . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## OR4F5       . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## AL627309.1  . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## AL627309.3  . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## AL627309.2  . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## AL627309.5  . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
## AL627309.4  . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .  . . . . .
##              
## gene-HIV1-1 .
## gene-HIV1-2 .
## MIR1302-2HG .
## FAM138A     .
## OR4F5       .
## AL627309.1  .
## AL627309.3  .
## AL627309.2  .
## AL627309.5  .
## AL627309.4  .

CSP1_counts ./1-CSP/outs/filtered_feature_bc_matrix.h5

GEX1_counts ./1-GEX/outs/filtered_feature_bc_matrix.h5

CSP2_counts ./2-CSP/outs/filtered_feature_bc_matrix.h5

GEX2_counts ./2-GEX/outs/filtered_feature_bc_matrix.h5

M239_N239_Alv_counts ./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5

N239Alv_react_counts ./N239Alv-react/outs/filtered_feature_bc_matrix.h5

P239_Alv_MDM_counts ./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5

if (! file.exists("CSP1_cell_barcodes.txt") ) {
  CSP1_bc <- colnames(CSP1_counts)
  writeLines(CSP1_bc,con="CSP1_cell_barcodes.txt")
}

if (! file.exists("GEX1_cell_barcodes.txt") ) {
  GEX1_bc <- colnames(GEX1_counts)
  writeLines(GEX1_bc,con="GEX1_cell_barcodes.txt")
}

if (! file.exists("CSP2_cell_barcodes.txt") ) {
  CSP2_bc <- colnames(CSP2_counts)
  writeLines(CSP2_bc,con="CSP2_cell_barcodes.txt")
}

if (! file.exists("GEX2_cell_barcodes.txt") ) {
  GEX2_bc <- colnames(GEX2_counts)
  writeLines(GEX2_bc,con="GEX2_cell_barcodes.txt")
}

if (! file.exists("M239_N239_Alv_cell_barcodes.txt") ) {
  M239_N239_Alv_bc <- colnames(M239_N239_Alv_counts)
  writeLines(M239_N239_Alv_bc,con="M239_N239_Alv_cell_barcodes.txt")
}

if (! file.exists("N239Alv_react_cell_barcodes.txt") ) {
  N239Alv_react_bc <- colnames(N239Alv_react_counts)
  writeLines(N239Alv_react_bc,con="N239Alv_react_cell_barcodes.txt")
}

if (! file.exists("P239_Alv_and_MDM_cell_barcodes.txt") ) {
  P239_Alv_and_MDM_bc <- colnames(P239_Alv_MDM_counts)
  writeLines(P239_Alv_and_MDM_bc,con="P239_Alv_and_MDM_cell_barcodes.txt")
}

Now read HTO data.

gex1_hto <- Read10X("1-CSP_hto/read_count/", gene.column=1)
#gex1_hto_trash <- Read10X("trash/1-CSP_hto/read_count/", gene.column=1)
summary(colSums(gex1_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      16    3580    7222    9093   12782   51772
dim(gex1_hto)
## [1]     7 11266
gex2_hto <- Read10X("2-CSP_hto/read_count/", gene.column=1)
summary(colSums(gex2_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      27    4110   13737   15216   24504   64861
dim(gex2_hto)
## [1]    7 1689
M239_N239_Alv_hto <- Read10X("M239-N239-Alv_hto/read_count/", gene.column=1)
summary(colSums(M239_N239_Alv_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1420    2897    4551    6469   40064
dim(M239_N239_Alv_hto)
## [1]     7 10083
P239_Alv_MDM_hto <- Read10X("P239-Alv-and-MDM_hto/read_count/", gene.column=1)
summary(colSums(P239_Alv_MDM_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2072    5122    6850   10136   53637
dim(P239_Alv_MDM_hto)
## [1]    7 8872
str(which(colnames(GEX1_counts) %in% colnames(gex1_hto)))
##  int [1:11266] 1 2 3 4 5 6 7 8 9 10 ...
str(which(colnames(GEX2_counts) %in% colnames(gex2_hto)))
##  int [1:1689] 3 4 5 6 7 8 9 10 11 12 ...
str(which(colnames(M239_N239_Alv_counts) %in% colnames(M239_N239_Alv_hto)))
##  int [1:10083] 1 2 3 4 5 6 7 8 9 10 ...
str(which(colnames(P239_Alv_MDM_counts) %in% colnames(P239_Alv_MDM_hto)))
##  int [1:8872] 1 2 3 5 6 7 8 9 10 11 ...

Remove low HTO counts

summary(colSums(gex1_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      16    3580    7222    9093   12782   51772
table(colSums(gex1_hto)>=100)
## 
## FALSE  TRUE 
##   213 11053
gex1_hto <- gex1_hto[,which(colSums(gex1_hto)>=100)]
summary(colSums(gex1_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     101    3764    7356    9267   12923   51772
summary(colSums(gex2_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      27    4110   13737   15216   24504   64861
table(colSums(gex2_hto)>=100)
## 
## FALSE  TRUE 
##     1  1688
gex2_hto <- gex2_hto[,which(colSums(gex2_hto)>=100)]
summary(colSums(gex2_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     145    4110   13746   15225   24512   64861
summary(colSums(M239_N239_Alv_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1420    2897    4551    6469   40064
table(colSums(M239_N239_Alv_hto)>=100)
## 
## FALSE  TRUE 
##   308  9775
M239_N239_Alv_hto <- M239_N239_Alv_hto[,which(colSums(M239_N239_Alv_hto)>=100)]
summary(colSums(M239_N239_Alv_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     104    1530    3039    4694    6620   40064
summary(colSums(P239_Alv_MDM_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2072    5122    6850   10136   53637
table(colSums(P239_Alv_MDM_hto)>=100)
## 
## FALSE  TRUE 
##   242  8630
P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,which(colSums(P239_Alv_MDM_hto)>=100)]
summary(colSums(P239_Alv_MDM_hto))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     105    2253    5345    7042   10302   53637

Look at the proportion unmapped

summary(apply(gex1_hto,2,function(x) {x[6]/sum(x) } ) )
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 8.014e-05 9.345e-05 7.634e-03
summary(apply(gex2_hto,2,function(x) {x[6]/sum(x) } ) )
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 8.596e-05 7.020e-05 1.285e-02
summary(apply(M239_N239_Alv_hto,2,function(x) {x[6]/sum(x) } ) )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001816 0.031254 0.085117 0.304038 0.656792 0.976466
summary(apply(P239_Alv_MDM_hto,2,function(x) {x[6]/sum(x) } ) )
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0009302 0.0228943 0.0751835 0.3774595 0.9449641 0.9855072

Identify ambiguous HTO counts

For each cell barcode, calculate the ratio of top BC to 2nd BC.

getratio <- function(mx){
  res <- lapply(1:ncol(mx), function(i) {
    cnt <- mx[,i]
    top1 <- cnt[order(-cnt)][1]+1
    top2 <- cnt[order(-cnt)][2]+1
    top1/top2
  })
  return(unlist(res))
}

gex1_hto_ratio <- getratio(gex1_hto)
summary(unlist(gex1_hto_ratio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.569  13.970  18.889  30.167 287.357
table(gex1_hto_ratio>3)
## 
## FALSE  TRUE 
##  1527  9526
gex1_hto <- gex1_hto[,which(gex1_hto_ratio>3)]

gex2_hto_ratio <- getratio(gex2_hto)
summary(unlist(gex2_hto_ratio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.643  23.788  22.126  35.101  60.985
table(gex2_hto_ratio>3)
## 
## FALSE  TRUE 
##   201  1487
gex2_hto <- gex2_hto[,which(gex2_hto_ratio>3)]

M239_N239_Alv_hto_ratio <- getratio(M239_N239_Alv_hto)
summary(unlist(M239_N239_Alv_hto_ratio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.849  18.830  19.447  31.135  81.322
table(M239_N239_Alv_hto_ratio>3)
## 
## FALSE  TRUE 
##  1492  8283
M239_N239_Alv_hto <- M239_N239_Alv_hto[,which(M239_N239_Alv_hto_ratio>3)]

P239_Alv_MDM_hto_ratio <- getratio(P239_Alv_MDM_hto)
summary(unlist(P239_Alv_MDM_hto_ratio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.70   29.37   25.73   33.78   95.67
table(P239_Alv_MDM_hto_ratio>3)
## 
## FALSE  TRUE 
##   591  8039
P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,which(P239_Alv_MDM_hto_ratio>3)]

Match

gex1_itx <- intersect(colnames(GEX1_counts),colnames(gex1_hto))
dim(GEX1_counts)
## [1] 36603 11943
GEX1_counts <- GEX1_counts[,gex1_itx]
dim(GEX1_counts)
## [1] 36603  9526
gex1_hto <- gex1_hto[,gex1_itx]
dim(gex1_hto)
## [1]    7 9526
gex2_itx <- intersect(colnames(GEX2_counts),colnames(gex2_hto))
dim(GEX2_counts)
## [1] 36603  2715
GEX2_counts <- GEX2_counts[,gex2_itx]
dim(GEX2_counts)
## [1] 36603  1487
gex2_hto <- gex2_hto[,gex2_itx]
dim(gex2_hto)
## [1]    7 1487
M239_N239_Alv_itx <- intersect(colnames(M239_N239_Alv_counts),colnames(M239_N239_Alv_hto))
dim(M239_N239_Alv_counts)
## [1] 36603 10096
M239_N239_Alv_counts <- M239_N239_Alv_counts[,M239_N239_Alv_itx]
dim(M239_N239_Alv_counts)
## [1] 36603  8283
M239_N239_Alv_hto <- M239_N239_Alv_hto[,M239_N239_Alv_itx]
dim(M239_N239_Alv_hto)
## [1]    7 8283
P239_Alv_MDM_itx <- intersect(colnames(P239_Alv_MDM_counts),colnames(P239_Alv_MDM_hto))
dim(P239_Alv_MDM_counts)
## [1] 36603  9004
P239_Alv_MDM_counts <- P239_Alv_MDM_counts[,P239_Alv_MDM_itx]
dim(P239_Alv_MDM_counts)
## [1] 36603  8039
P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,P239_Alv_MDM_itx]
dim(P239_Alv_MDM_hto)
## [1]    7 8039

Get demux result

Hash tagging sheet.

Library HTO Sample
gex1 1 mock (donor P and V)
gex1 2 gfp_neg (donor P and V)
gex1 4 gfp_pos (donor P and V)
gex2 1 mock donor O
gex2 2 gfp_neg donor O
gex2 4 gfp_pos donor O
M239_N239_Alv 1 AlvM239_mock
M239_N239_Alv 2 AlvM239_gfp_neg
M239_N239_Alv 3 AlvM239_gfp_pos
M239_N239_Alv 4 AlvN239_mock
M239_N239_Alv 5 AlvN239_gfp_neg
M239_N239_Alv 6 AlvN239_gfp_pos
P239_Alv_MDM_hto 1 MDM_mock
P239_Alv_MDM_hto 2 MDM_gfp_neg
P239_Alv_MDM_hto 3 MDM_gfp_pos
P239_Alv_MDM_hto 4 Alv_mock
P239_Alv_MDM_hto 5 Alv_gfp_neg
P239_Alv_MDM_hto 6 Alv_gfp_pos

Input counts:

  • GEX1_counts gex1_hto

  • GEX2_counts gex2_hto

  • M239_N239_Alv M239_N239_Alv_hto

  • P239_Alv_MDM P239_Alv_MDM_hto

table(apply(gex1_hto,2,function(x) { order(-x) } )[1,] )
## 
##    1    2    4    7 
## 1532 1448 6030  516
gex1_idx <- apply(gex1_hto,2,function(x) { order(-x) } )[1,]
gex1_h1 <- GEX1_counts[,which(gex1_idx==1)] # mock (donor P and V)
gex1_h2 <- GEX1_counts[,which(gex1_idx==2)] # gfp_neg (donor P and V) 
gex1_h4 <- GEX1_counts[,which(gex1_idx==4)] # gfp_pos (donor P and V)

table(apply(gex2_hto,2,function(x) { order(-x) } )[1,] )
## 
##   1   2   4   7 
## 177 421 888   1
gex2_idx <- apply(gex2_hto,2,function(x) { order(-x) } )[1,]
gex2_h1 <- GEX2_counts[,which(gex2_idx==1)] # mock donor O
gex2_h2 <- GEX2_counts[,which(gex2_idx==2)] # gfp_neg donor O 
gex2_h4 <- GEX2_counts[,which(gex2_idx==4)] # gfp_pos donor O

table(apply(M239_N239_Alv_hto,2,function(x) { order(-x) } )[1,] )
## 
##    1    2    3    4    5    6 
##  804  621 2335 1354  713 2456
M239_N239_Alv_idx <- apply(M239_N239_Alv_hto,2,function(x) { order(-x) } )[1,]
M239_N239_Alv_h1 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==1)] # AlvM239_mock
M239_N239_Alv_h2 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==2)] # AlvM239_gfp_neg
M239_N239_Alv_h3 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==3)] # AlvM239_gfp_pos
M239_N239_Alv_h4 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==4)] # AlvN239_mock
M239_N239_Alv_h5 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==5)] # AlvN239_gfp_neg        
M239_N239_Alv_h6 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==6)] # AlvN239_gfp_pos     

table(apply(P239_Alv_MDM_hto,2,function(x) { order(-x) } )[1,] )
## 
##    1    2    3    4    5    6 
##  815  436 1811  974  945 3058
P239_Alv_MDM_idx <- apply(P239_Alv_MDM_hto,2,function(x) { order(-x) } )[1,]
P239_Alv_MDM_h1 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==1)] # MDM_mock
P239_Alv_MDM_h2 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==2)] # MDM_gfp_neg
P239_Alv_MDM_h3 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==3)] # MDM_gfp_pos
P239_Alv_MDM_h4 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==4)] # Alv_mock
P239_Alv_MDM_h5 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==5)] # Alv_gfp_neg
P239_Alv_MDM_h6 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==6)] # Alv_gfp_pos

Distinguish SNP pooled samples

Data was previously genotyped and called using cellSNP-lite and vireo.

snp <- read.table("1-GEX_vireo/donor_ids.tsv",header=TRUE)

donor0 <- subset(snp,donor_id=="donor0")$cell
donor0 <- gsub("-1","",donor0)

donor1 <- subset(snp,donor_id=="donor1")$cell
donor1 <- gsub("-1","",donor1)

gex1_h1_d0 <- gex1_h1[,colnames(gex1_h1) %in% donor0]
dim(gex1_h1_d0)
## [1] 36603   796
gex1_h1_d1 <- gex1_h1[,colnames(gex1_h1) %in% donor1]
dim(gex1_h1_d1)
## [1] 36603   659
gex1_h2_d0 <- gex1_h2[,colnames(gex1_h2) %in% donor0]
dim(gex1_h2_d0)
## [1] 36603   880
gex1_h2_d1 <- gex1_h2[,colnames(gex1_h2) %in% donor1]
dim(gex1_h2_d1)
## [1] 36603   479
gex1_h4_d0 <- gex1_h4[,colnames(gex1_h4) %in% donor0]
dim(gex1_h4_d0)
## [1] 36603  2666
gex1_h4_d1 <- gex1_h4[,colnames(gex1_h4) %in% donor1]
dim(gex1_h4_d1)
## [1] 36603  2585

Which is female or male?

Let’s look at the expression of genes on sex cheomosomes.

BTW here’s the bash script to get the chr2gene tsv file:

zcat genes.gtf.gz \
| grep ^chr \
| cut -f1,9 \
| sed 's/; /\n/g' \
| egrep  '(chr|gene_name)' \
| sed 's/gene_name "//' \
| tr -d '"' \
| cut -f1 \
| paste - - \
| uniq \
| sort -u > gene_chr.tsv
gene_chr <- read.table("ref_combined1/hum_hiv/genes/gene_chr.tsv")
head(gene_chr)
##      V1         V2
## 1 chr10       A1CF
## 2 chr10      ABCC2
## 3 chr10       ABI1
## 4 chr10     ABLIM1
## 5 chr10   ABRAXAS2
## 6 chr10 AC005383.1
chrx <- subset(gene_chr,V1=="chrX")$V2
chry <- subset(gene_chr,V1=="chrY")$V2
chrm <- subset(gene_chr,V1=="chrM")$V2
chra <- gene_chr$V2[!gene_chr$V2 %in% c(chrx,chry,chrm)]
str(gene_chr)
## 'data.frame':    36562 obs. of  2 variables:
##  $ V1: chr  "chr10" "chr10" "chr10" "chr10" ...
##  $ V2: chr  "A1CF" "ABCC2" "ABI1" "ABLIM1" ...
str(chrx)
##  chr [1:1146] "ABCB7" "ABCD1" "AC000113.1" "AC002072.1" "AC002368.1" ...
str(chry)
##  chr [1:111] "AC006040.1" "AC006157.1" "AC007244.1" "AC007359.1" ...
str(chrm)
##  chr [1:13] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" ...
d0y <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chry),]))
d0x <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chrx),]))
d0m <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chrm),]))
d0a <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chra),]))

d1y <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chry),]))
d1x <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chrx),]))
d1m <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chrm),]))
d1a <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chra),]))

message(paste("Donor 0 chrY:",signif(d0y/d0a * 100,3),"%"))
## Donor 0 chrY: 0 %
message(paste("Donor 0 chrX:",signif(d0x/d0a * 100,3),"%"))
## Donor 0 chrX: 3.33 %
message(paste("Donor 0 chrM:",signif(d0m/d0a * 100,3),"%"))
## Donor 0 chrM: 5.7 %
message(paste("Donor 1 chrY:",signif(d1y/d1a * 100,3),"%"))
## Donor 1 chrY: 0.068 %
message(paste("Donor 1 chrX:",signif(d1x/d1a * 100,3),"%"))
## Donor 1 chrX: 3.52 %
message(paste("Donor 1 chrM:",signif(d1m/d1a * 100,3),"%"))
## Donor 1 chrM: 11.2 %

Therefore donor 0 is female and 1 is male.

Check HIV in these

hiv <- mean(colSums(gex1_h1_d0[1:2,])) / mean(colSums(gex1_h1_d0[3:nrow(gex1_h1_d0),]))
message(paste("Mock Donor 0 HIV:",signif(hiv,3),"%"))
## Mock Donor 0 HIV: 1.99e-05 %
hiv <- mean(colSums(gex1_h1_d1[1:2,])) / mean(colSums(gex1_h1_d1[3:nrow(gex1_h1_d1),]))
message(paste("Mock Donor 1 HIV:",signif(hiv,3),"%"))
## Mock Donor 1 HIV: 2.69e-05 %
hiv <- mean(colSums(gex1_h2_d0[1:2,])) / mean(colSums(gex1_h2_d0[3:nrow(gex1_h2_d0),]))
message(paste("GFP- Donor 0 HIV:",signif(hiv,3),"%"))
## GFP- Donor 0 HIV: 0.00249 %
hiv <- mean(colSums(gex1_h2_d1[1:2,])) / mean(colSums(gex1_h2_d1[3:nrow(gex1_h2_d1),]))
message(paste("GFP- Donor 1 HIV:",signif(hiv,3),"%"))
## GFP- Donor 1 HIV: 0.00219 %
hiv <- mean(colSums(gex1_h4_d0[1:2,])) / mean(colSums(gex1_h4_d0[3:nrow(gex1_h4_d0),]))
message(paste("GFP+ Donor 0 HIV:",signif(hiv,3),"%"))
## GFP+ Donor 0 HIV: 9.52e-05 %
hiv <- mean(colSums(gex1_h4_d1[1:2,])) / mean(colSums(gex1_h4_d1[3:nrow(gex1_h4_d1),]))
message(paste("GFP+ Donor 1 HIV:",signif(hiv,3),"%"))
## GFP+ Donor 1 HIV: 0.000104 %
hiv <- mean(colSums(gex2_h1[1:2,])) / mean(colSums(gex2_h1[3:nrow(gex2_h1),]))
message(paste("Mock Donor 2 HIV:",signif(hiv,3),"%"))
## Mock Donor 2 HIV: 1.36e-05 %
hiv <- mean(colSums(gex2_h2[1:2,])) / mean(colSums(gex2_h2[3:nrow(gex2_h2),]))
message(paste("GFP- Donor 2 HIV:",signif(hiv,3),"%"))
## GFP- Donor 2 HIV: 0.00297 %
hiv <- mean(colSums(gex2_h4[1:2,])) / mean(colSums(gex2_h4[3:nrow(gex2_h4),]))
message(paste("GFP+ Donor 2 HIV:",signif(hiv,3),"%"))
## GFP+ Donor 2 HIV: 0.000111 %
hiv <- mean(colSums(P239_Alv_MDM_h1[1:2,])) / mean(colSums(P239_Alv_MDM_h1[3:nrow(P239_Alv_MDM_h1),]))
message(paste("Mock Donor 3 HIV:",signif(hiv,3),"%"))
## Mock Donor 3 HIV: 5.87e-05 %
hiv <- mean(colSums(P239_Alv_MDM_h2[1:2,])) / mean(colSums(P239_Alv_MDM_h2[3:nrow(P239_Alv_MDM_h2),]))
message(paste("GFP+ Donor 3 HIV:",signif(hiv,3),"%"))
## GFP+ Donor 3 HIV: 0.00338 %
hiv <- mean(colSums(P239_Alv_MDM_h3[1:2,])) / mean(colSums(P239_Alv_MDM_h3[3:nrow(P239_Alv_MDM_h3),]))
message(paste("GPF- Donor 3 HIV:",signif(hiv,3),"%"))
## GPF- Donor 3 HIV: 6.72e-05 %
hiv <- mean(colSums(P239_Alv_MDM_h4[1:2,])) / mean(colSums(P239_Alv_MDM_h4[3:nrow(P239_Alv_MDM_h4),]))
message(paste("Mock Donor 4 HIV:",signif(hiv,3),"%"))
## Mock Donor 4 HIV: 3.99e-05 %
hiv <- mean(colSums(P239_Alv_MDM_h5[1:2,])) / mean(colSums(P239_Alv_MDM_h5[3:nrow(P239_Alv_MDM_h5),]))
message(paste("GFP+ Donor 4 HIV:",signif(hiv,3),"%"))
## GFP+ Donor 4 HIV: 0.00501 %
hiv <- mean(colSums(P239_Alv_MDM_h6[1:2,])) / mean(colSums(P239_Alv_MDM_h6[3:nrow(P239_Alv_MDM_h6),]))
message(paste("GFP- Donor 4 HIV:",signif(hiv,3),"%"))
## GFP- Donor 4 HIV: 0.000213 %
hiv <- mean(colSums(M239_N239_Alv_h1[1:2,])) / mean(colSums(M239_N239_Alv_h1[3:nrow(M239_N239_Alv_h1),]))
message(paste("Mock Donor 5 HIV:",signif(hiv,3),"%"))
## Mock Donor 5 HIV: 6.94e-05 %
hiv <- mean(colSums(M239_N239_Alv_h2[1:2,])) / mean(colSums(M239_N239_Alv_h2[3:nrow(M239_N239_Alv_h2),]))
message(paste("Mock Donor 5 HIV:",signif(hiv,3),"%"))
## Mock Donor 5 HIV: 0.00456 %

Yes it looks like GFP minus and positive were swapped

Renaming things

From now on we will rename them: “mock”, “latent”, “active”

mock0 <- gex1_h1_d0 #MDMP236 mock
mock1 <- gex1_h1_d1 #MDMV236 mock
mock2 <- gex2_h1 #MDMO236 mock
mock3 <- P239_Alv_MDM_h1 #MDMP239 mock
mock4 <- P239_Alv_MDM_h4 #AlvP239 mock
mock5 <- M239_N239_Alv_h1 #AlvM239 mock
mock6 <- M239_N239_Alv_h4 #AlvN239 mock

active0 <- gex1_h2_d0 #MDMP236 active
active1 <- gex1_h2_d1 #MDMV236 active
active2 <- gex2_h2 #MDMO236 active
active3 <- P239_Alv_MDM_h2 #MDMP239 active
active4 <- P239_Alv_MDM_h5 #AlvP239 active
active5 <- M239_N239_Alv_h2 #AlvM239 active
active6 <- M239_N239_Alv_h5 #AlvN239 active

latent0 <- gex1_h4_d0[,which(colSums(gex1_h4_d0[1:2,])/colSums(gex1_h4_d0)*1e6>=10)] #MDMP236 latent
bystander0 <- gex1_h4_d0[,which(colSums(gex1_h4_d0[1:2,])/colSums(gex1_h4_d0)*1e6<10)] #MDMP236 bystander

latent1 <- gex1_h4_d1[,which(colSums(gex1_h4_d1[1:2,])/colSums(gex1_h4_d1)*1e6>=10)] #MDMV236 latent
bystander1 <- gex1_h4_d1[,which(colSums(gex1_h4_d1[1:2,])/colSums(gex1_h4_d1)*1e6<10)] #MDMV236 bystander

latent2 <- gex2_h4[,which(colSums(gex2_h4[1:2,])/colSums(gex2_h4)*1e6>=10)] #MDMO236 latent
bystander2 <- gex2_h4[,which(colSums(gex2_h4[1:2,])/colSums(gex2_h4)*1e6<10)] #MDMO236 bystander

latent3 <- P239_Alv_MDM_h3[,which(colSums(P239_Alv_MDM_h3[1:2,])/colSums(P239_Alv_MDM_h3)*1e6>=10)] #MDMP239 latent
bystander3 <- P239_Alv_MDM_h3[,which(colSums(P239_Alv_MDM_h3[1:2,])/colSums(P239_Alv_MDM_h3)*1e6<10)] #MDMP239 bystander

latent4 <- P239_Alv_MDM_h6[,which(colSums(P239_Alv_MDM_h6[1:2,])/colSums(P239_Alv_MDM_h6)*1e6>=10)] #AlvP239 latent
bystander4 <- P239_Alv_MDM_h6[,which(colSums(P239_Alv_MDM_h6[1:2,])/colSums(P239_Alv_MDM_h6)*1e6<10)] #AlvP239 bystander

latent5 <- M239_N239_Alv_h3[,which(colSums(M239_N239_Alv_h3[1:2,])/colSums(M239_N239_Alv_h3)*1e6>=10)] #AlvM239 latent
bystander5 <- M239_N239_Alv_h3[,which(colSums(M239_N239_Alv_h3[1:2,])/colSums(M239_N239_Alv_h3)*1e6<10)] #AlvM239 bystander

latent6 <- M239_N239_Alv_h6[,which(colSums(M239_N239_Alv_h6[1:2,])/colSums(M239_N239_Alv_h6)*1e6>=10)] #AlvN239 latent
bystander6 <- M239_N239_Alv_h6[,which(colSums(M239_N239_Alv_h6[1:2,])/colSums(M239_N239_Alv_h6)*1e6<10)] #AlvN239 bystander

react6 <- N239Alv_react_counts #AlvN239

colnames(mock0) <- paste("mock0|",colnames(mock0),sep="")
colnames(mock1) <- paste("mock1|",colnames(mock1),sep="")
colnames(mock2) <- paste("mock2|",colnames(mock2),sep="")
colnames(mock3) <- paste("mock3|",colnames(mock3),sep="")
colnames(mock4) <- paste("mock4|",colnames(mock4),sep="")
colnames(mock5) <- paste("mock5|",colnames(mock5),sep="")
colnames(mock6) <- paste("mock6|",colnames(mock6),sep="")

colnames(active0) <- paste("active0|",colnames(active0),sep="")
colnames(active1) <- paste("active1|",colnames(active1),sep="")
colnames(active2) <- paste("active2|",colnames(active2),sep="")
colnames(active3) <- paste("active3|",colnames(active3),sep="")
colnames(active4) <- paste("active4|",colnames(active4),sep="")
colnames(active5) <- paste("active5|",colnames(active5),sep="")
colnames(active6) <- paste("active6|",colnames(active6),sep="")

colnames(bystander0) <- paste("bystander0|",colnames(bystander0),sep="")
colnames(bystander1) <- paste("bystander1|",colnames(bystander1),sep="")
colnames(bystander2) <- paste("bystander2|",colnames(bystander2),sep="")
colnames(bystander3) <- paste("bystander3|",colnames(bystander3),sep="")
colnames(bystander4) <- paste("bystander4|",colnames(bystander4),sep="")
colnames(bystander5) <- paste("bystander5|",colnames(bystander5),sep="")
colnames(bystander6) <- paste("bystander6|",colnames(bystander6),sep="")

colnames(latent0) <- paste("latent0|",colnames(latent0),sep="")
colnames(latent1) <- paste("latent1|",colnames(latent1),sep="")
colnames(latent2) <- paste("latent2|",colnames(latent2),sep="")
colnames(latent3) <- paste("latent3|",colnames(latent3),sep="")
colnames(latent4) <- paste("latent4|",colnames(latent4),sep="")
colnames(latent5) <- paste("latent5|",colnames(latent5),sep="")
colnames(latent6) <- paste("latent6|",colnames(latent6),sep="")

colnames(react6) <- paste("react6|",colnames(react6),sep="")

mylist <- list(mock0,mock1,mock2,mock3,mock4,mock5,mock6,
  active0,active1,active2,active3,active4,active5,active6,
  latent0,latent1,latent2,latent3,latent4,latent5,latent6,
  bystander0,bystander1,bystander2,bystander3,bystander4,bystander5,bystander6,
  react6)

names(mylist) <- c("mock0","mock1","mock2","mock3","mock4","mock5","mock6",
  "active0","active1","active2","active3","active4","active5","active6",
  "latent0","latent1","latent2","latent3","latent4","latent5","latent6",
  "bystander0","bystander1","bystander2","bystander3","bystander4","bystander5","bystander6",
  "react6")

comb <- do.call(cbind,mylist)

rev(sapply(mylist,ncol))
##     react6 bystander6 bystander5 bystander4 bystander3 bystander2 bystander1 
##       3098        581        504       1122       1059        540       1334 
## bystander0    latent6    latent5    latent4    latent3    latent2    latent1 
##       1494       1875       1831       1936        752        348       1251 
##    latent0    active6    active5    active4    active3    active2    active1 
##       1172        713        621        945        436        421        479 
##    active0      mock6      mock5      mock4      mock3      mock2      mock1 
##        880       1354        804        974        815        177        659 
##      mock0 
##        796
barplot(rev(sapply(mylist,ncol)),horiz=TRUE,las=1,xlab="no. cells")

rev(sapply(mylist,function(mx) {sum(colSums(mx))}))
##     react6 bystander6 bystander5 bystander4 bystander3 bystander2 bystander1 
##  101434715   13581660   14197319   22576981   20899809   16347008   37007345 
## bystander0    latent6    latent5    latent4    latent3    latent2    latent1 
##   40706682   52551196   57038454   43828374   16683014   15146215   39470469 
##    latent0    active6    active5    active4    active3    active2    active1 
##   35367506   24294045   28767640   30368943   13952159   26354854   22949672 
##    active0      mock6      mock5      mock4      mock3      mock2      mock1 
##   31058301   35198714   25658289   20524272   20766932    7574030   21671028 
##      mock0 
##   29204285
barplot(rev(sapply(mylist,function(mx) {sum(colSums(mx))})),horiz=TRUE,las=1,xlab="no. counts")

res <- rev(sapply(mylist,function(mx) {
  mean(colSums(mx[1:2,])) /  mean(colSums(mx[3:nrow(mx),])) *100
}))
res
##       react6   bystander6   bystander5   bystander4   bystander3   bystander2 
## 8.335119e-03 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 3.670398e-05 
##   bystander1   bystander0      latent6      latent5      latent4      latent3 
## 0.000000e+00 0.000000e+00 5.531000e-02 2.802404e-02 3.228175e-02 1.513145e-02 
##      latent2      latent1      latent0      active6      active5      active4 
## 2.300113e-02 2.010515e-02 2.047779e-02 4.908506e-01 4.561895e-01 5.008139e-01 
##      active3      active2      active1      active0        mock6        mock5 
## 3.379896e-01 2.969919e-01 2.189171e-01 2.493880e-01 2.206249e-02 6.941709e-03 
##        mock4        mock3        mock2        mock1        mock0 
## 3.990557e-03 5.870254e-03 1.359929e-03 2.694915e-03 1.986049e-03
barplot(rev(res),horiz=TRUE,las=1,xlab="% HIV reads")

Analysis by chromosome

chrmx <- sapply(mylist,function(mx) {
  rsum <- as.data.frame(rowSums(mx))
  m <- merge(gene_chr,rsum,by.x="V2",by.y=0)
  m$V2=NULL
  ag <- aggregate(. ~ V1 , m , sum)
  out <- ag[,2]
  names(out) <- ag[,1]
  return(out)
})

chryx <- chrmx["chrY",] / chrmx["chrX",] * 100
chryx
##       mock0       mock1       mock2       mock3       mock4       mock5 
## 0.031172346 2.182391164 0.002422481 1.295832427 1.540775049 1.611251947 
##       mock6     active0     active1     active2     active3     active4 
## 0.073153613 0.041594896 2.102213034 0.002521683 1.440742401 1.576667523 
##     active5     active6     latent0     latent1     latent2     latent3 
## 1.677336931 0.040418557 0.048474051 2.338805483 0.002960918 1.359138677 
##     latent4     latent5     latent6  bystander0  bystander1  bystander2 
## 1.627231785 1.618820278 0.106820775 0.032885588 2.341440052 0.002211842 
##  bystander3  bystander4  bystander5  bystander6      react6 
## 1.311121761 1.586910279 1.642328742 0.057817427 0.825632571
barplot(rev(chryx),horiz=TRUE,las=1,xlab="Ratio ChrY to ChrX x100")

These checks look consistent, except for the react6 which appears as more male but that is not consistent with mock6, active6 and latent6.

chrmt <- chrmx["chrM",] / colSums(chrmx[1:22,]) *100
chrmt
##      mock0      mock1      mock2      mock3      mock4      mock5      mock6 
##   5.640547  12.170054  13.618687   4.525319   8.676031  11.244711  13.267361 
##    active0    active1    active2    active3    active4    active5    active6 
##   6.309643   7.977511   9.660507   3.423388   5.698696   5.857497   8.241454 
##    latent0    latent1    latent2    latent3    latent4    latent5    latent6 
##   6.837145  12.548767  15.863535   5.450106   7.388422   9.886849  10.559073 
## bystander0 bystander1 bystander2 bystander3 bystander4 bystander5 bystander6 
##   6.134851  11.734885  12.933701   5.862922   7.932224  10.203284  11.634663 
##     react6 
##   5.610831
barplot(rev(chrmt),horiz=TRUE,las=1,xlab="% chrM reads")

Partitioning bystander and latent

Save data object

saveRDS(mylist,"macrophage_counts.rds")

Session information

For reproducibility.

save.image("macrophage_dataprep.Rdata")


sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] mitch_1.16.0                DESeq2_1.44.0              
##  [3] muscat_1.18.0               beeswarm_0.4.0             
##  [5] stringi_1.8.4               SingleCellExperiment_1.26.0
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [11] IRanges_2.38.0              S4Vectors_0.42.0           
## [13] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
## [15] matrixStats_1.3.0           hdf5r_1.3.11               
## [17] Seurat_5.1.0                SeuratObject_5.0.2         
## [19] sp_2.1-4                    plyr_1.8.9                 
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.sparse_3.1-0     bitops_1.0-7             
##   [3] httr_1.4.7                RColorBrewer_1.1-3       
##   [5] doParallel_1.0.17         numDeriv_2016.8-1.1      
##   [7] backports_1.5.0           tools_4.4.1              
##   [9] sctransform_0.4.1         utf8_1.2.4               
##  [11] R6_2.5.1                  lazyeval_0.2.2           
##  [13] uwot_0.2.2                mgcv_1.9-1               
##  [15] GetoptLong_1.0.5          GGally_2.2.1             
##  [17] prettyunits_1.2.0         gridExtra_2.3            
##  [19] progressr_0.14.0          cli_3.6.3                
##  [21] spatstat.explore_3.2-7    fastDummies_1.7.3        
##  [23] sass_0.4.9                mvtnorm_1.2-5            
##  [25] spatstat.data_3.1-2       blme_1.0-5               
##  [27] ggridges_0.5.6            pbapply_1.7-2            
##  [29] systemfonts_1.1.0         R.utils_2.12.3           
##  [31] svglite_2.1.3             scater_1.32.0            
##  [33] parallelly_1.37.1         limma_3.60.0             
##  [35] rstudioapi_0.16.0         generics_0.1.3           
##  [37] shape_1.4.6.1             gtools_3.9.5             
##  [39] ica_1.0-3                 spatstat.random_3.2-3    
##  [41] dplyr_1.1.4               Matrix_1.7-0             
##  [43] ggbeeswarm_0.7.2          fansi_1.0.6              
##  [45] abind_1.4-5               R.methodsS3_1.8.2        
##  [47] lifecycle_1.0.4           yaml_2.3.9               
##  [49] edgeR_4.2.0               gplots_3.1.3.1           
##  [51] SparseArray_1.4.3         Rtsne_0.17               
##  [53] grid_4.4.1                promises_1.3.0           
##  [55] crayon_1.5.3              miniUI_0.1.1.1           
##  [57] lattice_0.22-6            echarts4r_0.4.5          
##  [59] beachmat_2.20.0           cowplot_1.1.3            
##  [61] pillar_1.9.0              knitr_1.48               
##  [63] ComplexHeatmap_2.20.0     rjson_0.2.21             
##  [65] boot_1.3-30               corpcor_1.6.10           
##  [67] future.apply_1.11.2       codetools_0.2-20         
##  [69] leiden_0.4.3.1            glue_1.7.0               
##  [71] data.table_1.15.4         vctrs_0.6.5              
##  [73] png_0.1-8                 spam_2.10-0              
##  [75] Rdpack_2.6                gtable_0.3.5             
##  [77] cachem_1.1.0              xfun_0.45                
##  [79] rbibutils_2.2.16          S4Arrays_1.4.0           
##  [81] mime_0.12                 survival_3.7-0           
##  [83] iterators_1.0.14          statmod_1.5.0            
##  [85] fitdistrplus_1.1-11       ROCR_1.0-11              
##  [87] nlme_3.1-165              pbkrtest_0.5.3           
##  [89] bit64_4.0.5               EnvStats_2.8.1           
##  [91] progress_1.2.3            RcppAnnoy_0.0.22         
##  [93] bslib_0.7.0               TMB_1.9.11               
##  [95] irlba_2.3.5.1             vipor_0.4.7              
##  [97] KernSmooth_2.23-24        colorspace_2.1-0         
##  [99] tidyselect_1.2.1          bit_4.0.5                
## [101] compiler_4.4.1            BiocNeighbors_1.22.0     
## [103] xml2_1.3.6                DelayedArray_0.30.1      
## [105] plotly_4.10.4             scales_1.3.0             
## [107] caTools_1.18.2            remaCor_0.0.18           
## [109] lmtest_0.9-40             stringr_1.5.1            
## [111] digest_0.6.36             goftest_1.2-3            
## [113] spatstat.utils_3.0-5      minqa_1.2.7              
## [115] variancePartition_1.34.0  rmarkdown_2.27           
## [117] aod_1.3.3                 XVector_0.44.0           
## [119] RhpcBLASctl_0.23-42       htmltools_0.5.8.1        
## [121] pkgconfig_2.0.3           lme4_1.1-35.3            
## [123] sparseMatrixStats_1.16.0  highr_0.11               
## [125] fastmap_1.2.0             rlang_1.1.4              
## [127] GlobalOptions_0.1.2       htmlwidgets_1.6.4        
## [129] UCSC.utils_1.0.0          shiny_1.8.1.1            
## [131] DelayedMatrixStats_1.26.0 jquerylib_0.1.4          
## [133] zoo_1.8-12                jsonlite_1.8.8           
## [135] BiocParallel_1.38.0       R.oo_1.26.0              
## [137] BiocSingular_1.20.0       magrittr_2.0.3           
## [139] kableExtra_1.4.0          scuttle_1.14.0           
## [141] GenomeInfoDbData_1.2.12   dotCall64_1.1-1          
## [143] patchwork_1.2.0           munsell_0.5.1            
## [145] Rcpp_1.0.12               viridis_0.6.5            
## [147] reticulate_1.38.0         zlibbioc_1.50.0          
## [149] MASS_7.3-61               ggstats_0.6.0            
## [151] listenv_0.9.1             ggrepel_0.9.5            
## [153] deldir_2.0-4              splines_4.4.1            
## [155] tensor_1.5                hms_1.1.3                
## [157] circlize_0.4.16           locfit_1.5-9.10          
## [159] igraph_2.0.3              spatstat.geom_3.2-9      
## [161] RcppHNSW_0.6.0            reshape2_1.4.4           
## [163] ScaledMatrix_1.12.0       evaluate_0.24.0          
## [165] nloptr_2.1.1              foreach_1.5.2            
## [167] httpuv_1.6.15             RANN_2.6.1               
## [169] tidyr_1.3.1               purrr_1.0.2              
## [171] polyclip_1.10-6           future_1.33.2            
## [173] clue_0.3-65               scattermore_1.2          
## [175] ggplot2_3.5.1             rsvd_1.0.5               
## [177] broom_1.0.6               xtable_1.8-4             
## [179] fANCOVA_0.6-1             RSpectra_0.16-1          
## [181] later_1.3.2               viridisLite_0.4.2        
## [183] tibble_3.2.1              lmerTest_3.1-3           
## [185] glmmTMB_1.1.9             cluster_2.1.6            
## [187] globals_0.16.3