Single cell transcriptomes of macrophages with HIV 2024

Introduction

Description of the samples TODO.

suppressPackageStartupMessages({
  library("plyr")
  library("Seurat")
  library("hdf5r")
  library("SingleCellExperiment")
  library("parallel")
  library("stringi")
  library("beeswarm")
  library("muscat")
  library("DESeq2")
  library("mitch")
  library("vioplot")
  library("beeswarm")
})

Load data

Load the h5 matrices.

CSP1_counts ./1-CSP/outs/filtered_feature_bc_matrix.h5

GEX1_counts ./1-GEX/outs/filtered_feature_bc_matrix.h5

CSP2_counts ./2-CSP/outs/filtered_feature_bc_matrix.h5

GEX2_counts ./2-GEX/outs/filtered_feature_bc_matrix.h5

M239_N239_Alv_counts ./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5

N239Alv_react ./N239Alv-react/outs/filtered_feature_bc_matrix.h5

P239_Alv_and_MDM ./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5

CSP1_counts <- Read10X_h5("./1-CSP/outs/filtered_feature_bc_matrix.h5")
colnames(CSP1_counts) <- gsub("-1","",colnames(CSP1_counts))
dim(CSP1_counts)

## [1] 36603  1887

ncol(CSP1_counts)

## [1] 1887

summary(colSums(CSP1_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     390    1021    1530    1624    2118    5422

CSP1_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT', 'AAACCCAGTTTCCCAC' ... ]]

##                                                                          
## gene-HIV1-1 . . . . . . . .  . . 1  2 . . . . . . . . . . . . . . . . . .
## gene-HIV1-2 . . . 9 2 . . . 12 . 6 12 . 6 . . . 7 . . . 8 . . . 2 1 . . .
## MIR1302-2HG . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## FAM138A     . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## OR4F5       . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.1  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.3  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.2  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.5  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .
## AL627309.4  . . . . . . . .  . . .  . . . . . . . . . . . . . . . . . . .

GEX1_counts <- Read10X_h5("./1-GEX/outs/filtered_feature_bc_matrix.h5")
colnames(GEX1_counts) <- gsub("-1","",colnames(GEX1_counts))
ncol(GEX1_counts)

## [1] 11959

summary(colSums(GEX1_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    5508   29088   29112   46192  166924

GEX1_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCAAGAGGGCGA', 'AAACCCAAGGACAACC', 'AAACCCAAGGTAGTCG' ... ]]

##                                                                               
## HIV_LTRR      . 2 .  . . . . .   4 . . . . . . .  . . .  1 . .  . . .  . 1 . .
## HIV_LTRU5     . . . 12 . . 3 1 201 1 2 3 . . 1 5 40 1 1 11 . 5 15 1 5 37 . . 1
## HIV_Gagp17    . . .  1 . . . .  77 . . . . . . .  3 . .  2 . .  3 . 1 15 . . .
## HIV_Gagp24    . . .  . . . . .   . . . . . . . .  . . .  . . .  . . .  . . . .
## HIV_Gagp2p7   . 2 .  . . . . .   2 . . . . . . .  . . .  . . .  . . .  . . . .
## HIV_Gagp1Pol  . . .  2 . . . .   1 . . . . . . .  . . .  . . .  . . .  . . . .
## HIV_Polprot   . 1 .  2 . . . .  62 1 . 3 . . . 1  8 . .  . . .  4 . 3  2 . . 1
## HIV_Polp15p31 . . .  6 . . 1 3 120 . . 7 . . . 3 15 4 .  3 . . 11 1 6 10 . . 3
## HIV_Vif       . . .  . . . . .  12 . . . . . . .  1 . .  1 . .  . . .  . . . .
## HIV_Vpr       . . .  . . . . .   1 . . . . . . .  . . .  . . .  . . .  . . . .
##                
## HIV_LTRR      .
## HIV_LTRU5     2
## HIV_Gagp17    .
## HIV_Gagp24    .
## HIV_Gagp2p7   .
## HIV_Gagp1Pol  .
## HIV_Polprot   .
## HIV_Polp15p31 2
## HIV_Vif       .
## HIV_Vpr       .

CSP2_counts <- Read10X_h5("./2-CSP/outs/filtered_feature_bc_matrix.h5")
colnames(CSP2_counts) <- gsub("-1","",colnames(CSP2_counts))
ncol(CSP2_counts)

## [1] 6266

summary(colSums(CSP2_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   26.00   33.00   54.44   60.00  372.00

CSP2_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCAAGAGGGCGA', 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT' ... ]]

##                                                                        
## gene-HIV1-1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## gene-HIV1-2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## MIR1302-2HG . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## FAM138A     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## OR4F5       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.1  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.3  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.2  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.5  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## AL627309.4  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

GEX2_counts <- Read10X_h5("./2-GEX/outs/filtered_feature_bc_matrix.h5")
colnames(GEX2_counts) <- gsub("-1","",colnames(GEX2_counts))
ncol(GEX2_counts)

## [1] 2714

summary(colSums(GEX2_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     717   12870   52452   55604   87448  265374

GEX2_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCAAGTGCAACG', 'AAACCCACAATGACCT', 'AAACCCAGTGACATCT' ... ]]

##                                                                              
## HIV_LTRR      . . . . . .  12  4 . . . . .   6 .  . .   7   4 .   2 . . . . .
## HIV_LTRU5     . 2 . 1 4 . 320 78 2 . . . . 227 . 35 . 299 361 . 198 . . . . .
## HIV_Gagp17    . . . . . . 175  3 . . . . .  73 . 21 .  91 121 2  66 . . . . .
## HIV_Gagp24    . . . . . .   .  . . . . . .   . .  . .   .   . .   . . . . . .
## HIV_Gagp2p7   . . . . . .   3  . . . . . .   7 .  2 .   3   6 .   7 . . . . .
## HIV_Gagp1Pol  . . . . . .   9  . . . . . .   2 .  1 .   5   7 .   1 . . . . .
## HIV_Polprot   . 2 . 1 1 . 134  4 2 . . . 2  90 2 28 .  83 171 .  72 . . . 1 .
## HIV_Polp15p31 . . . 5 . . 273  1 1 . . . . 129 2 47 . 234 385 . 133 1 . . 4 .
## HIV_Vif       . . . 2 . .  19  2 . . . . 2   9 .  2 .   9  29 .  11 . . . . .
## HIV_Vpr       . . . . . .   .  . . . . . .   . .  . .   2   . .   2 . . . . .
##                          
## HIV_LTRR       1 .   3  2
## HIV_LTRU5     18 1 239 20
## HIV_Gagp17     1 .  52  4
## HIV_Gagp24     . .   .  .
## HIV_Gagp2p7    . .   .  .
## HIV_Gagp1Pol   . .   3  .
## HIV_Polprot   10 1  43  6
## HIV_Polp15p31 24 . 176 29
## HIV_Vif        2 .  11  .
## HIV_Vpr        . .   .  .

M239_N239_Alv_counts <- Read10X_h5("./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5")
colnames(M239_N239_Alv_counts) <- gsub("-1","",colnames(M239_N239_Alv_counts))
ncol(M239_N239_Alv_counts)

## [1] 10100

summary(colSums(M239_N239_Alv_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     501   17451   29598   29818   40450  135607

M239_N239_Alv_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCACACTGAATC', 'AAACCCACAGTCAGAG', 'AAACCCAGTACCCAGC' ... ]]

##                                                                               
## HIV_LTRR        1 . 1 .  25 . . . . . .   37 . . 2 . . . . . . . . . . . . . .
## HIV_LTRU5     141 1 . . 398 1 . . . . 1 1433 2 . . . 1 . . . . 1 . 3 4 2 1 1 3
## HIV_Gagp17     31 . . .  62 . . . 1 . .  139 . 1 . . . . . . . . . . . . . . .
## HIV_Gagp24      . . . .   . . . . . . .    . . . . . . . . . . . . . . . . . .
## HIV_Gagp2p7     . . . .   4 . . . . . .    7 . . . . . . . . . . . . . . . . .
## HIV_Gagp1Pol    1 . . .   8 . . . . . .   12 . 1 . . . . . . . . . . . . . . .
## HIV_Polprot    19 . . .  70 . . . . . .  297 1 . . . 1 . . . . 1 . . . . . . 1
## HIV_Polp15p31  34 . 1 . 253 . . . 2 . .  426 . . . . . . . 1 . . 1 1 3 . . . 1
## HIV_Vif         2 . . .  19 . . . . . .   33 . . . . . . . . . . . . . . . . .
## HIV_Vpr         1 . . .   3 . . . . . .    6 . . . . . . . . . . . . . . . . .
##                
## HIV_LTRR      .
## HIV_LTRU5     .
## HIV_Gagp17    .
## HIV_Gagp24    .
## HIV_Gagp2p7   .
## HIV_Gagp1Pol  .
## HIV_Polprot   .
## HIV_Polp15p31 .
## HIV_Vif       .
## HIV_Vpr       .

N239Alv_react_counts <- Read10X_h5("./N239Alv-react/outs/filtered_feature_bc_matrix.h5")
colnames(N239Alv_react_counts) <- gsub("-1","",colnames(N239Alv_react_counts))
ncol(N239Alv_react_counts)

## [1] 3098

summary(colSums(N239Alv_react_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     541   25100   33557   32747   41776   93128

N239Alv_react_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCATCTCGCTCA', 'AAACGAACAGTGAGCA', 'AAACGAATCACTGGTA' ... ]]

##                                                                          
## HIV_LTRR      . . . . . . . . . . . 1 . . . . . . . . . . . . . . . . . .
## HIV_LTRU5     . . . . 1 . . . . . . 1 . . . . . . . . 1 . . . . . . . . .
## HIV_Gagp17    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Gagp24    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Gagp2p7   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Gagp1Pol  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Polprot   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Polp15p31 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Vif       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## HIV_Vpr       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

P239_Alv_MDM_counts <- Read10X_h5("./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5")
colnames(P239_Alv_MDM_counts) <- gsub("-1","",colnames(P239_Alv_MDM_counts))
ncol(P239_Alv_MDM_counts)

## [1] 9004

summary(colSums(P239_Alv_MDM_counts))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     506   15079   22409   24010   31810  120619

P239_Alv_MDM_counts[1:10,1:30]

## 10 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCCACACAATGAA', 'AAACCCACACGCCACA', 'AAACCCACACGCGTGT' ... ]]

##                                                                                
## HIV_LTRR      . . . . . . . . .   5 .  1 .  1 .  12 . . .  1  . . . .   5 . . .
## HIV_LTRU5     . . . 2 . . 2 . 4 233 . 53 3 27 5 239 1 3 . 31 21 . . 1 150 1 . 1
## HIV_Gagp17    . . . . . . . . .  59 .  3 .  8 .  67 . . .  3  7 . . 2  19 . . .
## HIV_Gagp24    . . . . . . . . .   . .  . .  . .   . . . .  .  . . . .   . . . .
## HIV_Gagp2p7   . . . . . . . . .   2 .  . .  . .   1 . . .  .  . . . .   3 . . .
## HIV_Gagp1Pol  . . . . . . . . .   2 .  . .  1 .   2 . . .  1  . . . .   2 . . .
## HIV_Polprot   . . . . . . . . .  15 .  3 .  3 .  39 . . . 10  1 . . .  26 . . 1
## HIV_Polp15p31 . . . . . . 1 . .  21 .  1 . 15 1  52 . . . 23  . . . .  70 1 . .
## HIV_Vif       . . . . . . . . .   1 .  . .  1 .   1 . . .  4  1 . . 1   5 . . .
## HIV_Vpr       . . . . . . . . .   1 .  . .  . .   . . . .  1  . . . .   1 . . .
##                  
## HIV_LTRR      . .
## HIV_LTRU5     2 .
## HIV_Gagp17    . .
## HIV_Gagp24    . .
## HIV_Gagp2p7   . .
## HIV_Gagp1Pol  . .
## HIV_Polprot   . .
## HIV_Polp15p31 . .
## HIV_Vif       . .
## HIV_Vpr       . .

CSP1_counts ./1-CSP/outs/filtered_feature_bc_matrix.h5

GEX1_counts ./1-GEX/outs/filtered_feature_bc_matrix.h5

CSP2_counts ./2-CSP/outs/filtered_feature_bc_matrix.h5

GEX2_counts ./2-GEX/outs/filtered_feature_bc_matrix.h5

M239_N239_Alv_counts ./M239-N239-Alv/outs/filtered_feature_bc_matrix.h5

N239Alv_react_counts ./N239Alv-react/outs/filtered_feature_bc_matrix.h5

P239_Alv_MDM_counts ./P239-Alv-and-MDM/outs/filtered_feature_bc_matrix.h5

if (! file.exists("CSP1_cell_barcodes.txt") ) {
  CSP1_bc <- colnames(CSP1_counts)
  writeLines(CSP1_bc,con="CSP1_cell_barcodes.txt")
}

if (! file.exists("GEX1_cell_barcodes.txt") ) {
  GEX1_bc <- colnames(GEX1_counts)
  writeLines(GEX1_bc,con="GEX1_cell_barcodes.txt")
}

if (! file.exists("CSP2_cell_barcodes.txt") ) {
  CSP2_bc <- colnames(CSP2_counts)
  writeLines(CSP2_bc,con="CSP2_cell_barcodes.txt")
}

if (! file.exists("GEX2_cell_barcodes.txt") ) {
  GEX2_bc <- colnames(GEX2_counts)
  writeLines(GEX2_bc,con="GEX2_cell_barcodes.txt")
}

if (! file.exists("M239_N239_Alv_cell_barcodes.txt") ) {
  M239_N239_Alv_bc <- colnames(M239_N239_Alv_counts)
  writeLines(M239_N239_Alv_bc,con="M239_N239_Alv_cell_barcodes.txt")
}

if (! file.exists("N239Alv_react_cell_barcodes.txt") ) {
  N239Alv_react_bc <- colnames(N239Alv_react_counts)
  writeLines(N239Alv_react_bc,con="N239Alv_react_cell_barcodes.txt")
}

if (! file.exists("P239_Alv_and_MDM_cell_barcodes.txt") ) {
  P239_Alv_and_MDM_bc <- colnames(P239_Alv_MDM_counts)
  writeLines(P239_Alv_and_MDM_bc,con="P239_Alv_and_MDM_cell_barcodes.txt")
}

Now read HTO data.

gex1_hto <- Read10X("1-CSP_hto/read_count/", gene.column=1)
#gex1_hto_trash <- Read10X("trash/1-CSP_hto/read_count/", gene.column=1)
summary(colSums(gex1_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      16    3580    7222    9093   12782   51772

dim(gex1_hto)

## [1]     7 11266

gex2_hto <- Read10X("2-CSP_hto/read_count/", gene.column=1)
summary(colSums(gex2_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      27    4110   13737   15216   24504   64861

dim(gex2_hto)

## [1]    7 1689

M239_N239_Alv_hto <- Read10X("M239-N239-Alv_hto/read_count/", gene.column=1)
summary(colSums(M239_N239_Alv_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1420    2897    4551    6469   40064

dim(M239_N239_Alv_hto)

## [1]     7 10083

P239_Alv_MDM_hto <- Read10X("P239-Alv-and-MDM_hto/read_count/", gene.column=1)
summary(colSums(P239_Alv_MDM_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2072    5122    6850   10136   53637

dim(P239_Alv_MDM_hto)

## [1]    7 8872

str(which(colnames(GEX1_counts) %in% colnames(gex1_hto)))

##  int [1:11236] 1 2 3 4 5 6 7 8 9 10 ...

str(which(colnames(GEX2_counts) %in% colnames(gex2_hto)))

##  int [1:1686] 3 4 5 6 7 8 9 10 11 12 ...

str(which(colnames(M239_N239_Alv_counts) %in% colnames(M239_N239_Alv_hto)))

##  int [1:10079] 1 2 3 4 5 6 7 8 9 10 ...

str(which(colnames(P239_Alv_MDM_counts) %in% colnames(P239_Alv_MDM_hto)))

##  int [1:8871] 1 2 3 5 6 7 8 9 10 11 ...

Remove low HTO counts

summary(colSums(gex1_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      16    3580    7222    9093   12782   51772

table(colSums(gex1_hto)>=100)

## 
## FALSE  TRUE 
##   213 11053

gex1_hto <- gex1_hto[,which(colSums(gex1_hto)>=100)]
summary(colSums(gex1_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     101    3764    7356    9267   12923   51772

summary(colSums(gex2_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      27    4110   13737   15216   24504   64861

table(colSums(gex2_hto)>=100)

## 
## FALSE  TRUE 
##     1  1688

gex2_hto <- gex2_hto[,which(colSums(gex2_hto)>=100)]
summary(colSums(gex2_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     145    4110   13746   15225   24512   64861

summary(colSums(M239_N239_Alv_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1420    2897    4551    6469   40064

table(colSums(M239_N239_Alv_hto)>=100)

## 
## FALSE  TRUE 
##   308  9775

M239_N239_Alv_hto <- M239_N239_Alv_hto[,which(colSums(M239_N239_Alv_hto)>=100)]
summary(colSums(M239_N239_Alv_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     104    1530    3039    4694    6620   40064

summary(colSums(P239_Alv_MDM_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2072    5122    6850   10136   53637

table(colSums(P239_Alv_MDM_hto)>=100)

## 
## FALSE  TRUE 
##   242  8630

P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,which(colSums(P239_Alv_MDM_hto)>=100)]
summary(colSums(P239_Alv_MDM_hto))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     105    2253    5345    7042   10302   53637

Look at the proportion unmapped

summary(apply(gex1_hto,2,function(x) {x[6]/sum(x) } ) )

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 8.014e-05 9.345e-05 7.634e-03

summary(apply(gex2_hto,2,function(x) {x[6]/sum(x) } ) )

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 8.596e-05 7.020e-05 1.285e-02

summary(apply(M239_N239_Alv_hto,2,function(x) {x[6]/sum(x) } ) )

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001816 0.031254 0.085117 0.304038 0.656792 0.976466

summary(apply(P239_Alv_MDM_hto,2,function(x) {x[6]/sum(x) } ) )

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0009302 0.0228943 0.0751835 0.3774595 0.9449641 0.9855072

Identify ambiguous HTO counts

For each cell barcode, calculate the ratio of top BC to 2nd BC.

getratio <- function(mx){
  res <- lapply(1:ncol(mx), function(i) {
    cnt <- mx[,i]
    top1 <- cnt[order(-cnt)][1]+1
    top2 <- cnt[order(-cnt)][2]+1
    top1/top2
  })
  return(unlist(res))
}

gex1_hto_ratio <- getratio(gex1_hto)
summary(unlist(gex1_hto_ratio))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.569  13.970  18.889  30.167 287.357

table(gex1_hto_ratio>3)

## 
## FALSE  TRUE 
##  1527  9526

gex1_hto <- gex1_hto[,which(gex1_hto_ratio>3)]

gex2_hto_ratio <- getratio(gex2_hto)
summary(unlist(gex2_hto_ratio))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.643  23.788  22.126  35.101  60.985

table(gex2_hto_ratio>3)

## 
## FALSE  TRUE 
##   201  1487

gex2_hto <- gex2_hto[,which(gex2_hto_ratio>3)]

M239_N239_Alv_hto_ratio <- getratio(M239_N239_Alv_hto)
summary(unlist(M239_N239_Alv_hto_ratio))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.849  18.830  19.447  31.135  81.322

table(M239_N239_Alv_hto_ratio>3)

## 
## FALSE  TRUE 
##  1492  8283

M239_N239_Alv_hto <- M239_N239_Alv_hto[,which(M239_N239_Alv_hto_ratio>3)]

P239_Alv_MDM_hto_ratio <- getratio(P239_Alv_MDM_hto)
summary(unlist(P239_Alv_MDM_hto_ratio))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.70   29.37   25.73   33.78   95.67

table(P239_Alv_MDM_hto_ratio>3)

## 
## FALSE  TRUE 
##   591  8039

P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,which(P239_Alv_MDM_hto_ratio>3)]

Match

gex1_itx <- intersect(colnames(GEX1_counts),colnames(gex1_hto))
dim(GEX1_counts)

## [1] 36622 11959

GEX1_counts <- GEX1_counts[,gex1_itx]
dim(GEX1_counts)

## [1] 36622  9504

gex1_hto <- gex1_hto[,gex1_itx]
dim(gex1_hto)

## [1]    7 9504

gex2_itx <- intersect(colnames(GEX2_counts),colnames(gex2_hto))
dim(GEX2_counts)

## [1] 36622  2714

GEX2_counts <- GEX2_counts[,gex2_itx]
dim(GEX2_counts)

## [1] 36622  1484

gex2_hto <- gex2_hto[,gex2_itx]
dim(gex2_hto)

## [1]    7 1484

M239_N239_Alv_itx <- intersect(colnames(M239_N239_Alv_counts),colnames(M239_N239_Alv_hto))
dim(M239_N239_Alv_counts)

## [1] 36622 10100

M239_N239_Alv_counts <- M239_N239_Alv_counts[,M239_N239_Alv_itx]
dim(M239_N239_Alv_counts)

## [1] 36622  8281

M239_N239_Alv_hto <- M239_N239_Alv_hto[,M239_N239_Alv_itx]
dim(M239_N239_Alv_hto)

## [1]    7 8281

P239_Alv_MDM_itx <- intersect(colnames(P239_Alv_MDM_counts),colnames(P239_Alv_MDM_hto))
dim(P239_Alv_MDM_counts)

## [1] 36622  9004

P239_Alv_MDM_counts <- P239_Alv_MDM_counts[,P239_Alv_MDM_itx]
dim(P239_Alv_MDM_counts)

## [1] 36622  8039

P239_Alv_MDM_hto <- P239_Alv_MDM_hto[,P239_Alv_MDM_itx]
dim(P239_Alv_MDM_hto)

## [1]    7 8039

Get demux result

Hash tagging sheet.

Library	HTO	Sample
gex1	1	mock (donor P and V)
gex1	2	gfp_neg (donor P and V)
gex1	4	gfp_pos (donor P and V)
gex2	1	mock donor O
gex2	2	gfp_neg donor O
gex2	4	gfp_pos donor O
M239_N239_Alv	1	AlvM239_mock
M239_N239_Alv	2	AlvM239_gfp_neg
M239_N239_Alv	3	AlvM239_gfp_pos
M239_N239_Alv	4	AlvN239_mock
M239_N239_Alv	5	AlvN239_gfp_neg
M239_N239_Alv	6	AlvN239_gfp_pos
P239_Alv_MDM_hto	1	MDM_mock
P239_Alv_MDM_hto	2	MDM_gfp_neg
P239_Alv_MDM_hto	3	MDM_gfp_pos
P239_Alv_MDM_hto	4	Alv_mock
P239_Alv_MDM_hto	5	Alv_gfp_neg
P239_Alv_MDM_hto	6	Alv_gfp_pos

Input counts:

GEX1_counts gex1_hto
GEX2_counts gex2_hto
M239_N239_Alv M239_N239_Alv_hto
P239_Alv_MDM P239_Alv_MDM_hto

table(apply(gex1_hto,2,function(x) { order(-x) } )[1,] )

## 
##    1    2    4    7 
## 1532 1448 6014  510

gex1_idx <- apply(gex1_hto,2,function(x) { order(-x) } )[1,]
gex1_h1 <- GEX1_counts[,which(gex1_idx==1)] # mock (donor P and V)
gex1_h2 <- GEX1_counts[,which(gex1_idx==2)] # gfp_neg (donor P and V) 
gex1_h4 <- GEX1_counts[,which(gex1_idx==4)] # gfp_pos (donor P and V)

table(apply(gex2_hto,2,function(x) { order(-x) } )[1,] )

## 
##   1   2   4   7 
## 177 420 886   1

gex2_idx <- apply(gex2_hto,2,function(x) { order(-x) } )[1,]
gex2_h1 <- GEX2_counts[,which(gex2_idx==1)] # mock donor O
gex2_h2 <- GEX2_counts[,which(gex2_idx==2)] # gfp_neg donor O 
gex2_h4 <- GEX2_counts[,which(gex2_idx==4)] # gfp_pos donor O

table(apply(M239_N239_Alv_hto,2,function(x) { order(-x) } )[1,] )

## 
##    1    2    3    4    5    6 
##  804  620 2335 1354  713 2455

M239_N239_Alv_idx <- apply(M239_N239_Alv_hto,2,function(x) { order(-x) } )[1,]
M239_N239_Alv_h1 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==1)] # AlvM239_mock
M239_N239_Alv_h2 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==2)] # AlvM239_gfp_neg
M239_N239_Alv_h3 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==3)] # AlvM239_gfp_pos
M239_N239_Alv_h4 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==4)] # AlvN239_mock
M239_N239_Alv_h5 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==5)] # AlvN239_gfp_neg        
M239_N239_Alv_h6 <- M239_N239_Alv_counts[,which(M239_N239_Alv_idx==6)] # AlvN239_gfp_pos     

table(apply(P239_Alv_MDM_hto,2,function(x) { order(-x) } )[1,] )

## 
##    1    2    3    4    5    6 
##  815  436 1811  974  945 3058

P239_Alv_MDM_idx <- apply(P239_Alv_MDM_hto,2,function(x) { order(-x) } )[1,]
P239_Alv_MDM_h1 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==1)] # MDM_mock
P239_Alv_MDM_h2 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==2)] # MDM_gfp_neg
P239_Alv_MDM_h3 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==3)] # MDM_gfp_pos
P239_Alv_MDM_h4 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==4)] # Alv_mock
P239_Alv_MDM_h5 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==5)] # Alv_gfp_neg
P239_Alv_MDM_h6 <- P239_Alv_MDM_counts[,which(P239_Alv_MDM_idx==6)] # Alv_gfp_pos

Distinguish SNP pooled samples

Data was previously genotyped and called using cellSNP-lite and vireo.

snp <- read.table("1-GEX_vireo/donor_ids.tsv",header=TRUE)

donor0 <- subset(snp,donor_id=="donor0")$cell
donor0 <- gsub("-1","",donor0)

donor1 <- subset(snp,donor_id=="donor1")$cell
donor1 <- gsub("-1","",donor1)

gex1_h1_d0 <- gex1_h1[,colnames(gex1_h1) %in% donor0]
dim(gex1_h1_d0)

## [1] 36622   796

gex1_h1_d1 <- gex1_h1[,colnames(gex1_h1) %in% donor1]
dim(gex1_h1_d1)

## [1] 36622   659

gex1_h2_d0 <- gex1_h2[,colnames(gex1_h2) %in% donor0]
dim(gex1_h2_d0)

## [1] 36622   880

gex1_h2_d1 <- gex1_h2[,colnames(gex1_h2) %in% donor1]
dim(gex1_h2_d1)

## [1] 36622   479

gex1_h4_d0 <- gex1_h4[,colnames(gex1_h4) %in% donor0]
dim(gex1_h4_d0)

## [1] 36622  2660

gex1_h4_d1 <- gex1_h4[,colnames(gex1_h4) %in% donor1]
dim(gex1_h4_d1)

## [1] 36622  2584

Which is female or male?

Let’s look at the expression of genes on sex chromosomes.

BTW here’s the bash script to get the chr2gene tsv file:

zcat genes.gtf.gz \
| grep ^chr \
| cut -f1,9 \
| sed 's/; /\n/g' \
| egrep  '(chr|gene_name)' \
| sed 's/gene_name "//' \
| tr -d '"' \
| cut -f1 \
| paste - - \
| uniq \
| sort -u > gene_chr.tsv

gene_chr <- read.table("ref_combined1/hum_hiv/genes/gene_chr.tsv")
head(gene_chr)

##      V1         V2
## 1 chr10       A1CF
## 2 chr10      ABCC2
## 3 chr10       ABI1
## 4 chr10     ABLIM1
## 5 chr10   ABRAXAS2
## 6 chr10 AC005383.1

chrx <- subset(gene_chr,V1=="chrX")$V2
chry <- subset(gene_chr,V1=="chrY")$V2
chrm <- subset(gene_chr,V1=="chrM")$V2
chra <- gene_chr$V2[!gene_chr$V2 %in% c(chrx,chry,chrm)]
str(gene_chr)

## 'data.frame':    36562 obs. of  2 variables:
##  $ V1: chr  "chr10" "chr10" "chr10" "chr10" ...
##  $ V2: chr  "A1CF" "ABCC2" "ABI1" "ABLIM1" ...

str(chrx)

##  chr [1:1146] "ABCB7" "ABCD1" "AC000113.1" "AC002072.1" "AC002368.1" ...

str(chry)

##  chr [1:111] "AC006040.1" "AC006157.1" "AC007244.1" "AC007359.1" ...

str(chrm)

##  chr [1:13] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" ...

d0y <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chry),]))
d0x <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chrx),]))
d0m <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chrm),]))
d0a <- median(colSums(gex1_h4_d0[which(rownames(gex1_h4_d0) %in% chra),]))

d1y <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chry),]))
d1x <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chrx),]))
d1m <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chrm),]))
d1a <- median(colSums(gex1_h4_d1[which(rownames(gex1_h4_d1) %in% chra),]))

message(paste("Donor 0 chrY:",signif(d0y/d0a * 100,3),"%"))

## Donor 0 chrY: 0 %

message(paste("Donor 0 chrX:",signif(d0x/d0a * 100,3),"%"))

## Donor 0 chrX: 3.33 %

message(paste("Donor 0 chrM:",signif(d0m/d0a * 100,3),"%"))

## Donor 0 chrM: 5.7 %

message(paste("Donor 1 chrY:",signif(d1y/d1a * 100,3),"%"))

## Donor 1 chrY: 0.068 %

message(paste("Donor 1 chrX:",signif(d1x/d1a * 100,3),"%"))

## Donor 1 chrX: 3.53 %

message(paste("Donor 1 chrM:",signif(d1m/d1a * 100,3),"%"))

## Donor 1 chrM: 11.2 %

Therefore donor 0 is female and 1 is male.

Check HIV in these

HIV genes are rows 1 to 21.

hiv <- mean(colSums(gex1_h1_d0[1:21,])) / mean(colSums(gex1_h1_d0[22:nrow(gex1_h1_d0),]))
message(paste("Mock Donor 0 HIV:",signif(hiv,3),"%"))

## Mock Donor 0 HIV: 5.8e-05 %

hiv <- mean(colSums(gex1_h1_d1[1:21,])) / mean(colSums(gex1_h1_d1[22:nrow(gex1_h1_d1),]))
message(paste("Mock Donor 1 HIV:",signif(hiv,3),"%"))

## Mock Donor 1 HIV: 0.000114 %

hiv <- mean(colSums(gex1_h2_d0[1:21,])) / mean(colSums(gex1_h2_d0[22:nrow(gex1_h2_d0),]))
message(paste("GFP- Donor 0 HIV:",signif(hiv,3),"%"))

## GFP- Donor 0 HIV: 0.0106 %

hiv <- mean(colSums(gex1_h2_d1[1:21,])) / mean(colSums(gex1_h2_d1[22:nrow(gex1_h2_d1),]))
message(paste("GFP- Donor 1 HIV:",signif(hiv,3),"%"))

## GFP- Donor 1 HIV: 0.00992 %

hiv <- mean(colSums(gex1_h4_d0[1:21,])) / mean(colSums(gex1_h4_d0[22:nrow(gex1_h4_d0),]))
message(paste("GFP+ Donor 0 HIV:",signif(hiv,3),"%"))

## GFP+ Donor 0 HIV: 0.00031 %

hiv <- mean(colSums(gex1_h4_d1[1:21,])) / mean(colSums(gex1_h4_d1[22:nrow(gex1_h4_d1),]))
message(paste("GFP+ Donor 1 HIV:",signif(hiv,3),"%"))

## GFP+ Donor 1 HIV: 0.000366 %

hiv <- mean(colSums(gex2_h1[1:21,])) / mean(colSums(gex2_h1[22:nrow(gex2_h1),]))
message(paste("Mock Donor 2 HIV:",signif(hiv,3),"%"))

## Mock Donor 2 HIV: 5.64e-05 %

hiv <- mean(colSums(gex2_h2[1:21,])) / mean(colSums(gex2_h2[22:nrow(gex2_h2),]))
message(paste("GFP- Donor 2 HIV:",signif(hiv,3),"%"))

## GFP- Donor 2 HIV: 0.0146 %

hiv <- mean(colSums(gex2_h4[1:21,])) / mean(colSums(gex2_h4[22:nrow(gex2_h4),]))
message(paste("GFP+ Donor 2 HIV:",signif(hiv,3),"%"))

## GFP+ Donor 2 HIV: 0.000417 %

hiv <- mean(colSums(P239_Alv_MDM_h1[1:21,])) / mean(colSums(P239_Alv_MDM_h1[22:nrow(P239_Alv_MDM_h1),]))
message(paste("Mock Donor 3 HIV:",signif(hiv,3),"%"))

## Mock Donor 3 HIV: 0.000111 %

hiv <- mean(colSums(P239_Alv_MDM_h2[1:21,])) / mean(colSums(P239_Alv_MDM_h2[22:nrow(P239_Alv_MDM_h2),]))
message(paste("GFP+ Donor 3 HIV:",signif(hiv,3),"%"))

## GFP+ Donor 3 HIV: 0.00913 %

hiv <- mean(colSums(P239_Alv_MDM_h3[1:21,])) / mean(colSums(P239_Alv_MDM_h3[22:nrow(P239_Alv_MDM_h3),]))
message(paste("GPF- Donor 3 HIV:",signif(hiv,3),"%"))

## GPF- Donor 3 HIV: 0.000148 %

hiv <- mean(colSums(P239_Alv_MDM_h4[1:21,])) / mean(colSums(P239_Alv_MDM_h4[22:nrow(P239_Alv_MDM_h4),]))
message(paste("Mock Donor 4 HIV:",signif(hiv,3),"%"))

## Mock Donor 4 HIV: 7.51e-05 %

hiv <- mean(colSums(P239_Alv_MDM_h5[1:21,])) / mean(colSums(P239_Alv_MDM_h5[22:nrow(P239_Alv_MDM_h5),]))
message(paste("GFP+ Donor 4 HIV:",signif(hiv,3),"%"))

## GFP+ Donor 4 HIV: 0.012 %

hiv <- mean(colSums(P239_Alv_MDM_h6[1:21,])) / mean(colSums(P239_Alv_MDM_h6[22:nrow(P239_Alv_MDM_h6),]))
message(paste("GFP- Donor 4 HIV:",signif(hiv,3),"%"))

## GFP- Donor 4 HIV: 0.000428 %

hiv <- mean(colSums(M239_N239_Alv_h1[1:21,])) / mean(colSums(M239_N239_Alv_h1[22:nrow(M239_N239_Alv_h1),]))
message(paste("Mock Donor 5 HIV:",signif(hiv,3),"%"))

## Mock Donor 5 HIV: 0.000123 %

hiv <- mean(colSums(M239_N239_Alv_h2[1:21,])) / mean(colSums(M239_N239_Alv_h2[22:nrow(M239_N239_Alv_h2),]))
message(paste("Mock Donor 5 HIV:",signif(hiv,3),"%"))

## Mock Donor 5 HIV: 0.0125 %

Yes it looks like GFP minus and positive were swapped

Renaming things

From now on we will rename them: “mock”, “latent”, “active”

mock0 -> mdm_mock1 mock1 -> mdm_mock2 mock2 -> mdm_mock3 mock3 -> mdm_mock4

mock4 -> alv_mock1 mock5 -> alv_mock3 mock6 -> alv_mock4

active0 -> mdm_active1 active1 -> mdm_active2 active2 -> mdm_active3 active3 -> mdm_active4

active4 -> alv_active1 active5 -> alv_active2 active6 -> alv_active3

latent0 -> mdm_latent1 latent1 -> mdm_latent2 latent2 -> mdm_latent3 latent3 -> mdm_latent4

latent4 -> alv_latent1 latent5 -> alv_latent2 latent6 -> alv_latent3

bystander0 -> mdm_bystander1 bystander1 -> mdm_bystander2 bystander2 -> mdm_bystander3 bystander3 -> mdm_bystander4

bystander4 -> alv_bystander1 bystander5 -> alv_bystander2 bystander6 -> alv_bystander3

mdm_mock1 <- gex1_h1_d0 #MDMP236 MDMmock
mdm_mock2 <- gex1_h1_d1 #MDMV236 MDMmock
mdm_mock3 <- gex2_h1 #MDMO236 MDMmock
mdm_mock4 <- P239_Alv_MDM_h1 #MDMP239 MDMmock

alv_mock1 <- P239_Alv_MDM_h4 #AlvP239 AlvMDMmock
alv_mock2 <- M239_N239_Alv_h1 #AlvM239 AlvMDMmock
alv_mock3 <- M239_N239_Alv_h4 #AlvN239 AlvMDMmock

mdm_active1 <- gex1_h2_d0 #MDMP236 MDMactive
mdm_active2 <- gex1_h2_d1 #MDMV236 MDMactive
mdm_active3 <- gex2_h2 #MDMO236 MDMactive
mdm_active4 <- P239_Alv_MDM_h2 #MDMP239 MDMactive

alv_active1 <- P239_Alv_MDM_h5 #AlvP239 AlvMDMactive
alv_active2 <- M239_N239_Alv_h2 #AlvM239 AlvMDMactive
alv_active3 <- M239_N239_Alv_h5 #AlvN239 AlvMDMactive

mdm_gfpneg1 <- colSums(gex1_h4_d0[1:21,])/colSums(gex1_h4_d0)*1e6
mdm_gfpneg2 <- colSums(gex1_h4_d1[1:21,])/colSums(gex1_h4_d1)*1e6
mdm_gfpneg3 <- colSums(gex2_h4[1:21,])/colSums(gex2_h4)*1e6
mdm_gfpneg4 <- colSums(P239_Alv_MDM_h3[1:21,])/colSums(P239_Alv_MDM_h3)*1e6

alv_gfpneg1 <- colSums(P239_Alv_MDM_h6[1:21,])/colSums(P239_Alv_MDM_h6)*1e6
alv_gfpneg2 <- colSums(M239_N239_Alv_h3[1:21,])/colSums(M239_N239_Alv_h3)*1e6
alv_gfpneg3 <- colSums(M239_N239_Alv_h6[1:21,])/colSums(M239_N239_Alv_h6)*1e6

gfp <- list("mdm_mock1"=colSums(mdm_mock1[1:21,])/colSums(mdm_mock1)*1e6 ,
  "mdm_mock2"=colSums(mdm_mock2[1:21,])/colSums(mdm_mock2)*1e6 ,
  "mdm_mock3"=colSums(mdm_mock3[1:21,])/colSums(mdm_mock3)*1e6 ,
  "mdm_mock4"=colSums(mdm_mock4[1:21,])/colSums(mdm_mock4)*1e6 ,
  "alv_mock1"=colSums(alv_mock1[1:21,])/colSums(alv_mock1)*1e6 ,
  "alv_mock2"=colSums(alv_mock2[1:21,])/colSums(alv_mock2)*1e6 ,
  "alv_mock3"=colSums(alv_mock3[1:21,])/colSums(alv_mock3)*1e6 ,
  "mdm_active1"=colSums(mdm_active1[1:21,])/colSums(mdm_active1)*1e6,
  "mdm_active2"=colSums(mdm_active2[1:21,])/colSums(mdm_active2)*1e6,
  "mdm_active3"=colSums(mdm_active3[1:21,])/colSums(mdm_active3)*1e6,
  "mdm_active4"=colSums(mdm_active4[1:21,])/colSums(mdm_active4)*1e6,
  "alv_active1"=colSums(alv_active1[1:21,])/colSums(alv_active1)*1e6,
  "alv_active2"=colSums(alv_active2[1:21,])/colSums(alv_active2)*1e6,
  "alv_active3"=colSums(alv_active3[1:21,])/colSums(alv_active3)*1e6,
  "mdm_gfpneg1"=colSums(gex1_h4_d0[1:21,])/colSums(gex1_h4_d0)*1e6,
  "mdm_gfpneg2"=colSums(gex1_h4_d1[1:21,])/colSums(gex1_h4_d1)*1e6,
  "mdm_gfpneg3"=colSums(gex2_h4[1:21,])/colSums(gex2_h4)*1e6,
  "mdm_gfpneg4"=colSums(P239_Alv_MDM_h3[1:21,])/colSums(P239_Alv_MDM_h3)*1e6,
  "alv_gfpneg1"=colSums(P239_Alv_MDM_h6[1:21,])/colSums(P239_Alv_MDM_h6)*1e6,
  "alv_gfpneg2"=colSums(M239_N239_Alv_h3[1:21,])/colSums(M239_N239_Alv_h3)*1e6,
  "alv_gfpneg3"=colSums(M239_N239_Alv_h6[1:21,])/colSums(M239_N239_Alv_h6)*1e6)

par(mar=c(7.1, 4.1, 4.1, 2.1))

lgfp <- lapply(gfp,function(x) { log10(x+1) } )
lapply(lgfp,summary)

## $mdm_mock1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.431   1.127   1.862   3.534 
## 
## $mdm_mock2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.370   1.019   1.839   4.059 
## 
## $mdm_mock3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.8746  1.7000  3.4063 
## 
## $mdm_mock4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.706   1.585   4.272 
## 
## $alv_mock1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7422  1.6938  3.9895 
## 
## $alv_mock2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.272   1.606   1.450   1.980   3.992 
## 
## $alv_mock3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.202   1.713   1.537   2.140   4.682 
## 
## $mdm_active1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.482   3.794   3.699   4.093   5.125 
## 
## $mdm_active2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.513   3.819   3.709   4.045   4.991 
## 
## $mdm_active3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.485   3.887   3.674   4.225   5.338 
## 
## $mdm_active4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.495   3.768   3.678   3.986   5.407 
## 
## $alv_active1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.572   3.846   3.770   4.156   5.607 
## 
## $alv_active2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.632   3.917   3.760   4.167   5.241 
## 
## $alv_active3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.436   3.801   3.676   4.106   5.518 
## 
## $mdm_gfpneg1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.765   1.587   2.386   5.120 
## 
## $mdm_gfpneg2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.846   1.625   2.325   4.748 
## 
## $mdm_gfpneg3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.676   1.487   2.421   4.792 
## 
## $mdm_gfpneg4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.459   1.067   1.873   4.591 
## 
## $alv_gfpneg1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.822   1.551   2.249   4.801 
## 
## $alv_gfpneg2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.645   1.980   1.906   2.324   5.299 
## 
## $alv_gfpneg3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.648   2.037   1.952   2.398   5.623

boxplot(lgfp,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM",main="All HIV genes")
boxplot(lgfp,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM",main="All HIV genes")
beeswarm(lgfp,add=TRUE,pch=1,cex=0.1)

lgfp2 <- lgfp[grep("neg",names(lgfp),invert=TRUE)]
boxplot(lgfp2,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM",main="All HIV genes")
boxplot(lgfp2,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM",main="All HIV genes")
beeswarm(lgfp2,add=TRUE,pch=1,cex=0.1)

#blah()
par(mar=c(8.1, 4.1, 4.1, 2.1))

vioplot(lgfp,las=2,ylab="log10+1 RPM",main="All HIV genes")

vioplot(lgfp2,las=2,ylab="log10+1 RPM",main="All HIV genes")

par(mar=c(7.1, 4.1, 4.1, 2.1))

# plot the median for groups

vgfp <- unlist(lapply(lgfp,median))
mm <- vgfp[grep("mdm_mock",names(vgfp))]
ma <- vgfp[grep("mdm_active",names(vgfp))]
am <- vgfp[grep("alv_mock",names(vgfp))]
aa <- vgfp[grep("alv_active",names(vgfp))]

bgfp <- list("MDM mock"=mm, "MDM active"=ma,
  "Alv mock"=am,"Alv active"=aa)

boxplot(bgfp)

boxplot(bgfp,col="white",cex=0,las=0,border="darkgray",ylab="log10+1 RPM",main="All HIV genes")
beeswarm(bgfp,add=TRUE,pch=19,cex=1)

nullres <- lapply(1:21, function(i) {

  gfp <- list("mdm_mock1"=mdm_mock1[i,]/colSums(mdm_mock1)*1e6 ,
    "mdm_mock2"=mdm_mock2[i,]/colSums(mdm_mock2)*1e6 ,
    "mdm_mock3"=mdm_mock3[i,]/colSums(mdm_mock3)*1e6 ,
    "mdm_mock4"=mdm_mock4[i,]/colSums(mdm_mock4)*1e6 ,
    "alv_mock1"=alv_mock1[i,]/colSums(alv_mock1)*1e6 ,
    "alv_mock2"=alv_mock2[i,]/colSums(alv_mock2)*1e6 ,
    "alv_mock3"=alv_mock3[i,]/colSums(alv_mock3)*1e6 ,
    "mdm_active1"=mdm_active1[i,]/colSums(mdm_active1)*1e6,
    "mdm_active2"=mdm_active2[i,]/colSums(mdm_active2)*1e6,
    "mdm_active3"=mdm_active3[i,]/colSums(mdm_active3)*1e6,
    "mdm_active4"=mdm_active4[i,]/colSums(mdm_active4)*1e6,
    "alv_active1"=alv_active1[i,]/colSums(alv_active1)*1e6,
    "alv_active2"=alv_active2[i,]/colSums(alv_active2)*1e6,
    "alv_active3"=alv_active3[i,]/colSums(alv_active3)*1e6,
    "mdm_gfpneg1"=gex1_h4_d0[i,]/colSums(gex1_h4_d0)*1e6,
    "mdm_gfpneg2"=gex1_h4_d1[i,]/colSums(gex1_h4_d1)*1e6,
    "mdm_gfpneg3"=gex2_h4[i,]/colSums(gex2_h4)*1e6,
    "mdm_gfpneg4"=P239_Alv_MDM_h3[i,]/colSums(P239_Alv_MDM_h3)*1e6,
    "alv_gfpneg1"=P239_Alv_MDM_h6[i,]/colSums(P239_Alv_MDM_h6)*1e6,
    "alv_gfpneg2"=M239_N239_Alv_h3[i,]/colSums(M239_N239_Alv_h3)*1e6,
    "alv_gfpneg3"=M239_N239_Alv_h6[i,]/colSums(M239_N239_Alv_h6)*1e6)

  lgfp <- lapply(gfp,function(x) { log10(x+1) } )
  boxplot(lgfp,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM", main=rownames(mdm_mock1)[i])
  boxplot(lgfp,col="white",cex=0,las=2,border="darkgray",ylab="log10+1 RPM", main=rownames(mdm_mock1)[i])
  beeswarm(lgfp,add=TRUE,pch=1,cex=0.1)
})

par(mar=c(5.1, 4.1, 4.1, 2.1))

#pdf("latent_threshold.pdf",width=7,height=9)

# mdm1 mock vs active
m1a <- log10( colSums(mdm_active1[1:21,]) / colSums(mdm_active1) *1e6 )
m1m <- log10( colSums(mdm_mock1[1:21,]) / colSums(mdm_mock1) *1e6 )
m1n <- log10(colSums(gex1_h4_d0[1:21,]) / colSums(gex1_h4_d0) *1e6)
myrange <- seq(1,6,0.1)
m1res <- sapply( myrange , function(j) {
  ( length(which(m1m < j))/length(m1m) + length(which(m1a >= j))/length(m1a) ) / 2  }
)
names(m1res) <- myrange
MAX=as.numeric(names(m1res[order(-m1res)][1]))
MAXVAL=signif(m1res[order(-m1res)][1],3)
par(mfrow=c(4,1))
hist( m1m , main="MDM1 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( m1a , main="MDM1 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,m1res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( m1n , main="MDM1 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(m1n>=MAX)), collapse=" "),cex=0.7)

mdm_latent1 <- gex1_h4_d0[,which(m1n>=MAX)] #MDMP236 latent
mdm_bystander1 <- gex1_h4_d0[,which(m1n<MAX)] #MDMP236 bystander

# mdm2 mock vs active
m2a <- log10( colSums(mdm_active2[1:21,]) / colSums(mdm_active2) *1e6 )
m2m <- log10( colSums(mdm_mock2[1:21,]) / colSums(mdm_mock2) *1e6 )
m2n <- log10( colSums(gex1_h4_d1[1:21,]) / colSums(gex1_h4_d1) *1e6)
myrange <- seq(1,6,0.1)
m2res <- sapply( myrange , function(j) {
  ( length(which(m2m < j))/length(m2m) + length(which(m2a >= j))/length(m2a) ) / 2  }
)
names(m2res) <- myrange
MAX=as.numeric(names(m2res[order(-m2res)][1]))
MAXVAL=signif(m2res[order(-m2res)][1],3)
par(mfrow=c(4,1))
hist( m2m , main="MDM2 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( m2a , main="MDM2 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
plot(myrange,m2res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( m2n , main="MDM2 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(m2n>=MAX)), collapse=" "),cex=0.7)

mdm_latent2 <- gex1_h4_d1[,which(m2n>=MAX)] #MDMV236 latent
mdm_bystander2 <- gex1_h4_d1[,which(m2n<MAX)] #MDMV236 bystander

# mdm3 mock vs active
m3a <- log10( colSums(mdm_active3[1:21,]) / colSums(mdm_active3) *1e6 )
m3m <- log10( colSums(mdm_mock3[1:21,]) / colSums(mdm_mock3) *1e6 )
m3n <- log10( colSums(gex2_h4[1:21,]) / colSums(gex2_h4) *1e6)
myrange <- seq(1,6,0.1)
m3res <- sapply( myrange , function(j) {
  ( length(which(m3m < j))/length(m3m) + length(which(m3a >= j))/length(m3a) ) / 2  }
)
names(m3res) <- myrange
MAX=as.numeric(names(m3res[order(-m3res)][1]))
MAXVAL=signif(m3res[order(-m3res)][1],3)
par(mfrow=c(4,1))
hist( m3m , main="MDM3 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( m3a , main="MDM3 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,m3res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( m3n , main="MDM3 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(m3n>=MAX)), collapse=" "),cex=0.7)

mdm_latent3 <- gex2_h4[,which(m3n>=MAX)] #MDMO236 latent
mdm_bystander3 <- gex2_h4[,which(m3n<MAX)] #MDMO236 bystander

# mdm4 mock vs active
m4a <- log10( colSums(mdm_active4[1:21,]) / colSums(mdm_active4) *1e6 )
m4m <- log10( colSums(mdm_mock4[1:21,]) / colSums(mdm_mock4) *1e6 )
m4n <- log10( colSums(P239_Alv_MDM_h3[1:21,]) / colSums(P239_Alv_MDM_h3) *1e6)
myrange <- seq(1,6,0.1)
m4res <- sapply( myrange , function(j) {
  ( length(which(m4m < j))/length(m4m) + length(which(m4a >= j))/length(m4a) ) / 2  }
)
names(m4res) <- myrange
MAX=as.numeric(names(m4res[order(-m4res)][1]))
MAXVAL=signif(m4res[order(-m4res)][1],3)
par(mfrow=c(4,1))
hist( m4m , main="MDM4 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( m4a , main="MDM4 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,m4res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( m4n , main="MDM4 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(m4n>=MAX)), collapse=" "),cex=0.7)

mdm_latent4 <- P239_Alv_MDM_h3[,which(m4n>=MAX)] #MDMP239 latent
mdm_bystander4 <- P239_Alv_MDM_h3[,which(m4n<MAX)] #MDMP239 bystander

# alv1 mock vs active
a1a <- log10( colSums(alv_active1[1:21,]) / colSums(alv_active1) *1e6 )
a1m <- log10( colSums(alv_mock1[1:21,]) / colSums(alv_mock1) *1e6 )
a1n <- log10( colSums(P239_Alv_MDM_h6[1:21,]) / colSums(P239_Alv_MDM_h6) *1e6)
myrange <- seq(1,6,0.1)
a1res <- sapply( myrange , function(j) {
  ( length(which(a1m < j))/length(a1m) + length(which(a1a >= j))/length(a1a) ) / 2  }
)
names(a1res) <- myrange
MAX=as.numeric(names(a1res[order(-a1res)][1]))
MAXVAL=signif(a1res[order(-a1res)][1],3)
par(mfrow=c(4,1))
hist( a1m , main="Alv1 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( a1a , main="Alv1 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,a1res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( a1n , main="Alv1 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(a1n>=MAX)), collapse=" "),cex=0.7)

alv_latent1 <- P239_Alv_MDM_h6[,which(a1n>=MAX)] #AlvP239 latent
alv_bystander1 <- P239_Alv_MDM_h6[,which(a1n<MAX)] #AlvP239 bystander

# alv2 mock vs active
a2a <- log10( colSums(alv_active2[1:21,]) / colSums(alv_active2) *1e6 )
a2m <- log10( colSums(alv_mock2[1:21,]) / colSums(alv_mock2) *1e6 )
a2n <- log10( colSums(M239_N239_Alv_h3[1:21,]) / colSums(M239_N239_Alv_h3) *1e6)
myrange <- seq(1,6,0.1)
a2res <- sapply( myrange , function(j) {
  ( length(which(a2m < j))/length(a2m) + length(which(a2a >= j))/length(a2a) ) / 2  }
)
names(a2res) <- myrange
MAX=as.numeric(names(a2res[order(-a2res)][1]))
MAXVAL=signif(a2res[order(-a2res)][1],3)
par(mfrow=c(4,1))
hist( a2m , main="Alv2 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( a2a , main="Alv2 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,a2res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( a2n , main="Alv2 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(a2n>=MAX)), collapse=" "),cex=0.7)

alv_latent2 <- M239_N239_Alv_h3[,which(a2n>=MAX)] #AlvM239 latent
alv_bystander2 <- M239_N239_Alv_h3[,which(a2n<MAX)] #AlvM239 bystander

# alv3 mock vs active
a3a <- log10( colSums(alv_active3[1:21,]) / colSums(alv_active3) *1e6 )
a3m <- log10( colSums(alv_mock3[1:21,]) / colSums(alv_mock3) *1e6 )
a3n <- log10( colSums(M239_N239_Alv_h6[1:21,]) / colSums(M239_N239_Alv_h6) *1e6)
myrange <- seq(1,6,0.1)
a3res <- sapply( myrange , function(j) {
  ( length(which(a3m < j))/length(a3m) + length(which(a3a >= j))/length(a3a) ) / 2  }
)
names(a3res) <- myrange
MAX=as.numeric(names(a3res[order(-a3res)][1]))
MAXVAL=signif(a3res[order(-a3res)][1],3)
par(mfrow=c(4,1))
hist( a3m , main="Alv3 mock" , xlab="log10 HIV RPM", xlim=c(1,6), breaks=25)
hist( a3a , main="Alv3 active", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
plot(myrange,a3res,type="b",main="Best partition",xlab="log10 HIV RPM",bty="n")
abline(v=MAX,col="red")
mtext(paste("Max=",MAXVAL,"@",MAX),cex=0.7)
hist( a3n , main="Alv3 GFP-", xlab="log10 HIV RPM", xlim=c(1,6), breaks=35)
abline(v=MAX,col="red")
mtext(paste(as.character(table(a3n>=MAX)), collapse=" "),cex=0.7)

alv_latent3 <- M239_N239_Alv_h6[,which(a3n>=MAX)] #AlvN239 latent
alv_bystander3 <- M239_N239_Alv_h6[,which(a3n<MAX)] #AlvN239 bystander

#dev.off()
par(mfrow=c(1,1))

react6 <- N239Alv_react_counts #AlvN239

colnames(mdm_mock1) <- paste("mdm_mock1|",colnames(mdm_mock1),sep="")
colnames(mdm_mock2) <- paste("mdm_mock2|",colnames(mdm_mock2),sep="")
colnames(mdm_mock3) <- paste("mdm_mock3|",colnames(mdm_mock3),sep="")
colnames(mdm_mock4) <- paste("mdm_mock4|",colnames(mdm_mock4),sep="")
colnames(alv_mock1) <- paste("alv_mock1|",colnames(alv_mock1),sep="")
colnames(alv_mock2) <- paste("alv_mock2|",colnames(alv_mock2),sep="")
colnames(alv_mock3) <- paste("alv_mock3|",colnames(alv_mock3),sep="")

colnames(mdm_active1) <- paste("mdm_active1|",colnames(mdm_active1),sep="")
colnames(mdm_active2) <- paste("mdm_active2|",colnames(mdm_active2),sep="")
colnames(mdm_active3) <- paste("mdm_active3|",colnames(mdm_active3),sep="")
colnames(mdm_active4) <- paste("mdm_active4|",colnames(mdm_active4),sep="")

colnames(alv_active1) <- paste("alv_active1|",colnames(alv_active1),sep="")
colnames(alv_active2) <- paste("alv_active2|",colnames(alv_active2),sep="")
colnames(alv_active3) <- paste("alv_active3|",colnames(alv_active3),sep="")

colnames(mdm_bystander1) <- paste("mdm_bystander1|",colnames(mdm_bystander1),sep="")
colnames(mdm_bystander2) <- paste("mdm_bystander2|",colnames(mdm_bystander2),sep="")
colnames(mdm_bystander3) <- paste("mdm_bystander3|",colnames(mdm_bystander3),sep="")
colnames(mdm_bystander4) <- paste("mdm_bystander4|",colnames(mdm_bystander4),sep="")

colnames(alv_bystander1) <- paste("alv_bystander1|",colnames(alv_bystander1),sep="")
colnames(alv_bystander2) <- paste("alv_bystander2|",colnames(alv_bystander2),sep="")
colnames(alv_bystander3) <- paste("alv_bystander3|",colnames(alv_bystander3),sep="")

colnames(mdm_latent1) <- paste("mdm_latent1|",colnames(mdm_latent1),sep="")
colnames(mdm_latent2) <- paste("mdm_latent2|",colnames(mdm_latent2),sep="")
colnames(mdm_latent3) <- paste("mdm_latent3|",colnames(mdm_latent3),sep="")
colnames(mdm_latent4) <- paste("mdm_latent4|",colnames(mdm_latent4),sep="")

colnames(alv_latent1) <- paste("alv_latent1|",colnames(alv_latent1),sep="")
colnames(alv_latent2) <- paste("alv_latent2|",colnames(alv_latent2),sep="")
colnames(alv_latent3) <- paste("alv_latent3|",colnames(alv_latent3),sep="")

colnames(react6) <- paste("react6|",colnames(react6),sep="")

mylist <- list(mdm_mock1,mdm_mock2,mdm_mock3,mdm_mock4,alv_mock1,alv_mock2,alv_mock3,
  mdm_active1,mdm_active2,mdm_active3,mdm_active4,alv_active1,alv_active2,alv_active3,
  mdm_latent1,mdm_latent2,mdm_latent3,mdm_latent4,alv_latent1,alv_latent2,alv_latent3,
  mdm_bystander1,mdm_bystander2,mdm_bystander3,mdm_bystander4,alv_bystander1,
  alv_bystander2,alv_bystander3,react6)

names(mylist) <- c("mdm_mock1","mdm_mock2","mdm_mock3","mdm_mock4","alv_mock1","alv_mock2",
  "alv_mock3", "mdm_active1","mdm_active2","mdm_active3","mdm_active4","alv_active1",
  "alv_active2","alv_active3", "mdm_latent1","mdm_latent2","mdm_latent3","mdm_latent4",
  "alv_latent1","mdm_latent2","alv_latent3", "mdm_bystander1","mdm_bystander2",
  "mdm_bystander3","mdm_bystander4","alv_bystander1","alv_bystander2","alv_bystander3",
  "react6")

comb <- do.call(cbind,mylist)

rev(sapply(mylist,ncol))

##         react6 alv_bystander3 alv_bystander2 alv_bystander1 mdm_bystander4 
##           3098           2186           2119           2589           1696 
## mdm_bystander3 mdm_bystander2 mdm_bystander1    alv_latent3    mdm_latent2 
##            674           2216           2192            269            216 
##    alv_latent1    mdm_latent4    mdm_latent3    mdm_latent2    mdm_latent1 
##            469            115            212            368            468 
##    alv_active3    alv_active2    alv_active1    mdm_active4    mdm_active3 
##            713            620            945            436            420 
##    mdm_active2    mdm_active1      alv_mock3      alv_mock2      alv_mock1 
##            479            880           1354            804            974 
##      mdm_mock4      mdm_mock3      mdm_mock2      mdm_mock1 
##            815            177            659            796

barplot(rev(sapply(mylist,ncol)),horiz=TRUE,las=1,xlab="no. cells")

rev(sapply(mylist,function(mx) {sum(colSums(mx))}))

##         react6 alv_bystander3 alv_bystander2 alv_bystander1 mdm_bystander4 
##      101451368       60413891       66543739       58712646       36814707 
## mdm_bystander3 mdm_bystander2 mdm_bystander1    alv_latent3    mdm_latent2 
##       27778607       71281322       72157029        5772635        4757482 
##    alv_latent1    mdm_latent4    mdm_latent3    mdm_latent2    mdm_latent1 
##        7743379         784796        3730028        5265191        3976851 
##    alv_active3    alv_active2    alv_active1    mdm_active4    mdm_active3 
##       24459874       29009554       30595751       14037120       26664866 
##    mdm_active2    mdm_active1      alv_mock3      alv_mock2      alv_mock1 
##       23140392       31329200       35231275       25675982       20535554 
##      mdm_mock4      mdm_mock3      mdm_mock2      mdm_mock1 
##       20775431        7576260       21687833       29223817

par(mar = c(5.1, 10.1, 4.1, 2.1))

barplot(rev(sapply(mylist,function(mx) {sum(colSums(mx))})),
  horiz=TRUE,las=1,xlab="no. counts")

res <- rev(sapply(mylist,function(mx) {
  mean(colSums(mx[1:2,])) /  mean(colSums(mx[3:nrow(mx),])) *100
}))
res

##         react6 alv_bystander3 alv_bystander2 alv_bystander1 mdm_bystander4 
##    0.007834909    0.005242447    0.004706900    0.003765940    0.002140495 
## mdm_bystander3 mdm_bystander2 mdm_bystander1    alv_latent3    mdm_latent2 
##    0.001918780    0.003127144    0.002311679    0.353580553    0.239522270 
##    alv_latent1    mdm_latent4    mdm_latent3    mdm_latent2    mdm_latent1 
##    0.153851447    0.238079425    0.093195995    0.146343403    0.173401945 
##    alv_active3    alv_active2    alv_active1    mdm_active4    mdm_active3 
##    0.455250262    0.470702608    0.572785675    0.418568084    0.377000064 
##    mdm_active2    mdm_active1      alv_mock3      alv_mock2      alv_mock1 
##    0.324009847    0.347506555    0.021644548    0.005729418    0.003871484 
##      mdm_mock4      mdm_mock3      mdm_mock2      mdm_mock1 
##    0.005603086    0.001676317    0.003656562    0.002135290

barplot(rev(res),horiz=TRUE,las=1,xlab="% HIV reads")

par(mar = c(5.1, 4.1, 4.1, 2.1))

Analysis by chromosome

chrmx <- sapply(mylist,function(mx) {
  rsum <- as.data.frame(rowSums(mx))
  m <- merge(gene_chr,rsum,by.x="V2",by.y=0)
  m$V2=NULL
  ag <- aggregate(. ~ V1 , m , sum)
  out <- ag[,2]
  names(out) <- ag[,1]
  return(out)
})

chryx <- chrmx["chrY",] / chrmx["chrX",] * 100
chryx

##      mdm_mock1      mdm_mock2      mdm_mock3      mdm_mock4      alv_mock1 
##    0.031048772    2.179144183    0.002420438    1.296282596    1.540759280 
##      alv_mock2      alv_mock3    mdm_active1    mdm_active2    mdm_active3 
##    1.610838109    0.072887083    0.041329692    2.101593980    0.002100176 
##    mdm_active4    alv_active1    alv_active2    alv_active3    mdm_latent1 
##    1.439916925    1.575515319    1.679002469    0.041973010    0.103961480 
##    mdm_latent2    mdm_latent3    mdm_latent4    alv_latent1    mdm_latent2 
##    2.438855134    0.003558550    1.450606837    1.424623324    1.546480185 
##    alv_latent3 mdm_bystander1 mdm_bystander2 mdm_bystander3 mdm_bystander4 
##    0.311285052    0.037411060    2.334047706    0.003166245    1.330740607 
## alv_bystander1 alv_bystander2 alv_bystander3         react6 
##    1.635928551    1.628753519    0.080568820    0.825985139

par(mar = c(5.1, 10.1, 4.1, 2.1))
barplot(rev(chryx),horiz=TRUE,las=1,xlab="Ratio ChrY to ChrX x100")

These checks look consistent, except for the react6 which appears as more male but that is not consistent with mock6, active6 and latent6.

chrmt <- chrmx["chrM",] / colSums(chrmx[1:22,]) *100
chrmt

##      mdm_mock1      mdm_mock2      mdm_mock3      mdm_mock4      alv_mock1 
##       5.637862      12.162175      13.619365       4.521635       8.667173 
##      alv_mock2      alv_mock3    mdm_active1    mdm_active2    mdm_active3 
##      11.232158      13.252584       6.306394       7.973163       9.661587 
##    mdm_active4    alv_active1    alv_active2    alv_active3    mdm_latent1 
##       3.420345       5.692860       5.851298       8.232841      27.441490 
##    mdm_latent2    mdm_latent3    mdm_latent4    alv_latent1    mdm_latent2 
##      27.617443      41.934616      27.903735      10.991995      16.112492 
##    alv_latent3 mdm_bystander1 mdm_bystander2 mdm_bystander3 mdm_bystander4 
##      11.950936       5.498538      11.150296      11.399728       5.283145 
## alv_bystander1 alv_bystander2 alv_bystander3         react6 
##       7.128923       9.522615      10.656024       5.608272

barplot(rev(chrmt),horiz=TRUE,las=1,xlab="% chrM reads")

Let’s take a look at mtDNA transcripts at the level of individual cells.

Remove cells with >20% mito reads.

mt <- lapply(mylist, function(x) {
  xmt <- x[grep("MT-",rownames(x)),]
  sapply(1:ncol(x), function(j) { sum(xmt[,j])/sum(x[,j]) } )
} )

vioplot(mt)

boxplot(mt,horizontal=TRUE,cex=0,col="white",las=1,
 main="mtDNA gene proportion")

par(mar = c(5.1, 9.1, 4.1, 2.1))

barplot(sapply(mt, function(x) { length(which(x>0.2))/length(x) } ),horiz=TRUE,las=1, main="Proportion of cells with >15% mito reads")

fmt <- lapply(mylist, function(x) {
  xmt <- x[grep("MT-",rownames(x)),]
  v <- sapply(1:ncol(x), function(j) { sum(xmt[,j])/sum(x[,j]) } )
  f <-  x[,which(v<0.2)]
  return(f)
} )

lapply(fmt,dim)

## $mdm_mock1
## [1] 36622   692
## 
## $mdm_mock2
## [1] 36622   550
## 
## $mdm_mock3
## [1] 36622   137
## 
## $mdm_mock4
## [1] 36622   775
## 
## $alv_mock1
## [1] 36622   884
## 
## $alv_mock2
## [1] 36622   649
## 
## $alv_mock3
## [1] 36622  1034
## 
## $mdm_active1
## [1] 36622   602
## 
## $mdm_active2
## [1] 36622   414
## 
## $mdm_active3
## [1] 36622   307
## 
## $mdm_active4
## [1] 36622   401
## 
## $alv_active1
## [1] 36622   774
## 
## $alv_active2
## [1] 36622   546
## 
## $alv_active3
## [1] 36622   541
## 
## $mdm_latent1
## [1] 36622   172
## 
## $mdm_latent2
## [1] 36622   195
## 
## $mdm_latent3
## [1] 36622    75
## 
## $mdm_latent4
## [1] 36622    23
## 
## $alv_latent1
## [1] 36622   302
## 
## $mdm_latent2
## [1] 36622    99
## 
## $alv_latent3
## [1] 36622   138
## 
## $mdm_bystander1
## [1] 36622  1857
## 
## $mdm_bystander2
## [1] 36622  1965
## 
## $mdm_bystander3
## [1] 36622   509
## 
## $mdm_bystander4
## [1] 36622  1496
## 
## $alv_bystander1
## [1] 36622  2459
## 
## $alv_bystander2
## [1] 36622  1948
## 
## $alv_bystander3
## [1] 36622  1937
## 
## $react6
## [1] 36622  2830

mt2 <- lapply(fmt, function(x) {
  xmt <- x[grep("MT-",rownames(x)),]
  sapply(1:ncol(x), function(j) { sum(xmt[,j])/sum(x[,j]) } )
} )

vioplot(mt2)

boxplot(mt2,horizontal=TRUE,cex=0,col="white",las=1,
 main="mtDNA gene proportion after filtering")

Save data object

saveRDS(fmt,"macrophage_counts.rds")

Session information

For reproducibility.

save.image("macrophage_dataprep.Rdata")


sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] vioplot_0.5.0               zoo_1.8-12                 
##  [3] sm_2.2-6.0                  mitch_1.19.3               
##  [5] DESeq2_1.44.0               muscat_1.18.0              
##  [7] beeswarm_0.4.0              stringi_1.8.4              
##  [9] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [11] Biobase_2.64.0              GenomicRanges_1.56.2       
## [13] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [15] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [17] MatrixGenerics_1.16.0       matrixStats_1.4.1          
## [19] hdf5r_1.3.11                Seurat_5.1.0               
## [21] SeuratObject_5.0.2          sp_2.1-4                   
## [23] plyr_1.8.9                 
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9              spatstat.sparse_3.1-0    
##   [3] httr_1.4.7                RColorBrewer_1.1-3       
##   [5] doParallel_1.0.17         numDeriv_2016.8-1.1      
##   [7] backports_1.5.0           tools_4.4.2              
##   [9] sctransform_0.4.1         utf8_1.2.4               
##  [11] R6_2.5.1                  lazyeval_0.2.2           
##  [13] uwot_0.2.2                mgcv_1.9-1               
##  [15] GetoptLong_1.0.5          GGally_2.2.1             
##  [17] prettyunits_1.2.0         gridExtra_2.3            
##  [19] progressr_0.15.1          cli_3.6.3                
##  [21] spatstat.explore_3.3-3    fastDummies_1.7.4        
##  [23] sass_0.4.9                mvtnorm_1.3-2            
##  [25] spatstat.data_3.1-4       blme_1.0-6               
##  [27] ggridges_0.5.6            pbapply_1.7-2            
##  [29] systemfonts_1.1.0         R.utils_2.12.3           
##  [31] svglite_2.1.3             scater_1.32.1            
##  [33] parallelly_1.39.0         limma_3.60.6             
##  [35] rstudioapi_0.17.1         generics_0.1.3           
##  [37] shape_1.4.6.1             gtools_3.9.5             
##  [39] ica_1.0-3                 spatstat.random_3.3-2    
##  [41] dplyr_1.1.4               Matrix_1.7-1             
##  [43] ggbeeswarm_0.7.2          fansi_1.0.6              
##  [45] abind_1.4-8               R.methodsS3_1.8.2        
##  [47] lifecycle_1.0.4           yaml_2.3.10              
##  [49] edgeR_4.2.2               gplots_3.2.0             
##  [51] SparseArray_1.4.8         Rtsne_0.17               
##  [53] grid_4.4.2                promises_1.3.1           
##  [55] crayon_1.5.3              miniUI_0.1.1.1           
##  [57] lattice_0.22-6            echarts4r_0.4.5          
##  [59] beachmat_2.20.0           cowplot_1.1.3            
##  [61] pillar_1.9.0              knitr_1.49               
##  [63] ComplexHeatmap_2.20.0     rjson_0.2.23             
##  [65] boot_1.3-31               corpcor_1.6.10           
##  [67] future.apply_1.11.3       codetools_0.2-20         
##  [69] leiden_0.4.3.1            glue_1.8.0               
##  [71] spatstat.univar_3.1-1     data.table_1.16.2        
##  [73] vctrs_0.6.5               png_0.1-8                
##  [75] spam_2.11-0               Rdpack_2.6.2             
##  [77] gtable_0.3.6              cachem_1.1.0             
##  [79] xfun_0.49                 rbibutils_2.3            
##  [81] S4Arrays_1.4.1            mime_0.12                
##  [83] reformulas_0.4.0          survival_3.8-3           
##  [85] iterators_1.0.14          statmod_1.5.0            
##  [87] fitdistrplus_1.2-1        ROCR_1.0-11              
##  [89] nlme_3.1-167              pbkrtest_0.5.3           
##  [91] bit64_4.5.2               EnvStats_3.0.0           
##  [93] progress_1.2.3            RcppAnnoy_0.0.22         
##  [95] bslib_0.8.0               TMB_1.9.15               
##  [97] irlba_2.3.5.1             vipor_0.4.7              
##  [99] KernSmooth_2.23-24        colorspace_2.1-1         
## [101] tidyselect_1.2.1          bit_4.5.0                
## [103] compiler_4.4.2            BiocNeighbors_1.22.0     
## [105] xml2_1.3.6                DelayedArray_0.30.1      
## [107] plotly_4.10.4             scales_1.3.0             
## [109] caTools_1.18.3            remaCor_0.0.18           
## [111] lmtest_0.9-40             stringr_1.5.1            
## [113] digest_0.6.37             goftest_1.2-3            
## [115] spatstat.utils_3.1-1      minqa_1.2.8              
## [117] variancePartition_1.34.0  rmarkdown_2.29           
## [119] aod_1.3.3                 XVector_0.44.0           
## [121] RhpcBLASctl_0.23-42       htmltools_0.5.8.1        
## [123] pkgconfig_2.0.3           lme4_1.1-35.5            
## [125] sparseMatrixStats_1.16.0  fastmap_1.2.0            
## [127] rlang_1.1.4               GlobalOptions_0.1.2      
## [129] htmlwidgets_1.6.4         UCSC.utils_1.0.0         
## [131] shiny_1.9.1               DelayedMatrixStats_1.26.0
## [133] farver_2.1.2              jquerylib_0.1.4          
## [135] jsonlite_1.8.9            BiocParallel_1.38.0      
## [137] R.oo_1.27.0               BiocSingular_1.20.0      
## [139] magrittr_2.0.3            kableExtra_1.4.0         
## [141] scuttle_1.14.0            GenomeInfoDbData_1.2.12  
## [143] dotCall64_1.2             patchwork_1.3.0          
## [145] munsell_0.5.1             Rcpp_1.0.13-1            
## [147] viridis_0.6.5             reticulate_1.40.0        
## [149] zlibbioc_1.50.0           MASS_7.3-64              
## [151] ggstats_0.7.0             listenv_0.9.1            
## [153] ggrepel_0.9.6             deldir_2.0-4             
## [155] splines_4.4.2             tensor_1.5               
## [157] hms_1.1.3                 circlize_0.4.16          
## [159] locfit_1.5-9.10           igraph_2.1.1             
## [161] spatstat.geom_3.3-4       RcppHNSW_0.6.0           
## [163] reshape2_1.4.4            ScaledMatrix_1.12.0      
## [165] evaluate_1.0.1            nloptr_2.1.1             
## [167] foreach_1.5.2             httpuv_1.6.15            
## [169] RANN_2.6.2                tidyr_1.3.1              
## [171] purrr_1.0.2               polyclip_1.10-7          
## [173] future_1.34.0             clue_0.3-66              
## [175] scattermore_1.2           ggplot2_3.5.1            
## [177] rsvd_1.0.5                broom_1.0.7              
## [179] xtable_1.8-4              fANCOVA_0.6-1            
## [181] RSpectra_0.16-2           later_1.4.0              
## [183] viridisLite_0.4.2         tibble_3.2.1             
## [185] lmerTest_3.1-3            glmmTMB_1.1.10           
## [187] cluster_2.1.8             globals_0.16.3

Single cell transcriptomes of macrophages with HIV 2024 - data preparation

Burnet Bioinformatics group

2025-02-25