Source: https://github.com/markziemann/gene_sig_commons

Background

This analysis is an example of how gene signatures can be generated from publicly available omics data. In this example, we will obtain data from the DEE2 database, perform differential expression analysis with DESeq2, and then return the gene signature.

# Load the definitions provided by ../de_functions.R.
source("../de_functions.R")

# The species is fixed to human for the time being.
SPECIES <- "hsapiens"

# Load the contrast definitions: one element per line of the file.
x <- readLines("../contrasts/SARS_MERS.md")

# Retain only the lines containing "RP"; this is the best way to pick out
# the studies that have contrasts set up.
has_study_id <- grepl("RP", x)
x <- x[has_study_id]

# Report how many contrasts will be processed.
length(x)
## [1] 63
# Fetch the DEE2 metadata for the chosen species once, up front, so the
# same object can be handed to every contrast below (saves time).
mdat <- getDEE2::getDEE2Metadata(species = SPECIES)

# To prototype on a single contrast: main(x[[2]], mdat)
# To debug a single contrast:        x1 <- x[[15]]

# Run every contrast in turn and collect the results in a list.
data <- lapply(
  X = x,
  FUN = function(contrast) {
    main(contrast, mdat)
  }
)
## SRP253951: NHBE mock treatment versus infected with SARS-CoV-2 (Series 1): mock;SRX7990866,SRX7990867,SRX7990868: SARS-CoV-2;    SRX7990869,SRX7990870,SRX7990871;
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
## List of 8
##  $ GeneCounts     :'data.frame': 58302 obs. of  20 variables:
##   ..$ SRR11412215: int [1:58302] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412216: int [1:58302] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412217: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412218: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412219: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412220: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412221: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412222: int [1:58302] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412223: int [1:58302] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412224: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412225: int [1:58302] 0 3 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412226: int [1:58302] 0 1 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412231: int [1:58302] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412232: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412233: int [1:58302] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412234: int [1:58302] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412235: int [1:58302] 0 2 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412236: int [1:58302] 0 5 1 0 0 0 0 0 0 2 ...
##   ..$ SRR11412237: int [1:58302] 0 2 1 0 0 0 0 0 0 0 ...
##   ..$ SRR11412238: int [1:58302] 0 3 2 0 0 0 0 0 0 0 ...
##  $ TxCounts       :'data.frame': 180869 obs. of  20 variables:
##   ..$ SRR11412215: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412216: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412217: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412218: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412219: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412220: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412221: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412222: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412223: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412224: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412225: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412226: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412231: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412232: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412233: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412234: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412235: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412236: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412237: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR11412238: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GeneInfo       :'data.frame': 58302 obs. of  5 variables:
##   ..$ GeneSymbol     : chr [1:58302] "DDX11L1" "WASH7P" "MIR6859-1" "MIR1302-2HG" ...
##   ..$ mean           : int [1:58302] 973 1351 68 641 138 948 840 1295 2227 1082 ...
##   ..$ median         : int [1:58302] 632 1351 68 712 138 1187 840 1414 2618 629 ...
##   ..$ longest_isoform: int [1:58302] 1657 1351 68 712 138 1187 840 1414 2618 2748 ...
##   ..$ merged         : int [1:58302] 1735 1351 68 1021 138 1219 840 1414 2618 3726 ...
##  $ TxInfo         :'data.frame': 180869 obs. of  3 variables:
##   ..$ GeneID    : chr [1:180869] "ENSG00000237235.2" "ENSG00000228985.1" "ENSG00000223997.1" "ENSG00000282253.1" ...
##   ..$ GeneSymbol: chr [1:180869] "TRDD2" "TRDD3" "TRDD1" "AC239618.6" ...
##   ..$ TxLength  : int [1:180869] 9 13 8 12 12 17 20 19 16 18 ...
##  $ QcMx           :'data.frame': 30 obs. of  20 variables:
##   ..$ SRR11412215: chr [1:30] "SE" "Sanger/Illumina1.9" "49" "136" ...
##   ..$ SRR11412216: chr [1:30] "SE" "Sanger/Illumina1.9" "49" "136" ...
##   ..$ SRR11412217: chr [1:30] "SE" "Sanger/Illumina1.9" "57" "137" ...
##   ..$ SRR11412218: chr [1:30] "SE" "Sanger/Illumina1.9" "52" "137" ...
##   ..$ SRR11412219: chr [1:30] "SE" "Sanger/Illumina1.9" "46" "131" ...
##   ..$ SRR11412220: chr [1:30] "SE" "Sanger/Illumina1.9" "42" "131" ...
##   ..$ SRR11412221: chr [1:30] "SE" "Sanger/Illumina1.9" "52" "130" ...
##   ..$ SRR11412222: chr [1:30] "SE" "Sanger/Illumina1.9" "53" "132" ...
##   ..$ SRR11412223: chr [1:30] "SE" "Sanger/Illumina1.9" "48" "121" ...
##   ..$ SRR11412224: chr [1:30] "SE" "Sanger/Illumina1.9" "42" "121" ...
##   ..$ SRR11412225: chr [1:30] "SE" "Sanger/Illumina1.9" "43" "122" ...
##   ..$ SRR11412226: chr [1:30] "SE" "Sanger/Illumina1.9" "43" "121" ...
##   ..$ SRR11412231: chr [1:30] "SE" "Sanger/Illumina1.9" "47" "144" ...
##   ..$ SRR11412232: chr [1:30] "SE" "Sanger/Illumina1.9" "44" "141" ...
##   ..$ SRR11412233: chr [1:30] "SE" "Sanger/Illumina1.9" "44" "144" ...
##   ..$ SRR11412234: chr [1:30] "SE" "Sanger/Illumina1.9" "53" "144" ...
##   ..$ SRR11412235: chr [1:30] "SE" "Sanger/Illumina1.9" "46" "139" ...
##   ..$ SRR11412236: chr [1:30] "SE" "Sanger/Illumina1.9" "47" "137" ...
##   ..$ SRR11412237: chr [1:30] "SE" "Sanger/Illumina1.9" "52" "139" ...
##   ..$ SRR11412238: chr [1:30] "SE" "Sanger/Illumina1.9" "52" "139" ...
##  $ MetadataSummary:'data.frame': 20 obs. of  7 variables:
##   ..$ QC_summary   : chr [1:20] "WARN(1,5,7)" "WARN(1,5,7)" "WARN(1,5,7)" "WARN(1,5,7)" ...
##   ..$ SRX_accession: chr [1:20] "SRX7990866" "SRX7990866" "SRX7990866" "SRX7990866" ...
##   ..$ SRS_accession: chr [1:20] "SRS6374419" "SRS6374419" "SRS6374419" "SRS6374419" ...
##   ..$ SRP_accession: chr [1:20] "SRP253951" "SRP253951" "SRP253951" "SRP253951" ...
##   ..$ Sample_name  : chr [1:20] "GSM4432378" "GSM4432378" "GSM4432378" "GSM4432378" ...
##   ..$ GEO_series   : chr [1:20] "GSE147507" "GSE147507" "GSE147507" "GSE147507" ...
##   ..$ Library_name : logi [1:20] NA NA NA NA NA NA ...
##  $ MetadataFull   :'data.frame': 20 obs. of  52 variables:
##   ..$ QC_summary           : chr [1:20] "WARN(1,5,7)" "WARN(1,5,7)" "WARN(1,5,7)" "WARN(1,5,7)" ...
##   ..$ SRX_accession        : chr [1:20] "SRX7990866" "SRX7990866" "SRX7990866" "SRX7990866" ...
##   ..$ SRS_accession        : chr [1:20] "SRS6374419" "SRS6374419" "SRS6374419" "SRS6374419" ...
##   ..$ SRP_accession        : chr [1:20] "SRP253951" "SRP253951" "SRP253951" "SRP253951" ...
##   ..$ Sample_name          : chr [1:20] "GSM4432378" "GSM4432378" "GSM4432378" "GSM4432378" ...
##   ..$ GEO_series           : chr [1:20] "GSE147507" "GSE147507" "GSE147507" "GSE147507" ...
##   ..$ Library_name         : logi [1:20] NA NA NA NA NA NA ...
##   ..$ SampleName           : chr [1:20] "GSM4432378" "GSM4432378" "GSM4432378" "GSM4432378" ...
##   ..$ ReleaseDate          : chr [1:20] "2020-03-26 16:41:06" "2020-03-26 16:41:06" "2020-03-26 16:41:06" "2020-03-26 16:41:06" ...
##   ..$ LoadDate             : chr [1:20] "2020-03-25 01:58:30" "2020-03-25 01:59:43" "2020-03-25 01:59:39" "2020-03-25 01:59:57" ...
##   ..$ spots                : int [1:20] 4260400 4175640 4322242 4245291 4106965 4027775 4144278 4032103 6111163 6020043 ...
##   ..$ bases                : int [1:20] 558210660 547162776 566041144 555639544 524883325 514317390 529851552 514225167 734468035 723823606 ...
##   ..$ spots_with_mates     : int [1:20] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ avgLength            : int [1:20] 131 131 130 130 127 127 127 127 120 120 ...
##   ..$ size_MB              : int [1:20] 231 224 219 214 212 205 199 192 296 289 ...
##   ..$ AssemblyName         : logi [1:20] NA NA NA NA NA NA ...
##   ..$ download_path        : chr [1:20] "https://sra-download.ncbi.nlm.nih.gov/traces/sra60/SRR/011144/SRR11412215" "https://sra-download.ncbi.nlm.nih.gov/traces/sra61/SRR/011144/SRR11412216" "https://sra-download.ncbi.nlm.nih.gov/traces/sra60/SRR/011144/SRR11412217" "https://sra-download.ncbi.nlm.nih.gov/traces/sra62/SRR/011144/SRR11412218" ...
##   ..$ LibraryName          : logi [1:20] NA NA NA NA NA NA ...
##   ..$ LibraryStrategy      : chr [1:20] "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq" ...
##   ..$ LibrarySelection     : chr [1:20] "cDNA" "cDNA" "cDNA" "cDNA" ...
##   ..$ LibrarySource        : chr [1:20] "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" ...
##   ..$ LibraryLayout        : chr [1:20] "SINGLE" "SINGLE" "SINGLE" "SINGLE" ...
##   ..$ InsertSize           : int [1:20] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ InsertDev            : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ Platform             : chr [1:20] "ILLUMINA" "ILLUMINA" "ILLUMINA" "ILLUMINA" ...
##   ..$ Model                : chr [1:20] "NextSeq 500" "NextSeq 500" "NextSeq 500" "NextSeq 500" ...
##   ..$ SRAStudy             : chr [1:20] "SRP253951" "SRP253951" "SRP253951" "SRP253951" ...
##   ..$ BioProject           : chr [1:20] "PRJNA615032" "PRJNA615032" "PRJNA615032" "PRJNA615032" ...
##   ..$ Study_Pubmed_id      : int [1:20] 3 3 3 3 3 3 3 3 3 3 ...
##   ..$ ProjectID            : int [1:20] 615032 615032 615032 615032 615032 615032 615032 615032 615032 615032 ...
##   ..$ Sample               : chr [1:20] "SRS6374419" "SRS6374419" "SRS6374419" "SRS6374419" ...
##   ..$ BioSample            : chr [1:20] "SAMN14444845" "SAMN14444845" "SAMN14444845" "SAMN14444845" ...
##   ..$ SampleType           : chr [1:20] "simple" "simple" "simple" "simple" ...
##   ..$ TaxID                : int [1:20] 9606 9606 9606 9606 9606 9606 9606 9606 9606 9606 ...
##   ..$ ScientificName       : chr [1:20] "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens" ...
##   ..$ g1k_pop_code         : logi [1:20] NA NA NA NA NA NA ...
##   ..$ source               : logi [1:20] NA NA NA NA NA NA ...
##   ..$ g1k_analysis_group   : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Subject_ID           : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Sex                  : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Disease              : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Tumor                : chr [1:20] "no" "no" "no" "no" ...
##   ..$ Affection_Status     : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Analyte_Type         : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Histological_Type    : logi [1:20] NA NA NA NA NA NA ...
##   ..$ Body_Site            : logi [1:20] NA NA NA NA NA NA ...
##   ..$ CenterName           : chr [1:20] "GEO" "GEO" "GEO" "GEO" ...
##   ..$ Submission           : chr [1:20] "SRA1059108" "SRA1059108" "SRA1059108" "SRA1059108" ...
##   ..$ dbgap_study_accession: logi [1:20] NA NA NA NA NA NA ...
##   ..$ Consent              : chr [1:20] "public" "public" "public" "public" ...
##   ..$ RunHash              : chr [1:20] "C9643B1011B36CC2CAE79D18E9120F87" "D3A2CE0CDD17532F6157163A3FF456A4" "C06BC457E87BBBC803861FCA11F6CBDD" "DEF377A1149EF3CB5627ACCB8137B344" ...
##   ..$ ReadHash             : chr [1:20] "D2457CA5524A8D5D14BA65949754A4CD" "3B884D79213D131813C331DC0DBEA21E" "4A4BE0DAD12A14C30DE9021FD3626BC2" "FB641B4BE3336742B17DC01D0525CEA3" ...
##  $ absent         : chr(0)
## converting counts to integer mode
## using supplied model matrix
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

## SRP253951: A549 mock treatment versus infected with SARS-CoV-2 (Series 2): mock;SRX7990872,SRX7990873,SRX7990874: SARS-CoV-2;    SRX7990875,SRX7990876,SRX7990877;
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md