Source: https://github.com/markziemann/gene_sig_commons

Background

This analysis is an example of how gene signatures can be generated from publicly available omics data. In this example, we obtain RNA-seq count data from the DEE2 database, perform differential expression analysis with DESeq2, and then return the resulting gene signatures.

source("../de_functions.R")
# Species identifier handed to getDEE2 when fetching metadata.
# Only human is supported for now.
SPECIES <- "hsapiens"

# Load the contrast definitions (one element per line of the file) and keep
# only the lines containing "RP" -- e.g. the "SRP" prefix of an SRA project
# accession -- which is how lines with a contrast set up are recognised.
x <- grep("RP", readLines("../contrasts/epilepsy.md"), value = TRUE)

# Number of contrasts that will be processed.
length(x)
## [1] 8
# Start from a clean slate: remove any existing "gmt" directory and
# everything inside it.
unlink("gmt", recursive = TRUE)

# Fetch the DEE2 metadata once, up front, to save time.
mdat <- getDEE2::getDEE2Metadata(species = SPECIES)

# To prototype on a single contrast, run: main(x[[2]], mdat)

# Run main() (presumably defined in ../de_functions.R -- confirm) on every
# contrast line, collecting one result per contrast.
data <- lapply(x, function(contrast) main(contrast, mdat))
## SRP222346: mock treatment versus dCas9 transduced SH-SY5Y cells:mock; SRX6867493, SRX6867494: dCas9; SRX6867495 , SRX6867496 , SRX6867497;
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
## List of 8
##  $ GeneCounts     :'data.frame': 58302 obs. of  4 variables:
##   ..$ SRR10139483: int [1:58302] 1 101 4 0 0 0 0 0 0 0 ...
##   ..$ SRR10139485: int [1:58302] 1 243 0 2 0 0 0 0 0 0 ...
##   ..$ SRR10139486: int [1:58302] 0 166 2 1 0 0 0 0 0 0 ...
##   ..$ SRR10139487: int [1:58302] 0 107 0 1 0 0 0 0 0 0 ...
##  $ TxCounts       :'data.frame': 180869 obs. of  4 variables:
##   ..$ SRR10139483: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139485: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139486: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139487: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GeneInfo       :'data.frame': 58302 obs. of  5 variables:
##   ..$ GeneSymbol     : chr [1:58302] "DDX11L1" "WASH7P" "MIR6859-1" "MIR1302-2HG" ...
##   ..$ mean           : int [1:58302] 973 1351 68 641 138 948 840 1295 2227 1082 ...
##   ..$ median         : int [1:58302] 632 1351 68 712 138 1187 840 1414 2618 629 ...
##   ..$ longest_isoform: int [1:58302] 1657 1351 68 712 138 1187 840 1414 2618 2748 ...
##   ..$ merged         : int [1:58302] 1735 1351 68 1021 138 1219 840 1414 2618 3726 ...
##  $ TxInfo         :'data.frame': 180869 obs. of  3 variables:
##   ..$ GeneID    : chr [1:180869] "ENSG00000237235.2" "ENSG00000228985.1" "ENSG00000223997.1" "ENSG00000282253.1" ...
##   ..$ GeneSymbol: chr [1:180869] "TRDD2" "TRDD3" "TRDD1" "AC239618.6" ...
##   ..$ TxLength  : int [1:180869] 9 13 8 12 12 17 20 19 16 18 ...
##  $ QcMx           :'data.frame': 30 obs. of  4 variables:
##   ..$ SRR10139483: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139485: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139486: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139487: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##  $ MetadataSummary:'data.frame': 4 obs. of  7 variables:
##   ..$ QC_summary   : chr [1:4] "PASS" "PASS" "PASS" "PASS"
##   ..$ SRX_accession: chr [1:4] "SRX6867493" "SRX6867495" "SRX6867496" "SRX6867497"
##   ..$ SRS_accession: chr [1:4] "SRS5404929" "SRS5404931" "SRS5404932" "SRS5404933"
##   ..$ SRP_accession: chr [1:4] "SRP222346" "SRP222346" "SRP222346" "SRP222346"
##   ..$ Sample_name  : chr [1:4] "GSM4083893" "GSM4083895" "GSM4083896" "GSM4083897"
##   ..$ GEO_series   : chr [1:4] "GSE137663" "GSE137663" "GSE137663" "GSE137663"
##   ..$ Library_name : logi [1:4] NA NA NA NA
##  $ MetadataFull   :'data.frame': 4 obs. of  52 variables:
##   ..$ QC_summary           : chr [1:4] "PASS" "PASS" "PASS" "PASS"
##   ..$ SRX_accession        : chr [1:4] "SRX6867493" "SRX6867495" "SRX6867496" "SRX6867497"
##   ..$ SRS_accession        : chr [1:4] "SRS5404929" "SRS5404931" "SRS5404932" "SRS5404933"
##   ..$ SRP_accession        : chr [1:4] "SRP222346" "SRP222346" "SRP222346" "SRP222346"
##   ..$ Sample_name          : chr [1:4] "GSM4083893" "GSM4083895" "GSM4083896" "GSM4083897"
##   ..$ GEO_series           : chr [1:4] "GSE137663" "GSE137663" "GSE137663" "GSE137663"
##   ..$ Library_name         : logi [1:4] NA NA NA NA
##   ..$ SampleName           : chr [1:4] "GSM4083893" "GSM4083895" "GSM4083896" "GSM4083897"
##   ..$ ReleaseDate          : chr [1:4] "2020-01-02 09:35:26" "2020-01-02 09:35:26" "2020-01-02 09:35:26" "2020-01-02 09:35:27"
##   ..$ LoadDate             : chr [1:4] "2019-09-18 16:14:29" "2019-09-18 16:31:25" "2019-09-18 16:18:21" "2019-09-18 16:16:55"
##   ..$ spots                : int [1:4] 22316828 35473828 21219962 19974486
##   ..$ bases                : num [1:4] 6.74e+09 1.07e+10 6.41e+09 6.03e+09
##   ..$ spots_with_mates     : int [1:4] 22316828 35473828 21219962 19974486
##   ..$ avgLength            : int [1:4] 302 302 302 302
##   ..$ size_MB              : int [1:4] 3134 4688 2798 2660
##   ..$ AssemblyName         : logi [1:4] NA NA NA NA
##   ..$ download_path        : chr [1:4] "https://sra-download.ncbi.nlm.nih.gov/traces/sra15/SRR/009901/SRR10139483" "https://sra-download.ncbi.nlm.nih.gov/traces/sra20/SRR/009901/SRR10139485" "https://sra-download.ncbi.nlm.nih.gov/traces/sra4/SRR/009901/SRR10139486" "https://sra-download.ncbi.nlm.nih.gov/traces/sra12/SRR/009901/SRR10139487"
##   ..$ LibraryName          : logi [1:4] NA NA NA NA
##   ..$ LibraryStrategy      : chr [1:4] "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq"
##   ..$ LibrarySelection     : chr [1:4] "cDNA" "cDNA" "cDNA" "cDNA"
##   ..$ LibrarySource        : chr [1:4] "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC"
##   ..$ LibraryLayout        : chr [1:4] "PAIRED" "PAIRED" "PAIRED" "PAIRED"
##   ..$ InsertSize           : int [1:4] 0 0 0 0
##   ..$ InsertDev            : num [1:4] 0 0 0 0
##   ..$ Platform             : chr [1:4] "ILLUMINA" "ILLUMINA" "ILLUMINA" "ILLUMINA"
##   ..$ Model                : chr [1:4] "Illumina HiSeq 4000" "Illumina HiSeq 4000" "Illumina HiSeq 4000" "Illumina HiSeq 4000"
##   ..$ SRAStudy             : chr [1:4] "SRP222346" "SRP222346" "SRP222346" "SRP222346"
##   ..$ BioProject           : chr [1:4] "PRJNA566176" "PRJNA566176" "PRJNA566176" "PRJNA566176"
##   ..$ Study_Pubmed_id      : int [1:4] 3 3 3 3
##   ..$ ProjectID            : int [1:4] 566176 566176 566176 566176
##   ..$ Sample               : chr [1:4] "SRS5404929" "SRS5404931" "SRS5404932" "SRS5404933"
##   ..$ BioSample            : chr [1:4] "SAMN12783336" "SAMN12783334" "SAMN12783333" "SAMN12783332"
##   ..$ SampleType           : chr [1:4] "simple" "simple" "simple" "simple"
##   ..$ TaxID                : int [1:4] 9606 9606 9606 9606
##   ..$ ScientificName       : chr [1:4] "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens"
##   ..$ g1k_pop_code         : logi [1:4] NA NA NA NA
##   ..$ source               : logi [1:4] NA NA NA NA
##   ..$ g1k_analysis_group   : logi [1:4] NA NA NA NA
##   ..$ Subject_ID           : logi [1:4] NA NA NA NA
##   ..$ Sex                  : logi [1:4] NA NA NA NA
##   ..$ Disease              : logi [1:4] NA NA NA NA
##   ..$ Tumor                : chr [1:4] "no" "no" "no" "no"
##   ..$ Affection_Status     : logi [1:4] NA NA NA NA
##   ..$ Analyte_Type         : logi [1:4] NA NA NA NA
##   ..$ Histological_Type    : logi [1:4] NA NA NA NA
##   ..$ Body_Site            : logi [1:4] NA NA NA NA
##   ..$ CenterName           : chr [1:4] "GEO" "GEO" "GEO" "GEO"
##   ..$ Submission           : chr [1:4] "SRA964414" "SRA964414" "SRA964414" "SRA964414"
##   ..$ dbgap_study_accession: logi [1:4] NA NA NA NA
##   ..$ Consent              : chr [1:4] "public" "public" "public" "public"
##   ..$ RunHash              : chr [1:4] "679F5D4FD993E5632F27FC0C8127970B" "B3CC1CCE229D0D74E282538392193341" "836DEF2E4064B5C3314275D8F3B31CA7" "BCFF9CCCF5E86A982610202D9BEBAB82"
##   ..$ ReadHash             : chr [1:4] "C7D1F4D497C862CFBC7A1FE93F03393A" "09C85CF051642945248374D8FE061A13" "34F36375414F117F84B38A132FD85588" "4F4FFA91983CBB24C270C65C3B8296B7"
##  $ absent         : chr(0)
## SRP222346: dCas9 transduced versus dCas9 with TET1 co-expression in SH-SY5Y cells:dCas9; SRX6867495 , SRX6867496 , SRX6867497:dCas9+TET1; SRX6867498 ,SRX6867499;
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
## List of 8
##  $ GeneCounts     :'data.frame': 58302 obs. of  5 variables:
##   ..$ SRR10139485: int [1:58302] 1 243 0 2 0 0 0 0 0 0 ...
##   ..$ SRR10139486: int [1:58302] 0 166 2 1 0 0 0 0 0 0 ...
##   ..$ SRR10139487: int [1:58302] 0 107 0 1 0 0 0 0 0 0 ...
##   ..$ SRR10139488: int [1:58302] 0 236 9 2 0 0 0 0 0 0 ...
##   ..$ SRR10139489: int [1:58302] 1 129 0 0 0 0 0 0 0 0 ...
##  $ TxCounts       :'data.frame': 180869 obs. of  5 variables:
##   ..$ SRR10139485: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139486: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139487: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139488: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10139489: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GeneInfo       :'data.frame': 58302 obs. of  5 variables:
##   ..$ GeneSymbol     : chr [1:58302] "DDX11L1" "WASH7P" "MIR6859-1" "MIR1302-2HG" ...
##   ..$ mean           : int [1:58302] 973 1351 68 641 138 948 840 1295 2227 1082 ...
##   ..$ median         : int [1:58302] 632 1351 68 712 138 1187 840 1414 2618 629 ...
##   ..$ longest_isoform: int [1:58302] 1657 1351 68 712 138 1187 840 1414 2618 2748 ...
##   ..$ merged         : int [1:58302] 1735 1351 68 1021 138 1219 840 1414 2618 3726 ...
##  $ TxInfo         :'data.frame': 180869 obs. of  3 variables:
##   ..$ GeneID    : chr [1:180869] "ENSG00000237235.2" "ENSG00000228985.1" "ENSG00000223997.1" "ENSG00000282253.1" ...
##   ..$ GeneSymbol: chr [1:180869] "TRDD2" "TRDD3" "TRDD1" "AC239618.6" ...
##   ..$ TxLength  : int [1:180869] 9 13 8 12 12 17 20 19 16 18 ...
##  $ QcMx           :'data.frame': 30 obs. of  5 variables:
##   ..$ SRR10139485: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139486: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139487: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139488: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##   ..$ SRR10139489: chr [1:30] "PE" "Sanger/Illumina1.9" "151" "151" ...
##  $ MetadataSummary:'data.frame': 5 obs. of  7 variables:
##   ..$ QC_summary   : chr [1:5] "PASS" "PASS" "PASS" "PASS" ...
##   ..$ SRX_accession: chr [1:5] "SRX6867495" "SRX6867496" "SRX6867497" "SRX6867498" ...
##   ..$ SRS_accession: chr [1:5] "SRS5404931" "SRS5404932" "SRS5404933" "SRS5404934" ...
##   ..$ SRP_accession: chr [1:5] "SRP222346" "SRP222346" "SRP222346" "SRP222346" ...
##   ..$ Sample_name  : chr [1:5] "GSM4083895" "GSM4083896" "GSM4083897" "GSM4083898" ...
##   ..$ GEO_series   : chr [1:5] "GSE137663" "GSE137663" "GSE137663" "GSE137663" ...
##   ..$ Library_name : logi [1:5] NA NA NA NA NA
##  $ MetadataFull   :'data.frame': 5 obs. of  52 variables:
##   ..$ QC_summary           : chr [1:5] "PASS" "PASS" "PASS" "PASS" ...
##   ..$ SRX_accession        : chr [1:5] "SRX6867495" "SRX6867496" "SRX6867497" "SRX6867498" ...
##   ..$ SRS_accession        : chr [1:5] "SRS5404931" "SRS5404932" "SRS5404933" "SRS5404934" ...
##   ..$ SRP_accession        : chr [1:5] "SRP222346" "SRP222346" "SRP222346" "SRP222346" ...
##   ..$ Sample_name          : chr [1:5] "GSM4083895" "GSM4083896" "GSM4083897" "GSM4083898" ...
##   ..$ GEO_series           : chr [1:5] "GSE137663" "GSE137663" "GSE137663" "GSE137663" ...
##   ..$ Library_name         : logi [1:5] NA NA NA NA NA
##   ..$ SampleName           : chr [1:5] "GSM4083895" "GSM4083896" "GSM4083897" "GSM4083898" ...
##   ..$ ReleaseDate          : chr [1:5] "2020-01-02 09:35:26" "2020-01-02 09:35:26" "2020-01-02 09:35:27" "2020-01-02 09:35:27" ...
##   ..$ LoadDate             : chr [1:5] "2019-09-18 16:31:25" "2019-09-18 16:18:21" "2019-09-18 16:16:55" "2019-09-18 16:27:33" ...
##   ..$ spots                : int [1:5] 35473828 21219962 19974486 32060518 17793882
##   ..$ bases                : num [1:5] 1.07e+10 6.41e+09 6.03e+09 9.68e+09 5.37e+09
##   ..$ spots_with_mates     : int [1:5] 35473828 21219962 19974486 32060518 17793882
##   ..$ avgLength            : int [1:5] 302 302 302 302 302
##   ..$ size_MB              : int [1:5] 4688 2798 2660 4259 2350
##   ..$ AssemblyName         : logi [1:5] NA NA NA NA NA
##   ..$ download_path        : chr [1:5] "https://sra-download.ncbi.nlm.nih.gov/traces/sra20/SRR/009901/SRR10139485" "https://sra-download.ncbi.nlm.nih.gov/traces/sra4/SRR/009901/SRR10139486" "https://sra-download.ncbi.nlm.nih.gov/traces/sra12/SRR/009901/SRR10139487" "https://sra-download.ncbi.nlm.nih.gov/traces/sra11/SRR/009901/SRR10139488" ...
##   ..$ LibraryName          : logi [1:5] NA NA NA NA NA
##   ..$ LibraryStrategy      : chr [1:5] "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq" ...
##   ..$ LibrarySelection     : chr [1:5] "cDNA" "cDNA" "cDNA" "cDNA" ...
##   ..$ LibrarySource        : chr [1:5] "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" ...
##   ..$ LibraryLayout        : chr [1:5] "PAIRED" "PAIRED" "PAIRED" "PAIRED" ...
##   ..$ InsertSize           : int [1:5] 0 0 0 0 0
##   ..$ InsertDev            : num [1:5] 0 0 0 0 0
##   ..$ Platform             : chr [1:5] "ILLUMINA" "ILLUMINA" "ILLUMINA" "ILLUMINA" ...
##   ..$ Model                : chr [1:5] "Illumina HiSeq 4000" "Illumina HiSeq 4000" "Illumina HiSeq 4000" "Illumina HiSeq 4000" ...
##   ..$ SRAStudy             : chr [1:5] "SRP222346" "SRP222346" "SRP222346" "SRP222346" ...
##   ..$ BioProject           : chr [1:5] "PRJNA566176" "PRJNA566176" "PRJNA566176" "PRJNA566176" ...
##   ..$ Study_Pubmed_id      : int [1:5] 3 3 3 3 3
##   ..$ ProjectID            : int [1:5] 566176 566176 566176 566176 566176
##   ..$ Sample               : chr [1:5] "SRS5404931" "SRS5404932" "SRS5404933" "SRS5404934" ...
##   ..$ BioSample            : chr [1:5] "SAMN12783334" "SAMN12783333" "SAMN12783332" "SAMN12783331" ...
##   ..$ SampleType           : chr [1:5] "simple" "simple" "simple" "simple" ...
##   ..$ TaxID                : int [1:5] 9606 9606 9606 9606 9606
##   ..$ ScientificName       : chr [1:5] "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens" ...
##   ..$ g1k_pop_code         : logi [1:5] NA NA NA NA NA
##   ..$ source               : logi [1:5] NA NA NA NA NA
##   ..$ g1k_analysis_group   : logi [1:5] NA NA NA NA NA
##   ..$ Subject_ID           : logi [1:5] NA NA NA NA NA
##   ..$ Sex                  : logi [1:5] NA NA NA NA NA
##   ..$ Disease              : logi [1:5] NA NA NA NA NA
##   ..$ Tumor                : chr [1:5] "no" "no" "no" "no" ...
##   ..$ Affection_Status     : logi [1:5] NA NA NA NA NA
##   ..$ Analyte_Type         : logi [1:5] NA NA NA NA NA
##   ..$ Histological_Type    : logi [1:5] NA NA NA NA NA
##   ..$ Body_Site            : logi [1:5] NA NA NA NA NA
##   ..$ CenterName           : chr [1:5] "GEO" "GEO" "GEO" "GEO" ...
##   ..$ Submission           : chr [1:5] "SRA964414" "SRA964414" "SRA964414" "SRA964414" ...
##   ..$ dbgap_study_accession: logi [1:5] NA NA NA NA NA
##   ..$ Consent              : chr [1:5] "public" "public" "public" "public" ...
##   ..$ RunHash              : chr [1:5] "B3CC1CCE229D0D74E282538392193341" "836DEF2E4064B5C3314275D8F3B31CA7" "BCFF9CCCF5E86A982610202D9BEBAB82" "4DBBC0EF49E7F021A5A9F5747966C6BC" ...
##   ..$ ReadHash             : chr [1:5] "09C85CF051642945248374D8FE061A13" "34F36375414F117F84B38A132FD85588" "4F4FFA91983CBB24C270C65C3B8296B7" "168AF1F23FC8EEE3A7AD7CD7AF8BF088" ...
##  $ absent         : chr(0)
## converting counts to integer mode
## using supplied model matrix
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

## SRP222346: dCas9 transduced versus dCas9 with VP64 co-expression in SH-SY5Y cells:dCas9; SRX6867495 , SRX6867496 , SRX6867497:dCas9+VP64; SRX6867500, SRX6867501, SRX6867502;
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md