Source: https://github.com/markziemann/gene_sig_commons

Background

This analysis is an example of how gene signatures can be generated from publicly available omics data. In this example, we obtain data from the DEE2 database, perform differential expression analysis with DESeq2, and then return the gene signature.

# Load the project's helper functions. main(), which is called further down,
# is not defined in this script, so it presumably comes from this file
# (TODO confirm).
source("../de_functions.R")

# Species identifier handed to getDEE2::getDEE2Metadata() later in this script.
# Only human is handled for now.
# NOTE(review): the sourced helpers may also read this global -- keep the name.
SPECIES <- "hsapiens"

# Load the contrast definition file as a character vector, one element per line.
x <- readLines("../contrasts/diabetes.md")

# Keep only the lines that contain "RP"; these are treated as the entries
# that have a contrast set up (presumably because each one names a study
# accession such as SRP221142 -- verify against the contrast file).
x <- grep("RP", x, value = TRUE)

# Print the number of contrasts that will be processed.
length(x)
## [1] 114
# Fetch the DEE2 metadata for the chosen species a single time, up front, so
# the same object can be handed to every main() call below.
mdat <- getDEE2::getDEE2Metadata(species = SPECIES)

# To prototype on a single contrast first, run: main(x[[2]],mdat)

# Process every contrast line; the result is a list with one element per
# entry of x, in the same order.
data <- lapply(
  x,
  function(contrast) {
    main(contrast, mdat)
  }
)
## SRP221142:Genes differentially regulated by pro-inflammatory cytokines:Ctrl; SRX6821381,SRX6821383,SRX6821385,SRX6821387,SRX6821389:PIC; SRX6821382,SRX6821384,SRX6821386,SRX6821388,SRX6821390
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
## List of 8
##  $ GeneCounts     :'data.frame': 58302 obs. of  10 variables:
##   ..$ SRR10088643: int [1:58302] 3 335 28 3 0 0 3 0 0 1 ...
##   ..$ SRR10088644: int [1:58302] 0 144 17 0 0 0 2 0 0 2 ...
##   ..$ SRR10088645: int [1:58302] 0 271 18 2 0 0 5 1 0 3 ...
##   ..$ SRR10088646: int [1:58302] 0 343 23 1 0 0 10 0 0 2 ...
##   ..$ SRR10088647: int [1:58302] 3 128 11 1 0 0 1 0 0 2 ...
##   ..$ SRR10088648: int [1:58302] 2 365 51 4 0 0 0 0 0 1 ...
##   ..$ SRR10088649: int [1:58302] 1 280 14 3 0 0 0 0 0 2 ...
##   ..$ SRR10088650: int [1:58302] 0 305 24 1 0 2 4 0 0 0 ...
##   ..$ SRR10088651: int [1:58302] 3 442 24 2 0 0 0 0 0 1 ...
##   ..$ SRR10088652: int [1:58302] 1 383 26 3 0 0 1 0 0 1 ...
##  $ TxCounts       :'data.frame': 180869 obs. of  10 variables:
##   ..$ SRR10088643: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088644: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088645: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088646: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088647: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088648: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088649: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088650: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088651: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ SRR10088652: num [1:180869] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GeneInfo       :'data.frame': 58302 obs. of  5 variables:
##   ..$ GeneSymbol     : chr [1:58302] "DDX11L1" "WASH7P" "MIR6859-1" "MIR1302-2HG" ...
##   ..$ mean           : int [1:58302] 973 1351 68 641 138 948 840 1295 2227 1082 ...
##   ..$ median         : int [1:58302] 632 1351 68 712 138 1187 840 1414 2618 629 ...
##   ..$ longest_isoform: int [1:58302] 1657 1351 68 712 138 1187 840 1414 2618 2748 ...
##   ..$ merged         : int [1:58302] 1735 1351 68 1021 138 1219 840 1414 2618 3726 ...
##  $ TxInfo         :'data.frame': 180869 obs. of  3 variables:
##   ..$ GeneID    : chr [1:180869] "ENSG00000237235.2" "ENSG00000228985.1" "ENSG00000223997.1" "ENSG00000282253.1" ...
##   ..$ GeneSymbol: chr [1:180869] "TRDD2" "TRDD3" "TRDD1" "AC239618.6" ...
##   ..$ TxLength  : int [1:180869] 9 13 8 12 12 17 20 19 16 18 ...
##  $ QcMx           :'data.frame': 30 obs. of  10 variables:
##   ..$ SRR10088643: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088644: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088645: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088646: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088647: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088648: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088649: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088650: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088651: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##   ..$ SRR10088652: chr [1:30] "PE" "Sanger/Illumina1.9" "101" "101" ...
##  $ MetadataSummary:'data.frame': 10 obs. of  7 variables:
##   ..$ QC_summary   : chr [1:10] "WARN(8)" "WARN(8)" "WARN(8)" "WARN(8)" ...
##   ..$ SRX_accession: chr [1:10] "SRX6821381" "SRX6821382" "SRX6821383" "SRX6821384" ...
##   ..$ SRS_accession: chr [1:10] "SRS5363958" "SRS5363959" "SRS5363960" "SRS5363961" ...
##   ..$ SRP_accession: chr [1:10] "SRP221142" "SRP221142" "SRP221142" "SRP221142" ...
##   ..$ Sample_name  : chr [1:10] "GSM4067526" "GSM4067527" "GSM4067528" "GSM4067529" ...
##   ..$ GEO_series   : chr [1:10] "GSE137136" "GSE137136" "GSE137136" "GSE137136" ...
##   ..$ Library_name : logi [1:10] NA NA NA NA NA NA ...
##  $ MetadataFull   :'data.frame': 10 obs. of  52 variables:
##   ..$ QC_summary           : chr [1:10] "WARN(8)" "WARN(8)" "WARN(8)" "WARN(8)" ...
##   ..$ SRX_accession        : chr [1:10] "SRX6821381" "SRX6821382" "SRX6821383" "SRX6821384" ...
##   ..$ SRS_accession        : chr [1:10] "SRS5363958" "SRS5363959" "SRS5363960" "SRS5363961" ...
##   ..$ SRP_accession        : chr [1:10] "SRP221142" "SRP221142" "SRP221142" "SRP221142" ...
##   ..$ Sample_name          : chr [1:10] "GSM4067526" "GSM4067527" "GSM4067528" "GSM4067529" ...
##   ..$ GEO_series           : chr [1:10] "GSE137136" "GSE137136" "GSE137136" "GSE137136" ...
##   ..$ Library_name         : logi [1:10] NA NA NA NA NA NA ...
##   ..$ SampleName           : chr [1:10] "GSM4067526" "GSM4067527" "GSM4067528" "GSM4067529" ...
##   ..$ ReleaseDate          : chr [1:10] "2019-09-20 16:07:39" "2019-09-20 16:07:39" "2019-09-20 16:07:39" "2019-09-20 16:07:39" ...
##   ..$ LoadDate             : chr [1:10] "2019-09-09 18:36:18" "2019-09-09 17:50:42" "2019-09-09 17:56:00" "2019-09-09 18:04:08" ...
##   ..$ spots                : int [1:10] 119898489 54472778 85122073 114821406 60891015 116000000 72665319 85320664 118787727 90312380
##   ..$ bases                : num [1:10] 2.42e+10 1.10e+10 1.72e+10 2.32e+10 1.23e+10 ...
##   ..$ spots_with_mates     : int [1:10] 119898489 54472778 85122073 114821406 60891015 116000000 72665319 85320664 118787727 90312380
##   ..$ avgLength            : int [1:10] 202 202 202 202 202 202 202 202 202 202
##   ..$ size_MB              : int [1:10] 17736 8138 12413 16713 8830 16830 9333 10928 16098 12228
##   ..$ AssemblyName         : logi [1:10] NA NA NA NA NA NA ...
##   ..$ download_path        : chr [1:10] "https://sra-download.ncbi.nlm.nih.gov/traces/sra78/SRR/009852/SRR10088643" "https://sra-download.ncbi.nlm.nih.gov/traces/sra71/SRR/009852/SRR10088644" "https://sra-download.ncbi.nlm.nih.gov/traces/sra78/SRR/009852/SRR10088645" "https://sra-download.ncbi.nlm.nih.gov/traces/sra78/SRR/009852/SRR10088646" ...
##   ..$ LibraryName          : logi [1:10] NA NA NA NA NA NA ...
##   ..$ LibraryStrategy      : chr [1:10] "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq" ...
##   ..$ LibrarySelection     : chr [1:10] "cDNA" "cDNA" "cDNA" "cDNA" ...
##   ..$ LibrarySource        : chr [1:10] "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" "TRANSCRIPTOMIC" ...
##   ..$ LibraryLayout        : chr [1:10] "PAIRED" "PAIRED" "PAIRED" "PAIRED" ...
##   ..$ InsertSize           : int [1:10] 0 0 0 0 0 0 0 0 0 0
##   ..$ InsertDev            : num [1:10] 0 0 0 0 0 0 0 0 0 0
##   ..$ Platform             : chr [1:10] "ILLUMINA" "ILLUMINA" "ILLUMINA" "ILLUMINA" ...
##   ..$ Model                : chr [1:10] "Illumina HiSeq 2000" "Illumina HiSeq 2000" "Illumina HiSeq 2000" "Illumina HiSeq 2000" ...
##   ..$ SRAStudy             : chr [1:10] "SRP221142" "SRP221142" "SRP221142" "SRP221142" ...
##   ..$ BioProject           : chr [1:10] "PRJNA564614" "PRJNA564614" "PRJNA564614" "PRJNA564614" ...
##   ..$ Study_Pubmed_id      : logi [1:10] NA NA NA NA NA NA ...
##   ..$ ProjectID            : int [1:10] 564614 564614 564614 564614 564614 564614 564614 564614 564614 564614
##   ..$ Sample               : chr [1:10] "SRS5363958" "SRS5363959" "SRS5363960" "SRS5363961" ...
##   ..$ BioSample            : chr [1:10] "SAMN12718008" "SAMN12718007" "SAMN12718006" "SAMN12718005" ...
##   ..$ SampleType           : chr [1:10] "simple" "simple" "simple" "simple" ...
##   ..$ TaxID                : int [1:10] 9606 9606 9606 9606 9606 9606 9606 9606 9606 9606
##   ..$ ScientificName       : chr [1:10] "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens" ...
##   ..$ g1k_pop_code         : logi [1:10] NA NA NA NA NA NA ...
##   ..$ source               : logi [1:10] NA NA NA NA NA NA ...
##   ..$ g1k_analysis_group   : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Subject_ID           : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Sex                  : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Disease              : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Tumor                : chr [1:10] "no" "no" "no" "no" ...
##   ..$ Affection_Status     : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Analyte_Type         : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Histological_Type    : logi [1:10] NA NA NA NA NA NA ...
##   ..$ Body_Site            : logi [1:10] NA NA NA NA NA NA ...
##   ..$ CenterName           : chr [1:10] "GEO" "GEO" "GEO" "GEO" ...
##   ..$ Submission           : chr [1:10] "SRA958993" "SRA958993" "SRA958993" "SRA958993" ...
##   ..$ dbgap_study_accession: logi [1:10] NA NA NA NA NA NA ...
##   ..$ Consent              : chr [1:10] "public" "public" "public" "public" ...
##   ..$ RunHash              : chr [1:10] "E0BCDFF87AAB7BAEA59B9F907C237BC8" "A50FAC06690E5B494BE9551D7643B984" "473BB794894549C7D3362D29B94E4641" "A57C2ACE1B79211B1EF584275D0832F5" ...
##   ..$ ReadHash             : chr [1:10] "46E8B8C0063766F6C55DC0292985725B" "B2CEB9A26C261BAAC73F6122330EC1B8" "D66B1EB4FCBDBF9FCA98ED5851D2A8C4" "A509DE4EE14FE10C954B5381AC352867" ...
##  $ absent         : chr(0)
## converting counts to integer mode
## using supplied model matrix
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing