In a single sample we need to search for variants in the genes PIK3CA, RASA1, PTEN and TEK.
WES to be performed by AGRF (Australia): WES TWIST Assay, 150bp PE reads NovaSeq 6000 with 10Gb data/sample. Bioinformatics analysis to be conducted by Dr Mark Ziemann (Deakin University).
# txdb
txdb <- makeTxDbFromGFF("../../ref/Homo_sapiens.GRCh38.107.gtf.gz")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
# load vcf
vcf <- readVcf("SLS.vcf.gz", "GRCh38")
# remove refcalls
vcf
## class: CollapsedVCF
## dim: 680528 1
## rowRanges(vcf):
## GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
## DataFrame with 1 column: END
## info(header(vcf)):
## Number Type Description
## END 1 Integer End position (for use with symbolic alleles)
## geno(vcf):
## List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
## Number Type Description
## GT 1 String Genotype
## GQ 1 Integer Conditional genotype quality
## DP 1 Integer Read depth
## MIN_DP 1 Integer Minimum DP observed within the GVCF block.
## AD R Integer Read depth for each allele
## VAF A Float Variant allele fractions.
## PL G Integer Phred-scaled genotype likelihoods rounded to the cl...
## MED_DP 1 Integer Median DP observed within the GVCF block rounded to...
vcf <- vcf[which(rowRanges(vcf)$FILTER != "RefCall")]
vcf
## class: CollapsedVCF
## dim: 413006 1
## rowRanges(vcf):
## GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
## DataFrame with 1 column: END
## info(header(vcf)):
## Number Type Description
## END 1 Integer End position (for use with symbolic alleles)
## geno(vcf):
## List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
## Number Type Description
## GT 1 String Genotype
## GQ 1 Integer Conditional genotype quality
## DP 1 Integer Read depth
## MIN_DP 1 Integer Minimum DP observed within the GVCF block.
## AD R Integer Read depth for each allele
## VAF A Float Variant allele fractions.
## PL G Integer Phred-scaled genotype likelihoods rounded to the cl...
## MED_DP 1 Integer Median DP observed within the GVCF block rounded to...
# use only the chromosome sequences in txdb
validseqs <- seqlevels(vcf)[which(seqlevels(vcf) %in% seqlevels(txdb))]
vcf <- vcf[which(seqnames( rowRanges(vcf) ) %in% validseqs)]
seqlevels(vcf) <- as.character(unique(seqnames( rowRanges(vcf) ) ))
seqlevels(vcf) %in% seqlevels(txdb)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
vcf
## class: CollapsedVCF
## dim: 411559 1
## rowRanges(vcf):
## GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
## DataFrame with 1 column: END
## info(header(vcf)):
## Number Type Description
## END 1 Integer End position (for use with symbolic alleles)
## geno(vcf):
## List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
## Number Type Description
## GT 1 String Genotype
## GQ 1 Integer Conditional genotype quality
## DP 1 Integer Read depth
## MIN_DP 1 Integer Minimum DP observed within the GVCF block.
## AD R Integer Read depth for each allele
## VAF A Float Variant allele fractions.
## PL G Integer Phred-scaled genotype likelihoods rounded to the cl...
## MED_DP 1 Integer Median DP observed within the GVCF block rounded to...
tab <- table(seqnames(rowRanges(vcf)))
tab
##
## 1 10 11 12 13 14 15
## 35077 20115 20889 20652 11229 12894 13511
## 16 17 18 19 2 20 21
## 16292 18236 8462 21143 28695 10362 6702
## 22 3 4 5 6 7 8
## 10365 23122 20820 19901 23592 22345 17008
## 9 MT X Y KI270728.1 KI270727.1 KI270442.1
## 17128 48 10616 864 62 9 510
## GL000009.2 GL000205.2 KI270733.1 GL000216.2 KI270744.1 KI270734.1 KI270731.1
## 73 171 97 208 135 8 30
## KI270750.1 KI270721.1 KI270726.1 KI270711.1 KI270713.1
## 29 3 19 58 79
barplot(tab,horiz=TRUE,las=1,cex.names=0.7)
#tab[order(tab)]
barplot(tab[order(tab)],horiz=TRUE,las=1,cex.names=0.7)
grid()
#coding <- predictCoding(sites, txdb, seqSource = Hsapiens)
Hsapiens <- BSgenome.Hsapiens.UCSC.hg38
seqnames(Hsapiens) <- gsub("chr","",seqnames(Hsapiens))
seqnames(Hsapiens) <- gsub("M","MT",seqnames(Hsapiens))
# fix random contigs
seqnames(Hsapiens)[grep("_",seqnames(Hsapiens))] <-
gsub("v",".",sapply(strsplit( seqnames(Hsapiens)[grep("_",seqnames(Hsapiens))] , "_"),"[[",2))
seqlevels(Hsapiens) %in% seqlevels(vcf)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [445] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
suppressWarnings({
pred <- unique(predictCoding(vcf, txdb, seqSource=Hsapiens))
})
pred
## GRanges object with 25412 ranges and 17 metadata columns:
## seqnames ranges strand | paramRangeID
## <Rle> <IRanges> <Rle> | <factor>
## 1:69270_A/G 1 69270 + | NA
## 1:69511_A/G 1 69511 + | NA
## 1:69897_T/C 1 69897 + | NA
## 1:924533_A/G 1 924533 + | NA
## 1:942451_T/C 1 942451 + | NA
## ... ... ... ... . ...
## KI270713.1:32102_C/G KI270713.1 32102 - | NA
## KI270713.1:32155_G/A KI270713.1 32155 - | NA
## KI270713.1:32201_G/A KI270713.1 32201 - | NA
## KI270713.1:32315_T/G KI270713.1 32315 - | NA
## KI270713.1:32333_C/G KI270713.1 32333 - | NA
## REF ALT QUAL FILTER
## <DNAStringSet> <DNAStringSetList> <numeric> <character>
## 1:69270_A/G A G 25.2 PASS
## 1:69511_A/G A G 34.3 PASS
## 1:69897_T/C T C 19.9 PASS
## 1:924533_A/G A G 28.9 PASS
## 1:942451_T/C T C 42.0 PASS
## ... ... ... ... ...
## KI270713.1:32102_C/G C G 27.0 PASS
## KI270713.1:32155_G/A G A 45.8 PASS
## KI270713.1:32201_G/A G A 7.6 PASS
## KI270713.1:32315_T/G T G 16.7 PASS
## KI270713.1:32333_C/G C G 16.3 PASS
## varAllele CDSLOC PROTEINLOC QUERYID
## <DNAStringSet> <IRanges> <IntegerList> <integer>
## 1:69270_A/G G 243 81 8
## 1:69511_A/G G 484 162 9
## 1:69897_T/C C 870 290 10
## 1:924533_A/G G 102 34 95
## 1:942451_T/C C 1516 506 111
## ... ... ... ... ...
## KI270713.1:32102_C/G C 271 91 411530
## KI270713.1:32155_G/A T 218 73 411531
## KI270713.1:32201_G/A T 172 58 411532
## KI270713.1:32315_T/G C 58 20 411533
## KI270713.1:32333_C/G C 40 14 411534
## TXID CDSID GENEID CONSEQUENCE
## <character> <IntegerList> <character> <factor>
## 1:69270_A/G 9 2 ENSG00000186092 synonymous
## 1:69511_A/G 9 2 ENSG00000186092 nonsynonymous
## 1:69897_T/C 9 2 ENSG00000186092 synonymous
## 1:924533_A/G 115 3 ENSG00000187634 synonymous
## 1:942451_T/C 115 18 ENSG00000187634 nonsynonymous
## ... ... ... ... ...
## KI270713.1:32102_C/G 251087 277460 ENSG00000277475 nonsynonymous
## KI270713.1:32155_G/A 251087 277460 ENSG00000277475 nonsynonymous
## KI270713.1:32201_G/A 251087 277460 ENSG00000277475 synonymous
## KI270713.1:32315_T/G 251087 277460 ENSG00000277475 nonsynonymous
## KI270713.1:32333_C/G 251087 277460 ENSG00000277475 nonsynonymous
## REFCODON VARCODON REFAA
## <DNAStringSet> <DNAStringSet> <AAStringSet>
## 1:69270_A/G TCA TCG S
## 1:69511_A/G ACA GCA T
## 1:69897_T/C TCT TCC S
## 1:924533_A/G CCA CCG P
## 1:942451_T/C TGG CGG W
## ... ... ... ...
## KI270713.1:32102_C/G GGG CGG G
## KI270713.1:32155_G/A GCC GTC A
## KI270713.1:32201_G/A CTG TTG L
## KI270713.1:32315_T/G AGT CGT S
## KI270713.1:32333_C/G GTG CTG V
## VARAA
## <AAStringSet>
## 1:69270_A/G S
## 1:69511_A/G A
## 1:69897_T/C S
## 1:924533_A/G P
## 1:942451_T/C R
## ... ...
## KI270713.1:32102_C/G R
## KI270713.1:32155_G/A V
## KI270713.1:32201_G/A L
## KI270713.1:32315_T/G R
## KI270713.1:32333_C/G L
## -------
## seqinfo: 40 sequences from GRCh38 genome
Variant summary data.
table(pred$CONSEQUENCE)
##
## frameshift nonsense nonsynonymous synonymous
## 269 125 12618 12400
pred2 <- pred[pred$CONSEQUENCE != "synonymous"]
Show mito mutations - this is not important for patient SLS.
mito <- vcf[which(seqnames(vcf) == "MT")]
mito <- vcf[which(rownames(vcf) %in% rownames(mito) )]
mito_rr <- as.data.frame(rowRanges(mito))
mito_geno <- as.data.frame(geno(mito)$GT)
mito_df <- cbind(mito_rr, mito_geno)
write.table(mito_df,file="mito_df.tsv", sep="\t")
mito_df
## seqnames start end width strand paramRangeID REF
## MT:146_T/C MT 146 146 1 * <NA> T
## MT:263_A/G MT 263 263 1 * <NA> A
## MT:499_G/A MT 499 499 1 * <NA> G
## MT:513_GCA/G MT 513 515 3 * <NA> GCA
## MT:750_A/G MT 750 750 1 * <NA> A
## MT:827_A/G MT 827 827 1 * <NA> A
## MT:1438_A/G MT 1438 1438 1 * <NA> A
## MT:1811_A/G MT 1811 1811 1 * <NA> A
## MT:2150_T/TA MT 2150 2150 1 * <NA> T
## MT:2226_T/TA MT 2226 2226 1 * <NA> T
## MT:2617_A/G MT 2617 2617 1 * <NA> A
## MT:2706_A/G MT 2706 2706 1 * <NA> A
## MT:3010_G/A MT 3010 3010 1 * <NA> G
## MT:3547_A/G MT 3547 3547 1 * <NA> A
## MT:4071_C/T MT 4071 4071 1 * <NA> C
## MT:4755_T/C MT 4755 4755 1 * <NA> T
## MT:4769_A/G MT 4769 4769 1 * <NA> A
## MT:4820_G/A MT 4820 4820 1 * <NA> G
## MT:4977_T/C MT 4977 4977 1 * <NA> T
## MT:6455_C/T MT 6455 6455 1 * <NA> C
## MT:6473_C/T MT 6473 6473 1 * <NA> C
## MT:7028_C/T MT 7028 7028 1 * <NA> C
## MT:7241_A/G MT 7241 7241 1 * <NA> A
## MT:8270_CACCCCCTCT/C MT 8270 8279 10 * <NA> CACCCCCTCT
## MT:8440_A/G MT 8440 8440 1 * <NA> A
## MT:8702_C/T MT 8702 8702 1 * <NA> C
## MT:8860_A/G MT 8860 8860 1 * <NA> A
## MT:9950_T/C MT 9950 9950 1 * <NA> T
## MT:10813_C/CA MT 10813 10813 1 * <NA> C
## MT:10873_T/C MT 10873 10873 1 * <NA> T
## MT:11031_G/GA MT 11031 11031 1 * <NA> G
## MT:11299_T/C MT 11299 11299 1 * <NA> T
## MT:11467_A/G MT 11467 11467 1 * <NA> A
## MT:11485_T/C MT 11485 11485 1 * <NA> T
## MT:11719_G/A MT 11719 11719 1 * <NA> G
## MT:12007_G/A MT 12007 12007 1 * <NA> G
## MT:12705_C/T MT 12705 12705 1 * <NA> C
## MT:13590_G/A MT 13590 13590 1 * <NA> G
## MT:14757_T/C MT 14757 14757 1 * <NA> T
## MT:14766_C/T MT 14766 14766 1 * <NA> C
## MT:15326_A/G MT 15326 15326 1 * <NA> A
## MT:15535_C/T MT 15535 15535 1 * <NA> C
## MT:15706_A/G MT 15706 15706 1 * <NA> A
## MT:16182_A/C MT 16182 16182 1 * <NA> A
## MT:16189_T/C MT 16189 16189 1 * <NA> T
## MT:16217_T/C MT 16217 16217 1 * <NA> T
## MT:16319_G/A MT 16319 16319 1 * <NA> G
## MT:16355_C/T MT 16355 16355 1 * <NA> C
## ALT QUAL FILTER default
## MT:146_T/C C 34.9 PASS 0/1
## MT:263_A/G G 42.1 PASS 1/1
## MT:499_G/A A 33.1 PASS 0/1
## MT:513_GCA/G G 33.3 PASS 1/1
## MT:750_A/G G 43.1 PASS 1/1
## MT:827_A/G G 40.3 PASS 0/1
## MT:1438_A/G G 40.2 PASS 1/1
## MT:1811_A/G G 36.8 PASS 0/1
## MT:2150_T/TA TA 4.7 PASS 0/1
## MT:2226_T/TA TA 16.6 PASS 0/1
## MT:2617_A/G G, T 51.1 PASS 1/2
## MT:2706_A/G G 38.6 PASS 1/1
## MT:3010_G/A A 15.8 PASS 0/1
## MT:3547_A/G G 44.8 PASS 0/1
## MT:4071_C/T T 7.5 PASS 0/1
## MT:4755_T/C C 17.7 PASS 0/1
## MT:4769_A/G G 33.9 PASS 1/1
## MT:4820_G/A A 30.8 PASS 0/1
## MT:4977_T/C C 16.2 PASS 0/1
## MT:6455_C/T T 24.8 PASS 0/1
## MT:6473_C/T T 43.7 PASS 0/1
## MT:7028_C/T T 39.1 PASS 1/1
## MT:7241_A/G G 34.1 PASS 0/1
## MT:8270_CACCCCCTCT/C C 36.4 PASS 0/1
## MT:8440_A/G G 6.7 PASS 0/1
## MT:8702_C/T T 35.8 PASS 0/1
## MT:8860_A/G G 37.4 PASS 1/1
## MT:9950_T/C C 43.2 PASS 0/1
## MT:10813_C/CA CA 11.3 PASS 0/1
## MT:10873_T/C C 35.3 PASS 0/1
## MT:11031_G/GA GA 16.6 PASS 0/1
## MT:11299_T/C C 3.4 PASS 0/1
## MT:11467_A/G G 39.6 PASS 0/1
## MT:11485_T/C C 35.5 PASS 0/1
## MT:11719_G/A A 31.3 PASS 1/1
## MT:12007_G/A A 12.4 PASS 0/1
## MT:12705_C/T T 12.7 PASS 0/1
## MT:13590_G/A A 42.1 PASS 0/1
## MT:14757_T/C C 32.1 PASS 0/1
## MT:14766_C/T T 43.8 PASS 1/1
## MT:15326_A/G G 43.9 PASS 1/1
## MT:15535_C/T T 33.7 PASS 0/1
## MT:15706_A/G G 36.6 PASS 0/1
## MT:16182_A/C C 4.3 PASS 0/1
## MT:16189_T/C C 6.7 PASS 0/1
## MT:16217_T/C C 5.6 PASS 0/1
## MT:16319_G/A A 14.8 PASS 0/1
## MT:16355_C/T T 6.7 PASS 0/1
mito2 <- mito[names(mito) %in% names(pred2)]
mito_rr <- data.frame(pred2[names(pred2) %in% names(mito2)])
mito_geno <- as.data.frame(geno(mito2)$GT)
mito_df2 <- cbind(mito_rr, mito_geno)
write.table(mito_df2,file="mito_df2.tsv", sep="\t")
mito_df2
## seqnames start end width strand paramRangeID REF ALT QUAL
## MT:3547_A/G MT 3547 3547 1 + <NA> A G 44.8
## MT:4769_A/G MT 4769 4769 1 + <NA> A G 33.9
## MT:8702_C/T MT 8702 8702 1 + <NA> C T 35.8
## MT:8860_A/G MT 8860 8860 1 + <NA> A G 37.4
## MT:10813_C/CA MT 10813 10813 1 + <NA> C CA 11.3
## MT:11031_G/GA MT 11031 11031 1 + <NA> G GA 16.6
## MT:12007_G/A MT 12007 12007 1 + <NA> G A 12.4
## MT:14757_T/C MT 14757 14757 1 + <NA> T C 32.1
## MT:14766_C/T MT 14766 14766 1 + <NA> C T 43.8
## MT:15326_A/G MT 15326 15326 1 + <NA> A G 43.9
## FILTER varAllele CDSLOC.start CDSLOC.end CDSLOC.width PROTEINLOC
## MT:3547_A/G PASS G 241 241 1 81
## MT:4769_A/G PASS G 300 300 1 100
## MT:8702_C/T PASS T 176 176 1 59
## MT:8860_A/G PASS G 334 334 1 112
## MT:10813_C/CA PASS CA 54 54 1 18
## MT:11031_G/GA PASS GA 272 272 1 91
## MT:12007_G/A PASS A 1248 1248 1 416
## MT:14757_T/C PASS C 11 11 1 4
## MT:14766_C/T PASS T 20 20 1 7
## MT:15326_A/G PASS G 580 580 1 194
## QUERYID TXID CDSID GENEID CONSEQUENCE REFCODON
## MT:3547_A/G 398554 251031 277405 ENSG00000198888 nonsynonymous ATC
## MT:4769_A/G 398557 251034 277406 ENSG00000198763 nonsynonymous ATA
## MT:8702_C/T 398566 251041 277410 ENSG00000198899 nonsynonymous ACC
## MT:8860_A/G 398567 251041 277410 ENSG00000198899 nonsynonymous ACA
## MT:10813_C/CA 398569 251047 277414 ENSG00000198886 frameshift TCC
## MT:11031_G/GA 398571 251047 277414 ENSG00000198886 frameshift CGA
## MT:12007_G/A 398576 251047 277414 ENSG00000198886 nonsense TGG
## MT:14757_T/C 398579 251052 277416 ENSG00000198727 nonsynonymous ATA
## MT:14766_C/T 398580 251052 277416 ENSG00000198727 nonsynonymous ACT
## MT:15326_A/G 398581 251052 277416 ENSG00000198727 nonsynonymous ACA
## VARCODON REFAA VARAA default
## MT:3547_A/G GTC I V 0/1
## MT:4769_A/G ATG I M 1/1
## MT:8702_C/T ATC T I 0/1
## MT:8860_A/G GCA T A 1/1
## MT:10813_C/CA TCCA 0/1
## MT:11031_G/GA CGAA 0/1
## MT:12007_G/A TGA W * 0/1
## MT:14757_T/C ACA I T 0/1
## MT:14766_C/T ATT T I 1/1
## MT:15326_A/G GCA T A 1/1
Now I will filter to identify variants in genes PIK3CA, RASA1, PTEN and TEK.
Gene Name | ID |
---|---|
PIK3CA | ENSG00000121879 |
RASA1 | ENSG00000145715 |
PTEN | ENSG00000171862 |
TEK | ENSG00000120156 |
genes <- c("ENSG00000121879","ENSG00000145715","ENSG00000171862","ENSG00000120156")
cand <- pred2[which(pred2$GENEID %in% genes),]
cand
## GRanges object with 3 ranges and 17 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 10:87864144_G/C 10 87864144 + | NA G
## 9:27183465_A/C 9 27183465 + | NA A
## 9:27183600_C/T 9 27183600 + | NA C
## ALT QUAL FILTER varAllele
## <DNAStringSetList> <numeric> <character> <DNAStringSet>
## 10:87864144_G/C C 30.7 PASS C
## 9:27183465_A/C C 48.2 PASS C
## 9:27183600_C/T T 45.0 PASS T
## CDSLOC PROTEINLOC QUERYID TXID CDSID
## <IRanges> <IntegerList> <integer> <character> <IntegerList>
## 10:87864144_G/C 194 65 47756 124746 135451
## 9:27183465_A/C 596 199 384517 113908 122719
## 9:27183600_C/T 731 244 384518 113908 122719
## GENEID CONSEQUENCE REFCODON VARCODON
## <character> <factor> <DNAStringSet> <DNAStringSet>
## 10:87864144_G/C ENSG00000171862 nonsynonymous TGC TCC
## 9:27183465_A/C ENSG00000120156 nonsynonymous CAG CCG
## 9:27183600_C/T ENSG00000120156 nonsynonymous ACA ATA
## REFAA VARAA
## <AAStringSet> <AAStringSet>
## 10:87864144_G/C C S
## 9:27183465_A/C Q P
## 9:27183600_C/T T I
## -------
## seqinfo: 40 sequences from GRCh38 genome
Variant identified | Variant ID | Approx Population frequency |
---|---|---|
87864144_G/C | rs2943772 | 0.99 |
9:27183465_A/C | rs682632 | 0.95 |
9:27183600_C/T | rs34032300 | 0.02 |
rs2943772 and rs682632 occur too frequently in the population to be pathogenic.
rs34032300 is interesting - dbSNP and ClinVar have this variant classified as benign, but was observed in Multiple cutaneous and mucosal venous malformations. https://www.ncbi.nlm.nih.gov/clinvar/variation/366415/?oq=rs34032300&m=NM_000459.5(TEK):c.1172C%3ET%20(p.Thr391Ile)
Now I will look at the genotype of the variants identified.
gt <- geno(vcf)$GT
gt[which(rownames(gt) %in% names(cand) ),]
## 10:87864144_G/C 9:27183465_A/C 9:27183600_C/T
## "1/1" "1/1" "0/1"
27183600_C/T / rs34032300 is heterozygous.
sessionInfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /ceph-g/opt/miniconda3/envs/R40/lib/libopenblasp-r0.3.9.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] SNPlocs.Hsapiens.dbSNP151.GRCh38_0.99.20
## [2] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
## [3] GenomicFeatures_1.40.1
## [4] AnnotationDbi_1.50.3
## [5] BSgenome.Hsapiens.UCSC.hg38_1.4.3
## [6] BSgenome_1.56.0
## [7] VariantAnnotation_1.34.0
## [8] Rsamtools_2.4.0
## [9] SummarizedExperiment_1.18.2
## [10] DelayedArray_0.14.1
## [11] matrixStats_0.62.0
## [12] Biobase_2.48.0
## [13] eulerr_6.1.1
## [14] Biostrings_2.56.0
## [15] XVector_0.28.0
## [16] R.utils_2.12.0
## [17] R.oo_1.25.0
## [18] R.methodsS3_1.8.2
## [19] rtracklayer_1.48.0
## [20] GenomicRanges_1.40.0
## [21] GenomeInfoDb_1.24.2
## [22] IRanges_2.22.2
## [23] S4Vectors_0.26.1
## [24] BiocGenerics_0.34.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.1 bit64_4.0.5 assertthat_0.2.1
## [4] askpass_1.1 BiocFileCache_1.12.1 blob_1.2.1
## [7] GenomeInfoDbData_1.2.3 yaml_2.2.1 progress_1.2.2
## [10] pillar_1.8.0 RSQLite_2.2.15 lattice_0.20-41
## [13] glue_1.6.2 digest_0.6.25 htmltools_0.5.0
## [16] Matrix_1.2-18 XML_3.99-0.10 pkgconfig_2.0.3
## [19] biomaRt_2.44.4 zlibbioc_1.34.0 purrr_0.3.4
## [22] BiocParallel_1.22.0 tibble_3.0.1 openssl_1.4.1
## [25] generics_0.0.2 ellipsis_0.3.2 cachem_1.0.6
## [28] cli_3.3.0 magrittr_1.5 crayon_1.3.4
## [31] memoise_2.0.1 evaluate_0.14 fansi_0.4.1
## [34] xml2_1.3.2 tools_4.0.1 prettyunits_1.1.1
## [37] hms_0.5.3 lifecycle_1.0.1 stringr_1.4.0
## [40] compiler_4.0.1 rlang_1.0.4 grid_4.0.1
## [43] RCurl_1.98-1.8 rappdirs_0.3.3 bitops_1.0-7
## [46] rmarkdown_2.3 DBI_1.1.0 curl_4.3
## [49] R6_2.4.1 GenomicAlignments_1.24.0 knitr_1.28
## [52] dplyr_1.0.9 fastmap_1.0.1 bit_4.0.4
## [55] utf8_1.1.4 stringi_1.4.6 Rcpp_1.0.9
## [58] vctrs_0.4.1 xfun_0.15 dbplyr_1.4.4
## [61] tidyselect_1.1.2