Intro

Rationale

In a single sample we need to search for variants in the genes PIK3CA, RASA1, PTEN and TEK.

Methodology

WES to be performed by AGRF (Australia): WES TWIST Assay, 150bp PE reads NovaSeq 6000 with 10Gb data/sample. Bioinformatics analysis to be conducted by Dr Mark Ziemann (Deakin University).

Loading the data

# txdb
txdb <- makeTxDbFromGFF("../../ref/Homo_sapiens.GRCh38.107.gtf.gz")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
# load vcf
vcf <- readVcf("SLS.vcf.gz", "GRCh38")

# remove refcalls
vcf
## class: CollapsedVCF 
## dim: 680528 1 
## rowRanges(vcf):
##   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
##   DataFrame with 1 column: END
## info(header(vcf)):
##        Number Type    Description                                 
##    END 1      Integer End position (for use with symbolic alleles)
## geno(vcf):
##   List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
##           Number Type    Description                                           
##    GT     1      String  Genotype                                              
##    GQ     1      Integer Conditional genotype quality                          
##    DP     1      Integer Read depth                                            
##    MIN_DP 1      Integer Minimum DP observed within the GVCF block.            
##    AD     R      Integer Read depth for each allele                            
##    VAF    A      Float   Variant allele fractions.                             
##    PL     G      Integer Phred-scaled genotype likelihoods rounded to the cl...
##    MED_DP 1      Integer Median DP observed within the GVCF block rounded to...
vcf <- vcf[which(rowRanges(vcf)$FILTER != "RefCall")]
vcf
## class: CollapsedVCF 
## dim: 413006 1 
## rowRanges(vcf):
##   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
##   DataFrame with 1 column: END
## info(header(vcf)):
##        Number Type    Description                                 
##    END 1      Integer End position (for use with symbolic alleles)
## geno(vcf):
##   List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
##           Number Type    Description                                           
##    GT     1      String  Genotype                                              
##    GQ     1      Integer Conditional genotype quality                          
##    DP     1      Integer Read depth                                            
##    MIN_DP 1      Integer Minimum DP observed within the GVCF block.            
##    AD     R      Integer Read depth for each allele                            
##    VAF    A      Float   Variant allele fractions.                             
##    PL     G      Integer Phred-scaled genotype likelihoods rounded to the cl...
##    MED_DP 1      Integer Median DP observed within the GVCF block rounded to...
# use only the chromosome sequences in txdb
validseqs <- seqlevels(vcf)[which(seqlevels(vcf) %in%  seqlevels(txdb))]
vcf <- vcf[which(seqnames( rowRanges(vcf) ) %in% validseqs)]
seqlevels(vcf) <- as.character(unique(seqnames( rowRanges(vcf) ) ))
seqlevels(vcf) %in%  seqlevels(txdb)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
vcf
## class: CollapsedVCF 
## dim: 411559 1 
## rowRanges(vcf):
##   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
##   DataFrame with 1 column: END
## info(header(vcf)):
##        Number Type    Description                                 
##    END 1      Integer End position (for use with symbolic alleles)
## geno(vcf):
##   List of length 8: GT, GQ, DP, MIN_DP, AD, VAF, PL, MED_DP
## geno(header(vcf)):
##           Number Type    Description                                           
##    GT     1      String  Genotype                                              
##    GQ     1      Integer Conditional genotype quality                          
##    DP     1      Integer Read depth                                            
##    MIN_DP 1      Integer Minimum DP observed within the GVCF block.            
##    AD     R      Integer Read depth for each allele                            
##    VAF    A      Float   Variant allele fractions.                             
##    PL     G      Integer Phred-scaled genotype likelihoods rounded to the cl...
##    MED_DP 1      Integer Median DP observed within the GVCF block rounded to...
tab <- table(seqnames(rowRanges(vcf)))
tab
## 
##          1         10         11         12         13         14         15 
##      35077      20115      20889      20652      11229      12894      13511 
##         16         17         18         19          2         20         21 
##      16292      18236       8462      21143      28695      10362       6702 
##         22          3          4          5          6          7          8 
##      10365      23122      20820      19901      23592      22345      17008 
##          9         MT          X          Y KI270728.1 KI270727.1 KI270442.1 
##      17128         48      10616        864         62          9        510 
## GL000009.2 GL000205.2 KI270733.1 GL000216.2 KI270744.1 KI270734.1 KI270731.1 
##         73        171         97        208        135          8         30 
## KI270750.1 KI270721.1 KI270726.1 KI270711.1 KI270713.1 
##         29          3         19         58         79
barplot(tab,horiz=TRUE,las=1,cex.names=0.7)

#tab[order(tab)]

barplot(tab[order(tab)],horiz=TRUE,las=1,cex.names=0.7)
grid()

#coding <- predictCoding(sites, txdb, seqSource = Hsapiens)
Hsapiens <- BSgenome.Hsapiens.UCSC.hg38
seqnames(Hsapiens) <- gsub("chr","",seqnames(Hsapiens))
seqnames(Hsapiens) <- gsub("M","MT",seqnames(Hsapiens))

# fix random contigs
seqnames(Hsapiens)[grep("_",seqnames(Hsapiens))] <-
 gsub("v",".",sapply(strsplit( seqnames(Hsapiens)[grep("_",seqnames(Hsapiens))] , "_"),"[[",2))

seqlevels(Hsapiens) %in% seqlevels(vcf)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [313]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [445] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
suppressWarnings({
  pred <- unique(predictCoding(vcf, txdb, seqSource=Hsapiens))
})

pred
## GRanges object with 25412 ranges and 17 metadata columns:
##                          seqnames    ranges strand | paramRangeID
##                             <Rle> <IRanges>  <Rle> |     <factor>
##            1:69270_A/G          1     69270      + |           NA
##            1:69511_A/G          1     69511      + |           NA
##            1:69897_T/C          1     69897      + |           NA
##           1:924533_A/G          1    924533      + |           NA
##           1:942451_T/C          1    942451      + |           NA
##                    ...        ...       ...    ... .          ...
##   KI270713.1:32102_C/G KI270713.1     32102      - |           NA
##   KI270713.1:32155_G/A KI270713.1     32155      - |           NA
##   KI270713.1:32201_G/A KI270713.1     32201      - |           NA
##   KI270713.1:32315_T/G KI270713.1     32315      - |           NA
##   KI270713.1:32333_C/G KI270713.1     32333      - |           NA
##                                   REF                ALT      QUAL      FILTER
##                        <DNAStringSet> <DNAStringSetList> <numeric> <character>
##            1:69270_A/G              A                  G      25.2        PASS
##            1:69511_A/G              A                  G      34.3        PASS
##            1:69897_T/C              T                  C      19.9        PASS
##           1:924533_A/G              A                  G      28.9        PASS
##           1:942451_T/C              T                  C      42.0        PASS
##                    ...            ...                ...       ...         ...
##   KI270713.1:32102_C/G              C                  G      27.0        PASS
##   KI270713.1:32155_G/A              G                  A      45.8        PASS
##   KI270713.1:32201_G/A              G                  A       7.6        PASS
##   KI270713.1:32315_T/G              T                  G      16.7        PASS
##   KI270713.1:32333_C/G              C                  G      16.3        PASS
##                             varAllele    CDSLOC    PROTEINLOC   QUERYID
##                        <DNAStringSet> <IRanges> <IntegerList> <integer>
##            1:69270_A/G              G       243            81         8
##            1:69511_A/G              G       484           162         9
##            1:69897_T/C              C       870           290        10
##           1:924533_A/G              G       102            34        95
##           1:942451_T/C              C      1516           506       111
##                    ...            ...       ...           ...       ...
##   KI270713.1:32102_C/G              C       271            91    411530
##   KI270713.1:32155_G/A              T       218            73    411531
##   KI270713.1:32201_G/A              T       172            58    411532
##   KI270713.1:32315_T/G              C        58            20    411533
##   KI270713.1:32333_C/G              C        40            14    411534
##                               TXID         CDSID          GENEID   CONSEQUENCE
##                        <character> <IntegerList>     <character>      <factor>
##            1:69270_A/G           9             2 ENSG00000186092 synonymous   
##            1:69511_A/G           9             2 ENSG00000186092 nonsynonymous
##            1:69897_T/C           9             2 ENSG00000186092 synonymous   
##           1:924533_A/G         115             3 ENSG00000187634 synonymous   
##           1:942451_T/C         115            18 ENSG00000187634 nonsynonymous
##                    ...         ...           ...             ...           ...
##   KI270713.1:32102_C/G      251087        277460 ENSG00000277475 nonsynonymous
##   KI270713.1:32155_G/A      251087        277460 ENSG00000277475 nonsynonymous
##   KI270713.1:32201_G/A      251087        277460 ENSG00000277475 synonymous   
##   KI270713.1:32315_T/G      251087        277460 ENSG00000277475 nonsynonymous
##   KI270713.1:32333_C/G      251087        277460 ENSG00000277475 nonsynonymous
##                              REFCODON       VARCODON         REFAA
##                        <DNAStringSet> <DNAStringSet> <AAStringSet>
##            1:69270_A/G            TCA            TCG             S
##            1:69511_A/G            ACA            GCA             T
##            1:69897_T/C            TCT            TCC             S
##           1:924533_A/G            CCA            CCG             P
##           1:942451_T/C            TGG            CGG             W
##                    ...            ...            ...           ...
##   KI270713.1:32102_C/G            GGG            CGG             G
##   KI270713.1:32155_G/A            GCC            GTC             A
##   KI270713.1:32201_G/A            CTG            TTG             L
##   KI270713.1:32315_T/G            AGT            CGT             S
##   KI270713.1:32333_C/G            GTG            CTG             V
##                                VARAA
##                        <AAStringSet>
##            1:69270_A/G             S
##            1:69511_A/G             A
##            1:69897_T/C             S
##           1:924533_A/G             P
##           1:942451_T/C             R
##                    ...           ...
##   KI270713.1:32102_C/G             R
##   KI270713.1:32155_G/A             V
##   KI270713.1:32201_G/A             L
##   KI270713.1:32315_T/G             R
##   KI270713.1:32333_C/G             L
##   -------
##   seqinfo: 40 sequences from GRCh38 genome

Variant summary data.

table(pred$CONSEQUENCE)
## 
##    frameshift      nonsense nonsynonymous    synonymous 
##           269           125         12618         12400
pred2 <- pred[pred$CONSEQUENCE != "synonymous"]

Show mito mutations - this is not important for patient SLS.

mito <- vcf[which(seqnames(vcf) == "MT")]
mito <- vcf[which(rownames(vcf) %in% rownames(mito) )] 
mito_rr <- as.data.frame(rowRanges(mito))
mito_geno <- as.data.frame(geno(mito)$GT)
mito_df <- cbind(mito_rr, mito_geno)
write.table(mito_df,file="mito_df.tsv", sep="\t")
mito_df
##                      seqnames start   end width strand paramRangeID        REF
## MT:146_T/C                 MT   146   146     1      *         <NA>          T
## MT:263_A/G                 MT   263   263     1      *         <NA>          A
## MT:499_G/A                 MT   499   499     1      *         <NA>          G
## MT:513_GCA/G               MT   513   515     3      *         <NA>        GCA
## MT:750_A/G                 MT   750   750     1      *         <NA>          A
## MT:827_A/G                 MT   827   827     1      *         <NA>          A
## MT:1438_A/G                MT  1438  1438     1      *         <NA>          A
## MT:1811_A/G                MT  1811  1811     1      *         <NA>          A
## MT:2150_T/TA               MT  2150  2150     1      *         <NA>          T
## MT:2226_T/TA               MT  2226  2226     1      *         <NA>          T
## MT:2617_A/G                MT  2617  2617     1      *         <NA>          A
## MT:2706_A/G                MT  2706  2706     1      *         <NA>          A
## MT:3010_G/A                MT  3010  3010     1      *         <NA>          G
## MT:3547_A/G                MT  3547  3547     1      *         <NA>          A
## MT:4071_C/T                MT  4071  4071     1      *         <NA>          C
## MT:4755_T/C                MT  4755  4755     1      *         <NA>          T
## MT:4769_A/G                MT  4769  4769     1      *         <NA>          A
## MT:4820_G/A                MT  4820  4820     1      *         <NA>          G
## MT:4977_T/C                MT  4977  4977     1      *         <NA>          T
## MT:6455_C/T                MT  6455  6455     1      *         <NA>          C
## MT:6473_C/T                MT  6473  6473     1      *         <NA>          C
## MT:7028_C/T                MT  7028  7028     1      *         <NA>          C
## MT:7241_A/G                MT  7241  7241     1      *         <NA>          A
## MT:8270_CACCCCCTCT/C       MT  8270  8279    10      *         <NA> CACCCCCTCT
## MT:8440_A/G                MT  8440  8440     1      *         <NA>          A
## MT:8702_C/T                MT  8702  8702     1      *         <NA>          C
## MT:8860_A/G                MT  8860  8860     1      *         <NA>          A
## MT:9950_T/C                MT  9950  9950     1      *         <NA>          T
## MT:10813_C/CA              MT 10813 10813     1      *         <NA>          C
## MT:10873_T/C               MT 10873 10873     1      *         <NA>          T
## MT:11031_G/GA              MT 11031 11031     1      *         <NA>          G
## MT:11299_T/C               MT 11299 11299     1      *         <NA>          T
## MT:11467_A/G               MT 11467 11467     1      *         <NA>          A
## MT:11485_T/C               MT 11485 11485     1      *         <NA>          T
## MT:11719_G/A               MT 11719 11719     1      *         <NA>          G
## MT:12007_G/A               MT 12007 12007     1      *         <NA>          G
## MT:12705_C/T               MT 12705 12705     1      *         <NA>          C
## MT:13590_G/A               MT 13590 13590     1      *         <NA>          G
## MT:14757_T/C               MT 14757 14757     1      *         <NA>          T
## MT:14766_C/T               MT 14766 14766     1      *         <NA>          C
## MT:15326_A/G               MT 15326 15326     1      *         <NA>          A
## MT:15535_C/T               MT 15535 15535     1      *         <NA>          C
## MT:15706_A/G               MT 15706 15706     1      *         <NA>          A
## MT:16182_A/C               MT 16182 16182     1      *         <NA>          A
## MT:16189_T/C               MT 16189 16189     1      *         <NA>          T
## MT:16217_T/C               MT 16217 16217     1      *         <NA>          T
## MT:16319_G/A               MT 16319 16319     1      *         <NA>          G
## MT:16355_C/T               MT 16355 16355     1      *         <NA>          C
##                       ALT QUAL FILTER default
## MT:146_T/C              C 34.9   PASS     0/1
## MT:263_A/G              G 42.1   PASS     1/1
## MT:499_G/A              A 33.1   PASS     0/1
## MT:513_GCA/G            G 33.3   PASS     1/1
## MT:750_A/G              G 43.1   PASS     1/1
## MT:827_A/G              G 40.3   PASS     0/1
## MT:1438_A/G             G 40.2   PASS     1/1
## MT:1811_A/G             G 36.8   PASS     0/1
## MT:2150_T/TA           TA  4.7   PASS     0/1
## MT:2226_T/TA           TA 16.6   PASS     0/1
## MT:2617_A/G          G, T 51.1   PASS     1/2
## MT:2706_A/G             G 38.6   PASS     1/1
## MT:3010_G/A             A 15.8   PASS     0/1
## MT:3547_A/G             G 44.8   PASS     0/1
## MT:4071_C/T             T  7.5   PASS     0/1
## MT:4755_T/C             C 17.7   PASS     0/1
## MT:4769_A/G             G 33.9   PASS     1/1
## MT:4820_G/A             A 30.8   PASS     0/1
## MT:4977_T/C             C 16.2   PASS     0/1
## MT:6455_C/T             T 24.8   PASS     0/1
## MT:6473_C/T             T 43.7   PASS     0/1
## MT:7028_C/T             T 39.1   PASS     1/1
## MT:7241_A/G             G 34.1   PASS     0/1
## MT:8270_CACCCCCTCT/C    C 36.4   PASS     0/1
## MT:8440_A/G             G  6.7   PASS     0/1
## MT:8702_C/T             T 35.8   PASS     0/1
## MT:8860_A/G             G 37.4   PASS     1/1
## MT:9950_T/C             C 43.2   PASS     0/1
## MT:10813_C/CA          CA 11.3   PASS     0/1
## MT:10873_T/C            C 35.3   PASS     0/1
## MT:11031_G/GA          GA 16.6   PASS     0/1
## MT:11299_T/C            C  3.4   PASS     0/1
## MT:11467_A/G            G 39.6   PASS     0/1
## MT:11485_T/C            C 35.5   PASS     0/1
## MT:11719_G/A            A 31.3   PASS     1/1
## MT:12007_G/A            A 12.4   PASS     0/1
## MT:12705_C/T            T 12.7   PASS     0/1
## MT:13590_G/A            A 42.1   PASS     0/1
## MT:14757_T/C            C 32.1   PASS     0/1
## MT:14766_C/T            T 43.8   PASS     1/1
## MT:15326_A/G            G 43.9   PASS     1/1
## MT:15535_C/T            T 33.7   PASS     0/1
## MT:15706_A/G            G 36.6   PASS     0/1
## MT:16182_A/C            C  4.3   PASS     0/1
## MT:16189_T/C            C  6.7   PASS     0/1
## MT:16217_T/C            C  5.6   PASS     0/1
## MT:16319_G/A            A 14.8   PASS     0/1
## MT:16355_C/T            T  6.7   PASS     0/1
mito2 <- mito[names(mito) %in% names(pred2)] 
mito_rr <- data.frame(pred2[names(pred2) %in% names(mito2)])
mito_geno <- as.data.frame(geno(mito2)$GT)
mito_df2 <- cbind(mito_rr, mito_geno)
write.table(mito_df2,file="mito_df2.tsv", sep="\t")                                                      
mito_df2
##               seqnames start   end width strand paramRangeID REF ALT QUAL
## MT:3547_A/G         MT  3547  3547     1      +         <NA>   A   G 44.8
## MT:4769_A/G         MT  4769  4769     1      +         <NA>   A   G 33.9
## MT:8702_C/T         MT  8702  8702     1      +         <NA>   C   T 35.8
## MT:8860_A/G         MT  8860  8860     1      +         <NA>   A   G 37.4
## MT:10813_C/CA       MT 10813 10813     1      +         <NA>   C  CA 11.3
## MT:11031_G/GA       MT 11031 11031     1      +         <NA>   G  GA 16.6
## MT:12007_G/A        MT 12007 12007     1      +         <NA>   G   A 12.4
## MT:14757_T/C        MT 14757 14757     1      +         <NA>   T   C 32.1
## MT:14766_C/T        MT 14766 14766     1      +         <NA>   C   T 43.8
## MT:15326_A/G        MT 15326 15326     1      +         <NA>   A   G 43.9
##               FILTER varAllele CDSLOC.start CDSLOC.end CDSLOC.width PROTEINLOC
## MT:3547_A/G     PASS         G          241        241            1         81
## MT:4769_A/G     PASS         G          300        300            1        100
## MT:8702_C/T     PASS         T          176        176            1         59
## MT:8860_A/G     PASS         G          334        334            1        112
## MT:10813_C/CA   PASS        CA           54         54            1         18
## MT:11031_G/GA   PASS        GA          272        272            1         91
## MT:12007_G/A    PASS         A         1248       1248            1        416
## MT:14757_T/C    PASS         C           11         11            1          4
## MT:14766_C/T    PASS         T           20         20            1          7
## MT:15326_A/G    PASS         G          580        580            1        194
##               QUERYID   TXID  CDSID          GENEID   CONSEQUENCE REFCODON
## MT:3547_A/G    398554 251031 277405 ENSG00000198888 nonsynonymous      ATC
## MT:4769_A/G    398557 251034 277406 ENSG00000198763 nonsynonymous      ATA
## MT:8702_C/T    398566 251041 277410 ENSG00000198899 nonsynonymous      ACC
## MT:8860_A/G    398567 251041 277410 ENSG00000198899 nonsynonymous      ACA
## MT:10813_C/CA  398569 251047 277414 ENSG00000198886    frameshift      TCC
## MT:11031_G/GA  398571 251047 277414 ENSG00000198886    frameshift      CGA
## MT:12007_G/A   398576 251047 277414 ENSG00000198886      nonsense      TGG
## MT:14757_T/C   398579 251052 277416 ENSG00000198727 nonsynonymous      ATA
## MT:14766_C/T   398580 251052 277416 ENSG00000198727 nonsynonymous      ACT
## MT:15326_A/G   398581 251052 277416 ENSG00000198727 nonsynonymous      ACA
##               VARCODON REFAA VARAA default
## MT:3547_A/G        GTC     I     V     0/1
## MT:4769_A/G        ATG     I     M     1/1
## MT:8702_C/T        ATC     T     I     0/1
## MT:8860_A/G        GCA     T     A     1/1
## MT:10813_C/CA     TCCA                 0/1
## MT:11031_G/GA     CGAA                 0/1
## MT:12007_G/A       TGA     W     *     0/1
## MT:14757_T/C       ACA     I     T     0/1
## MT:14766_C/T       ATT     T     I     1/1
## MT:15326_A/G       GCA     T     A     1/1

Look for causative mutations

Now I will filter to identify variants in genes PIK3CA, RASA1, PTEN and TEK.

Gene Name ID
PIK3CA ENSG00000121879
RASA1 ENSG00000145715
PTEN ENSG00000171862
TEK ENSG00000120156
genes <- c("ENSG00000121879","ENSG00000145715","ENSG00000171862","ENSG00000120156")

cand <- pred2[which(pred2$GENEID %in% genes),]

cand
## GRanges object with 3 ranges and 17 metadata columns:
##                   seqnames    ranges strand | paramRangeID            REF
##                      <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
##   10:87864144_G/C       10  87864144      + |           NA              G
##    9:27183465_A/C        9  27183465      + |           NA              A
##    9:27183600_C/T        9  27183600      + |           NA              C
##                                  ALT      QUAL      FILTER      varAllele
##                   <DNAStringSetList> <numeric> <character> <DNAStringSet>
##   10:87864144_G/C                  C      30.7        PASS              C
##    9:27183465_A/C                  C      48.2        PASS              C
##    9:27183600_C/T                  T      45.0        PASS              T
##                      CDSLOC    PROTEINLOC   QUERYID        TXID         CDSID
##                   <IRanges> <IntegerList> <integer> <character> <IntegerList>
##   10:87864144_G/C       194            65     47756      124746        135451
##    9:27183465_A/C       596           199    384517      113908        122719
##    9:27183600_C/T       731           244    384518      113908        122719
##                            GENEID   CONSEQUENCE       REFCODON       VARCODON
##                       <character>      <factor> <DNAStringSet> <DNAStringSet>
##   10:87864144_G/C ENSG00000171862 nonsynonymous            TGC            TCC
##    9:27183465_A/C ENSG00000120156 nonsynonymous            CAG            CCG
##    9:27183600_C/T ENSG00000120156 nonsynonymous            ACA            ATA
##                           REFAA         VARAA
##                   <AAStringSet> <AAStringSet>
##   10:87864144_G/C             C             S
##    9:27183465_A/C             Q             P
##    9:27183600_C/T             T             I
##   -------
##   seqinfo: 40 sequences from GRCh38 genome
Variant identified Variant ID Approx Population frequency
87864144_G/C rs2943772 0.99
9:27183465_A/C rs682632 0.95
9:27183600_C/T rs34032300 0.02

rs2943772 and rs682632 occur too frequently in the population to be pathogenic.

rs34032300 is interesting - dbSNP and ClinVar have this variant classified as benign, but was observed in Multiple cutaneous and mucosal venous malformations. https://www.ncbi.nlm.nih.gov/clinvar/variation/366415/?oq=rs34032300&m=NM_000459.5(TEK):c.1172C%3ET%20(p.Thr391Ile)

Now I will look at the genotype of the variants identified.

gt <- geno(vcf)$GT

gt[which(rownames(gt) %in% names(cand) ),]
## 10:87864144_G/C  9:27183465_A/C  9:27183600_C/T 
##           "1/1"           "1/1"           "0/1"

27183600_C/T / rs34032300 is heterozygous.

Session information

sessionInfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /ceph-g/opt/miniconda3/envs/R40/lib/libopenblasp-r0.3.9.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] SNPlocs.Hsapiens.dbSNP151.GRCh38_0.99.20
##  [2] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
##  [3] GenomicFeatures_1.40.1                  
##  [4] AnnotationDbi_1.50.3                    
##  [5] BSgenome.Hsapiens.UCSC.hg38_1.4.3       
##  [6] BSgenome_1.56.0                         
##  [7] VariantAnnotation_1.34.0                
##  [8] Rsamtools_2.4.0                         
##  [9] SummarizedExperiment_1.18.2             
## [10] DelayedArray_0.14.1                     
## [11] matrixStats_0.62.0                      
## [12] Biobase_2.48.0                          
## [13] eulerr_6.1.1                            
## [14] Biostrings_2.56.0                       
## [15] XVector_0.28.0                          
## [16] R.utils_2.12.0                          
## [17] R.oo_1.25.0                             
## [18] R.methodsS3_1.8.2                       
## [19] rtracklayer_1.48.0                      
## [20] GenomicRanges_1.40.0                    
## [21] GenomeInfoDb_1.24.2                     
## [22] IRanges_2.22.2                          
## [23] S4Vectors_0.26.1                        
## [24] BiocGenerics_0.34.0                     
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.1               bit64_4.0.5              assertthat_0.2.1        
##  [4] askpass_1.1              BiocFileCache_1.12.1     blob_1.2.1              
##  [7] GenomeInfoDbData_1.2.3   yaml_2.2.1               progress_1.2.2          
## [10] pillar_1.8.0             RSQLite_2.2.15           lattice_0.20-41         
## [13] glue_1.6.2               digest_0.6.25            htmltools_0.5.0         
## [16] Matrix_1.2-18            XML_3.99-0.10            pkgconfig_2.0.3         
## [19] biomaRt_2.44.4           zlibbioc_1.34.0          purrr_0.3.4             
## [22] BiocParallel_1.22.0      tibble_3.0.1             openssl_1.4.1           
## [25] generics_0.0.2           ellipsis_0.3.2           cachem_1.0.6            
## [28] cli_3.3.0                magrittr_1.5             crayon_1.3.4            
## [31] memoise_2.0.1            evaluate_0.14            fansi_0.4.1             
## [34] xml2_1.3.2               tools_4.0.1              prettyunits_1.1.1       
## [37] hms_0.5.3                lifecycle_1.0.1          stringr_1.4.0           
## [40] compiler_4.0.1           rlang_1.0.4              grid_4.0.1              
## [43] RCurl_1.98-1.8           rappdirs_0.3.3           bitops_1.0-7            
## [46] rmarkdown_2.3            DBI_1.1.0                curl_4.3                
## [49] R6_2.4.1                 GenomicAlignments_1.24.0 knitr_1.28              
## [52] dplyr_1.0.9              fastmap_1.0.1            bit_4.0.4               
## [55] utf8_1.1.4               stringi_1.4.6            Rcpp_1.0.9              
## [58] vctrs_0.4.1              xfun_0.15                dbplyr_1.4.4            
## [61] tidyselect_1.1.2