Loading the data

# reading in the GFF files
gff <- read.table("Mus_musculus.GRCm38.98.gff3", sep = "\t", quote = "")

v75 <- readVcf("VCF_SRP199233_GRCm39/SRX5884275_svtyper.vcf", "GRCm39")
v76 <- readVcf("VCF_SRP199233_GRCm39/SRX5884276_svtyper.vcf", "GRCm39")
v77 <- readVcf("VCF_SRP199233_GRCm39/SRX5884277_svtyper.vcf", "GRCm39")
v78 <- readVcf("VCF_SRP199233_GRCm39/SRX5884278_svtyper.vcf", "GRCm39")
v79 <- readVcf("VCF_SRP199233_GRCm39/SRX5884279_svtyper.vcf", "GRCm39")
v80 <- readVcf("VCF_SRP199233_GRCm39/SRX5884280_svtyper.vcf", "GRCm39")

r75 <- rowRanges(v75)
r76 <- rowRanges(v76)
r77 <- rowRanges(v77)
r78 <- rowRanges(v78)
r79 <- rowRanges(v79)
r80 <- rowRanges(v80)

g75 <- geno(v75)$GT
g76 <- geno(v76)$GT
g77 <- geno(v77)$GT
g78 <- geno(v78)$GT
g79 <- geno(v79)$GT
g80 <- geno(v80)$GT

Reorganising the data

df <- data.frame(rowRanges(v75),info(v75))  #taking a look into the content for one of the vcf file.
df <- df[,c("seqnames","start","END")]
head(df)
##   seqnames    start      END
## 1        1 26725045 26727528
## 2        1 39502386 39502399
## 3        1 39740580 39742018
## 4        1 39787083 39787266
## 5        1 39867178 39869113
## 6        1 39875834 39875909
alt <- unlist(as.vector(elementMetadata(v75)$ALT))
svtype <- info(v75)$SVTYPE

gt <- data.frame(g75,g76,g77,g78,g79,g80)
colnames(gt) <- c("g75","g76","g77","g78","g79","g80") 

qual <- data.frame(r75$QUAL,r76$QUAL,r77$QUAL,r78$QUAL,r79$QUAL,r80$QUAL)

gt2 <- data.frame(df,alt,svtype,gt,qual) 

head(gt2,20)
##    seqnames    start      END   alt svtype g75 g76 g77 g78 g79 g80 r75.QUAL
## 1         1 26725045 26727528 <DUP>    DUP 1/1 0/0 1/1 1/1 ./. 0/0    17.35
## 2         1 39502386 39502399 <DEL>    DEL ./. 0/0 0/0 ./. 0/0 0/0     0.00
## 3         1 39740580 39742018 <DEL>    DEL ./. 1/1 ./. 1/1 0/0 1/1     0.00
## 4         1 39787083 39787266 <DEL>    DEL ./. 1/1 ./. ./. 0/0 0/0     0.00
## 5         1 39867178 39869113 <DEL>    DEL 1/1 ./. ./. ./. 0/0 0/0    31.46
## 6         1 39875834 39875909 <DEL>    DEL 0/0 ./. 0/0 0/0 0/0 0/0     2.04
## 7         1 40181383 40187403 <DEL>    DEL 0/0 1/1 ./. 1/1 1/1 ./.     4.77
## 8         1 40413280 40413335 <DEL>    DEL ./. 0/0 ./. 1/1 1/1 1/1     0.00
## 9         1 40604578 40604663 <DEL>    DEL 1/1 1/1 1/1 0/0 0/0 1/1    60.25
## 10        1 40737159 40741612 <DEL>    DEL 0/0 0/0 ./. 1/1 0/0 ./.     4.77
## 11        1 42176587 42182958 <DEL>    DEL ./. ./. 0/0 0/0 1/1 ./.     0.00
## 12        1 42685262 42687148 <DEL>    DEL 1/1 ./. ./. ./. 0/0 ./.    60.25
## 13        1 42838780 42838966 <DEL>    DEL 0/0 ./. 0/0 0/0 1/1 0/0     4.77
## 14        1 43069960 43070355 <DEL>    DEL ./. 1/1 ./. ./. 1/1 ./.     0.00
## 15        1 43383064 43383173 <DEL>    DEL ./. ./. 1/1 0/0 ./. 0/0     0.00
## 16        1 44768031 44769118 <DEL>    DEL ./. ./. 0/0 0/0 ./. 0/0     0.00
## 17        1 45722421 45722587 <DEL>    DEL 0/0 ./. 1/1 ./. 1/1 1/1     4.77
## 18        1 51290935 51296630 <DEL>    DEL 0/0 1/1 ./. ./. 0/0 1/1     4.77
## 19        1 51584071 51585910 <DEL>    DEL 1/1 ./. ./. 1/1 ./. ./.    31.46
## 20        1 51691097 51693813 <DEL>    DEL ./. 0/0 ./. 1/1 ./. ./.     0.00
##    r76.QUAL r77.QUAL r78.QUAL r79.QUAL r80.QUAL
## 1      4.77    17.35    46.54     0.00     3.24
## 2      4.77     4.77     0.00     4.77     4.77
## 3     31.46     0.00    31.46     4.77   236.38
## 4     89.31     0.00     0.00     4.77     4.77
## 5      0.00     0.00     0.00     4.77     4.77
## 6      0.00     4.77     0.07     1.01     2.04
## 7     60.25     0.00    31.46    89.31     0.00
## 8      4.77     0.00    31.46   147.94    60.25
## 9     31.46    60.25     4.77     4.77    31.46
## 10     4.77     0.00    60.25     4.77     0.00
## 11     0.00     4.77     4.77    31.46     0.00
## 12     0.00     0.00     0.00     4.77     0.00
## 13     0.00     4.77     4.77    31.46     4.77
## 14    31.46     0.00     0.00    60.25     0.00
## 15     0.00    31.46     4.77     0.00     4.77
## 16     0.00     4.77     4.77     0.00     4.77
## 17     0.00    31.46     0.00    31.46    31.46
## 18    31.46     0.00     0.00     4.77    31.46
## 19     0.00     0.00    31.46     0.00     0.00
## 20     4.77     0.00    60.25     0.00     0.00
gt3 <- gt2[which(apply(qual,1,min)>=20),]
head(gt3,20)
##         seqnames     start       END           alt svtype g75 g76 g77 g78 g79
## 53_1           1  16078454        NA N]1:78585581]    BND 1/1 1/1 1/1 1/1 1/1
## 53_2           1  78585581        NA N]1:16078454]    BND 1/1 1/1 1/1 1/1 1/1
## 74             1  88198043  88199163         <DEL>    DEL 1/1 1/1 1/1 0/1 1/1
## 261_1          2  98493171        NA N]2:98495562]    BND 1/1 1/1 1/1 1/1 1/1
## 261_2          2  98495562        NA N]2:98493171]    BND 1/1 1/1 1/1 1/1 1/1
## 262            2  98493172  98496582         <DEL>    DEL 1/1 1/1 1/1 1/1 1/1
## 269_1          2  98492824        NA N]2:98497676]    BND 0/1 0/1 0/1 0/1 0/1
## 269_2          2  98497676        NA N]2:98492824]    BND 0/1 0/1 0/1 0/1 0/1
## 274            2  98497402  98497677         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1205          10 128277743 128277834         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1217          11   3148658   3148711         <DEL>    DEL 1/1 1/1 1/1 1/1 1/1
## 1418          13  48407019  48423790         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1506          15  73399206  73399288         <DEL>    DEL 1/1 1/1 1/1 1/1 1/1
## 1597          17  13524840  13525181         <DUP>    DUP 0/1 0/1 1/1 0/1 0/1
## 1893           X  75642359  75642466         <DEL>    DEL 0/1 0/1 0/1 0/1 0/1
## 1894           X  75642288  75642741         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1895           X  75642155  75642741         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1896           X  75642466  75642741         <DUP>    DUP 1/1 1/1 1/1 1/1 1/1
## 1901           Y  11858017  11858363         <DUP>    DUP 0/1 1/1 0/1 1/1 0/1
## 1931  MU069435.1      5791      6073         <DEL>    DEL 0/1 0/1 0/1 0/1 0/1
##       g80 r75.QUAL r76.QUAL r77.QUAL r78.QUAL r79.QUAL r80.QUAL
## 53_1  1/1   649.93   443.14   590.85  1743.00  1004.44  1093.07
## 53_2  1/1   649.93   443.14   590.85  1743.00  1004.44  1093.07
## 74    0/1    31.46    31.46    60.25    25.33   138.74    75.87
## 261_1 1/1   118.56   147.94   118.56   590.85   295.44   236.38
## 261_2 1/1   118.56   147.94   118.56   590.85   295.44   236.38
## 262   1/1  8655.93  7208.35  6647.05 21034.21 15627.94 16632.39
## 269_1 0/1  1124.42   926.75   880.01  1504.26  1472.22  1169.32
## 269_2 0/1  1124.42   926.75   880.01  1504.26  1472.22  1169.32
## 274   1/1 36091.50 27559.29 24188.83 55896.96 36054.86 46082.68
## 1205  1/1   592.21   304.58   456.86  1641.27  1076.09   864.61
## 1217  1/1   472.68    60.25    89.31   265.90   226.54   147.94
## 1418  1/1    30.30    31.80    31.80   189.40   258.89   106.72
## 1506  1/1    31.46    31.46    31.46    89.31    60.25    31.46
## 1597  0/1    34.58    40.45    91.73   101.63    55.09    65.38
## 1893  0/1   278.72   152.79    83.85   323.30   464.45   221.55
## 1894  1/1   716.11   470.96   525.46  2275.09  1256.35  1501.50
## 1895  1/1   715.75   487.32   487.32  2269.09  1355.36  1629.48
## 1896  1/1  1041.97   654.61   754.57  2713.25  1864.10  1772.50
## 1901  0/1    44.54    91.73    22.06   308.18   126.23    82.97
## 1931  0/1   269.77   113.91    65.95   176.58   104.83    65.81
gt <- gt[which(apply(qual,1,min)>=20),]
head(gt,20)
##       g75 g76 g77 g78 g79 g80
## 53_1  1/1 1/1 1/1 1/1 1/1 1/1
## 53_2  1/1 1/1 1/1 1/1 1/1 1/1
## 74    1/1 1/1 1/1 0/1 1/1 0/1
## 261_1 1/1 1/1 1/1 1/1 1/1 1/1
## 261_2 1/1 1/1 1/1 1/1 1/1 1/1
## 262   1/1 1/1 1/1 1/1 1/1 1/1
## 269_1 0/1 0/1 0/1 0/1 0/1 0/1
## 269_2 0/1 0/1 0/1 0/1 0/1 0/1
## 274   1/1 1/1 1/1 1/1 1/1 1/1
## 1205  1/1 1/1 1/1 1/1 1/1 1/1
## 1217  1/1 1/1 1/1 1/1 1/1 1/1
## 1418  1/1 1/1 1/1 1/1 1/1 1/1
## 1506  1/1 1/1 1/1 1/1 1/1 1/1
## 1597  0/1 0/1 1/1 0/1 0/1 0/1
## 1893  0/1 0/1 0/1 0/1 0/1 0/1
## 1894  1/1 1/1 1/1 1/1 1/1 1/1
## 1895  1/1 1/1 1/1 1/1 1/1 1/1
## 1896  1/1 1/1 1/1 1/1 1/1 1/1
## 1901  0/1 1/1 0/1 1/1 0/1 0/1
## 1931  0/1 0/1 0/1 0/1 0/1 0/1
rd <- rowRanges(v75)
rd <- rd[which(apply(qual,1,min)>=20),]
head(rd,20)
## GRanges object with 20 ranges and 5 metadata columns:
##           seqnames    ranges strand | paramRangeID            REF
##              <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
##    53_1          1  16078454      * |           NA              N
##    53_2          1  78585581      * |           NA              N
##      74          1  88198043      * |           NA              N
##   261_1          2  98493171      * |           NA              N
##   261_2          2  98495562      * |           NA              N
##     ...        ...       ...    ... .          ...            ...
##    1894          X  75642288      * |           NA              N
##    1895          X  75642155      * |           NA              N
##    1896          X  75642466      * |           NA              N
##    1901          Y  11858017      * |           NA              N
##    1931 MU069435.1      5791      * |           NA              N
##                     ALT      QUAL      FILTER
##         <CharacterList> <numeric> <character>
##    53_1   N]1:78585581]    649.93           .
##    53_2   N]1:16078454]    649.93           .
##      74           <DEL>     31.46           .
##   261_1   N]2:98495562]    118.56           .
##   261_2   N]2:98493171]    118.56           .
##     ...             ...       ...         ...
##    1894           <DUP>    716.11           .
##    1895           <DUP>    715.75           .
##    1896           <DUP>   1041.97           .
##    1901           <DUP>     44.54           .
##    1931           <DEL>    269.77           .
##   -------
##   seqinfo: 34 sequences from GRCm39 genome; no seqlengths
rows <- which(apply(gt,1,function(x){
  sd(as.numeric(as.factor(x)))
})!=0)

gt3 <- gt3[rows,]
gt <- gt[rows,]

gtx <- apply(gt,2,function(x) { as.numeric(as.factor(x))})
rownames(gtx) <- paste(gt3$seqnames,gt3$start,gt3$END,gt3$alt)
gtx
##                                        g75 g76 g77 g78 g79 g80
## 1 88198043 88199163 <DEL>                2   2   2   1   2   1
## 17 13524840 13525181 <DUP>               1   1   2   1   1   1
## Y 11858017 11858363 <DUP>                1   2   1   2   1   1
## 4 28176942 NA ]X:75642741]N              1   2   1   1   1   1
## X 75642741 NA N[4:28176942[              1   2   1   1   1   1
## GL456392.1 19292 NA ]GL456396.1:9019]N   1   2   2   1   1   1
## GL456396.1 9019 NA N[GL456392.1:19292[   1   2   2   1   1   1
rd <- rd[rows,]
rd
## GRanges object with 7 ranges and 5 metadata columns:
##            seqnames    ranges strand | paramRangeID            REF
##               <Rle> <IRanges>  <Rle> |     <factor> <DNAStringSet>
##       74          1  88198043      * |           NA              N
##     1597         17  13524840      * |           NA              N
##     1901          Y  11858017      * |           NA              N
##   2032_1          4  28176942      * |           NA              N
##   2032_2          X  75642741      * |           NA              N
##   2127_1 GL456392.1     19292      * |           NA              N
##   2127_2 GL456396.1      9019      * |           NA              N
##                          ALT      QUAL      FILTER
##              <CharacterList> <numeric> <character>
##       74               <DEL>     31.46           .
##     1597               <DUP>     34.58           .
##     1901               <DUP>     44.54           .
##   2032_1       ]X:75642741]N    204.60           .
##   2032_2       N[4:28176942[    204.60           .
##   2127_1  ]GL456396.1:9019]N     48.50           .
##   2127_2 N[GL456392.1:19292[     48.50           .
##   -------
##   seqinfo: 34 sequences from GRCm39 genome; no seqlengths
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)
## Registered S3 method overwritten by 'gplots':
##   method    from  
##   plot.venn eulerr
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:eulerr':
## 
##     venn
## The following object is masked from 'package:IRanges':
## 
##     space
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(gtx, trace = "none", scale = "none", 
  dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
  margins = c(5,20), cexRow=.8, cexCol=.9,  main="Genetic Variation")

Session Information

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.1.1                VariantAnnotation_1.38.0   
##  [3] Rsamtools_2.8.0             SummarizedExperiment_1.22.0
##  [5] Biobase_2.52.0              MatrixGenerics_1.4.3       
##  [7] matrixStats_0.60.1          eulerr_6.1.1               
##  [9] Biostrings_2.60.2           XVector_0.32.0             
## [11] R.utils_2.10.1              R.oo_1.24.0                
## [13] R.methodsS3_1.8.1           GenomicRanges_1.44.0       
## [15] GenomeInfoDb_1.28.4         IRanges_2.26.0             
## [17] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2               sass_0.4.0               bit64_4.0.5             
##  [4] jsonlite_1.7.2           gtools_3.9.2             bslib_0.3.0             
##  [7] assertthat_0.2.1         highr_0.9                BiocFileCache_2.0.0     
## [10] blob_1.2.2               BSgenome_1.60.0          GenomeInfoDbData_1.2.6  
## [13] yaml_2.2.1               progress_1.2.2           pillar_1.6.2            
## [16] RSQLite_2.2.8            lattice_0.20-44          glue_1.4.2              
## [19] digest_0.6.27            htmltools_0.5.2          Matrix_1.3-4            
## [22] XML_3.99-0.7             pkgconfig_2.0.3          biomaRt_2.48.3          
## [25] zlibbioc_1.38.0          purrr_0.3.4              BiocParallel_1.26.2     
## [28] tibble_3.1.4             KEGGREST_1.32.0          generics_0.1.0          
## [31] ellipsis_0.3.2           cachem_1.0.6             GenomicFeatures_1.44.2  
## [34] magrittr_2.0.1           crayon_1.4.1             memoise_2.0.0           
## [37] evaluate_0.14            fansi_0.5.0              xml2_1.3.2              
## [40] tools_4.1.1              prettyunits_1.1.1        hms_1.1.0               
## [43] BiocIO_1.2.0             lifecycle_1.0.0          stringr_1.4.0           
## [46] DelayedArray_0.18.0      AnnotationDbi_1.54.1     compiler_4.1.1          
## [49] jquerylib_0.1.4          caTools_1.18.2           rlang_0.4.11            
## [52] grid_4.1.1               RCurl_1.98-1.4           rjson_0.2.20            
## [55] rappdirs_0.3.3           bitops_1.0-7             rmarkdown_2.11          
## [58] restfulr_0.0.13          curl_4.3.2               DBI_1.1.1               
## [61] R6_2.5.1                 GenomicAlignments_1.28.0 rtracklayer_1.52.1      
## [64] knitr_1.34               dplyr_1.0.7              fastmap_1.1.0           
## [67] bit_4.0.4                utf8_1.2.2               filelock_1.0.2          
## [70] KernSmooth_2.23-20       stringi_1.7.4            Rcpp_1.0.7              
## [73] vctrs_0.3.8              png_0.1-7                dbplyr_2.1.1            
## [76] tidyselect_1.1.1         xfun_0.26