#reading in the first two VCF file with VariantAnnotation
vcf_x <- readVcf("VCF_SRP199233/SRX5884275_svtyper.vcf", "mm10")
vcf_y <- readVcf("VCF_SRP199233/SRX5884276_svtyper.vcf", "mm10")
#downloading the gff files and installing a package to unzip the gff file.
#download.file("ftp://ftp.ensembl.org/pub/release-98/gff3/mus_musculus/Mus_musculus.GRCm38.98.gff3.gz",destfile = "Mus_musculus.GRCm38.98.gff3.gz")
#library("R.utils")
#gunzip("Mus_musculus.GRCm38.98.gff3.gz")
# reading in the GFF files
gff <- read.table("Mus_musculus.GRCm38.98.gff3", sep = "\t", quote = "")
# this is a better way to look at the contents of the VCF files
rowRanges(vcf_x)
## GRanges object with 2417 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 1 1 26685964 * | NA N
## 2 1 39463305 * | NA N
## 3 1 39701420 * | NA N
## 4 1 39747923 * | NA N
## 5 1 39828018 * | NA N
## ... ... ... ... . ... ...
## 2138_2 GL456392.1 8589 * | NA N
## 2139_1 GL456390.1 23732 * | NA N
## 2139_2 GL456396.1 10046 * | NA N
## 2140_1 GL456392.1 19292 * | NA N
## 2140_2 GL456396.1 9019 * | NA N
## ALT QUAL FILTER
## <CharacterList> <numeric> <character>
## 1 <DUP> 19.26 .
## 2 <DEL> 0.00 .
## 3 <DEL> 0.00 .
## 4 <DEL> 0.00 .
## 5 <DEL> 31.46 .
## ... ... ... ...
## 2138_2 N[GL456390.1:21531[ 0.13 .
## 2139_1 ]GL456396.1:10046]N 31.46 .
## 2139_2 N[GL456390.1:23732[ 31.46 .
## 2140_1 ]GL456396.1:9019]N 48.50 .
## 2140_2 N[GL456392.1:19292[ 48.50 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
rd_x <- rowRanges(vcf_x)
info(vcf_x)
## DataFrame with 2417 rows and 19 columns
## SVTYPE SVLEN END STRANDS IMPRECISE
## <character> <IntegerList> <integer> <CharacterList> <logical>
## 1 DUP 2483 26688447 -+:5 TRUE
## 2 DEL -13 39463318 +-:4 TRUE
## 3 DEL -1438 39702858 +-:12 FALSE
## 4 DEL -183 39748106 +-:5 FALSE
## 5 DEL -1935 39829953 +-:5 TRUE
## ... ... ... ... ... ...
## 2138_2 BND NA +-:4 TRUE
## 2139_1 BND NA -+:10 TRUE
## 2139_2 BND NA +-:10 TRUE
## 2140_1 BND NA -+:28 FALSE
## 2140_2 BND NA +-:28 FALSE
## CIPOS CIEND CIPOS95 CIEND95 MATEID
## <IntegerList> <IntegerList> <IntegerList> <IntegerList> <CharacterList>
## 1 -119,9 -9,181 -32,1 0,36
## 2 -10,11 -12,9 -2,10 -12,1
## 3 -4,4 -10,9 0,0 0,0
## 4 -10,9 -10,9 0,0 0,0
## 5 -10,9 -10,9 -1,1 -1,1
## ... ... ... ... ... ...
## 2138_2 -10,9 -10,8 -1,1 -1,1 2138_1
## 2139_1 -3,2 -10,9 -1,1 0,0 2139_2
## 2139_2 -10,9 -3,2 0,0 -1,1 2139_1
## 2140_1 -10,9 -2,0 0,0 0,0 2140_2
## 2140_2 -2,0 -10,9 0,0 0,0 2140_1
## EVENT SECONDARY SU PE SR
## <character> <logical> <IntegerList> <IntegerList> <IntegerList>
## 1 NA FALSE 5 5 0
## 2 NA FALSE 4 4 0
## 3 NA FALSE 12 6 6
## 4 NA FALSE 5 0 5
## 5 NA FALSE 5 3 2
## ... ... ... ... ... ...
## 2138_2 2138 TRUE 4 2 2
## 2139_1 2139 FALSE 10 5 5
## 2139_2 2139 TRUE 10 5 5
## 2140_1 2140 FALSE 28 3 25
## 2140_2 2140 TRUE 28 3 25
## BD EV PRPOS
## <IntegerList> <CharacterList> <CharacterList>
## 1 1.63734e-09,3.84941e-09,6.97546e-09,...
## 2 4.02714e-05,9.24947e-05,0.000214025,...
## 3 4.2664e-15,4.18489e-11,4.10855e-07,...
## 4 9.80198e-21,9.80198e-19,9.80198e-17,...
## 5 2.96754e-17,2.96754e-15,2.96754e-13,...
## ... ... ... ...
## 2138_2 4.67913e-16,1.8628e-14,7.41593e-13,...
## 2139_1 3.19611e-05,0.00127685,0.0510055,...
## 2139_2 6.22132e-38,6.22132e-34,6.22132e-30,...
## 2140_1 9.65672e-101,9.70953e-91,9.7553e-81,...
## 2140_2 6.32701e-22,1.00167e-10,1
## PREND
## <CharacterList>
## 1 2.41925e-13,9.61778e-12,1.52147e-10,...
## 2 0.0351816,0.0392766,0.0443191,...
## 3 4.61213e-25,1.28089e-22,3.50382e-20,...
## 4 9.80198e-21,9.80198e-19,9.80198e-17,...
## 5 5.80289e-09,3.78972e-08,2.4738e-07,...
## ... ...
## 2138_2 7.81701e-09,4.94801e-08,3.13121e-07,...
## 2139_1 6.22132e-38,6.22132e-34,6.22132e-30,...
## 2139_2 3.19611e-05,0.00127685,0.0510055,...
## 2140_1 6.32701e-22,1.00167e-10,1
## 2140_2 9.65672e-101,9.70953e-91,9.7553e-81,...
info(VariantAnnotation::header(vcf_x))
## DataFrame with 19 rows and 3 columns
## Number Type Description
## <character> <character> <character>
## SVTYPE 1 String Type of structural v..
## SVLEN . Integer Difference in length..
## END 1 Integer End position of the ..
## STRANDS . String Strand orientation o..
## IMPRECISE 0 Flag Imprecise structural..
## ... ... ... ...
## SR . Integer Number of split read..
## BD . Integer Amount of BED eviden..
## EV . String Type of LUMPY eviden..
## PRPOS . String LUMPY probability cu..
## PREND . String LUMPY probability cu..
geno(vcf_x)
## List of length 19
## names(19): GT SU PE SR BD GQ SQ GL DP RO AO QR QA RS AS ASC RP AP AB
geno(VariantAnnotation::header(vcf_x))
## DataFrame with 19 rows and 3 columns
## Number Type Description
## <character> <character> <character>
## GT 1 String Genotype
## SU 1 Integer Number of pieces of ..
## PE 1 Integer Number of paired-end..
## SR 1 Integer Number of split read..
## BD 1 Integer Amount of BED eviden..
## ... ... ... ...
## AS A Integer Alternate allele spl..
## ASC A Integer Alternate allele cli..
## RP 1 Integer Reference allele pai..
## AP A Integer Alternate allele pai..
## AB A Float Allele balance, frac..
header(vcf_x)
## class: VCFHeader
## samples(1): sample
## meta(3): fileDate fileformat source
## fixed(2): FILTER ALT
## info(19): SVTYPE SVLEN ... PRPOS PREND
## geno(19): GT SU ... AP AB
samples(VariantAnnotation::header(vcf_x))
## [1] "sample"
#determining the genomic positions of the data using the rowRanges function which contains information from the CHROM, POS and ID fields of the Vcf files.
rowRanges(vcf_x)
## GRanges object with 2417 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 1 1 26685964 * | NA N
## 2 1 39463305 * | NA N
## 3 1 39701420 * | NA N
## 4 1 39747923 * | NA N
## 5 1 39828018 * | NA N
## ... ... ... ... . ... ...
## 2138_2 GL456392.1 8589 * | NA N
## 2139_1 GL456390.1 23732 * | NA N
## 2139_2 GL456396.1 10046 * | NA N
## 2140_1 GL456392.1 19292 * | NA N
## 2140_2 GL456396.1 9019 * | NA N
## ALT QUAL FILTER
## <CharacterList> <numeric> <character>
## 1 <DUP> 19.26 .
## 2 <DEL> 0.00 .
## 3 <DEL> 0.00 .
## 4 <DEL> 0.00 .
## 5 <DEL> 31.46 .
## ... ... ... ...
## 2138_2 N[GL456390.1:21531[ 0.13 .
## 2139_1 ]GL456396.1:10046]N 31.46 .
## 2139_2 N[GL456390.1:23732[ 31.46 .
## 2140_1 ]GL456396.1:9019]N 48.50 .
## 2140_2 N[GL456392.1:19292[ 48.50 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
head(rowRanges(vcf_x, 3))
## GRanges object with 6 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF ALT
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet> <CharacterList>
## 1 1 26685964 * | NA N <DUP>
## 2 1 39463305 * | NA N <DEL>
## 3 1 39701420 * | NA N <DEL>
## 4 1 39747923 * | NA N <DEL>
## 5 1 39828018 * | NA N <DEL>
## 6 1 39836674 * | NA N <DEL>
## QUAL FILTER
## <numeric> <character>
## 1 19.26 .
## 2 0.00 .
## 3 0.00 .
## 4 0.00 .
## 5 31.46 .
## 6 2.04 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
head(rowRanges(vcf_y, 3))
## GRanges object with 6 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF ALT
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet> <CharacterList>
## 1 1 26685964 * | NA N <DUP>
## 2 1 39463305 * | NA N <DEL>
## 3 1 39701420 * | NA N <DEL>
## 4 1 39747923 * | NA N <DEL>
## 5 1 39828018 * | NA N <DEL>
## 6 1 39836674 * | NA N <DEL>
## QUAL FILTER
## <numeric> <character>
## 1 4.77 .
## 2 4.77 .
## 3 31.46 .
## 4 89.31 .
## 5 0.00 .
## 6 0.00 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
rd_y <- rowRanges(vcf_y)
#looking into the individuals fields of both vcf file for the first length 9
ref(vcf_x)[1:9]
## DNAStringSet object of length 9:
## width seq
## [1] 1 N
## [2] 1 N
## [3] 1 N
## [4] 1 N
## [5] 1 N
## [6] 1 N
## [7] 1 N
## [8] 1 N
## [9] 1 N
qual(vcf_x)[1:9]
## [1] 19.26 0.00 0.00 0.00 31.46 2.04 4.77 0.00 60.25
alt(vcf_x)[1:9]
## CharacterList of length 9
## [[1]] <DUP>
## [[2]] <DEL>
## [[3]] <DEL>
## [[4]] <DEL>
## [[5]] <DEL>
## [[6]] <DEL>
## [[7]] <DEL>
## [[8]] <DEL>
## [[9]] <DEL>
ref(vcf_y)[1:9]
## DNAStringSet object of length 9:
## width seq
## [1] 1 N
## [2] 1 N
## [3] 1 N
## [4] 1 N
## [5] 1 N
## [6] 1 N
## [7] 1 N
## [8] 1 N
## [9] 1 N
qual(vcf_y)[1:9]
## [1] 4.77 4.77 31.46 89.31 0.00 0.00 60.25 4.77 31.46
alt(vcf_y)[1:9]
## CharacterList of length 9
## [[1]] <DUP>
## [[2]] <DEL>
## [[3]] <DEL>
## [[4]] <DEL>
## [[5]] <DEL>
## [[6]] <DEL>
## [[7]] <DEL>
## [[8]] <DEL>
## [[9]] <DEL>
#getting the full list of genotype in both vcf files and describing them in `FORMAT` fields as dataframe and class
geno(vcf_x)
sapply(geno(vcf_x), DataFrame)
sapply(geno(vcf_x), class)
geno(vcf_y)
sapply(geno(vcf_y), DataFrame)
sapply(geno(vcf_y), class)
#I observed lots of differences with the genotype of both vcf when described in dataframe formats
#having a look at the genotype (GT) by analysing the data as a factor
geno(VariantAnnotation::header(vcf_x))["GT",]
GT_x <- geno(vcf_x)$GT
GT_x <- as.factor(GT_x)
head(GT_x)
str(GT_x)
table(GT_x)
geno(VariantAnnotation::header(vcf_y))["GT",]
GT_y <- geno(vcf_y)$GT
GT_y <- as.factor(GT_y)
head(GT_y)
str(GT_y)
table(GT_y)
#comparing the two animals genotype as a single data frame
GT <- data.frame(GT_x, GT_y)
head(GT)
GT[,c("GT_x","GT_y")]
str(GT)
#filtering the missing data found within the variants
subset(GT, GT_x != "./." & GT_y != "./.")
GT_f <- subset(GT, GT_x != "./." & GT_y != "./.")
GT_f
#using table and barplot to check the level of heterozygous and homozygous found within the two animals
table(paste(GT_f$GT_x,GT_f$GT_y))
barplot(table(GT_f$GT_x == GT_f$GT_y))
#creating a function that compares two animals together.
compare_vcf <- function(vcf_a,vcf_b) {
gta <- geno(vcf_a)$GT
gtb <- geno(vcf_b)$GT
df <- data.frame(rowRanges(vcf_a),info(vcf_a))
df <- df[,c("seqnames","start","END","SVTYPE","SVLEN","QUAL")]
gt <- data.frame(gta,gtb)
colnames(gt) <- c("a","b")
gt <- data.frame(df,gt)
nr <- nrow(gt)
gt <- subset(gt, a != "./." & b != "./.")
gtnr <- nrow(gt)
missing <- nr - gtnr
gtsame <- length(which(gt$a == gt$b))
gtdiff <- length(which(gt$a != gt$b))
barplot(table(gt$a),main="a")
barplot(table(gt$b),main="b")
gt2 <- subset(gt, a != "0/1" & b != "0/1")
aset <- which(gt2$a == "1/1")
bset <- which(gt2$b == "1/1")
v1 <- list("a" = aset , "b" = bset)
print(plot(euler(v1),quantities = TRUE,main="homozygous variants"))
output=list("gt"=gt, "missing"=missing, "gtsame"=gtsame,"gtdiff"=gtdiff)
return(output)
}
#function created and to be tried on two different animals read with VariantAnnotation package.
vcf_a <- readVcf("VCF_SRP199233/SRX5884277_svtyper.vcf", "mm10")
rd_a <- rowRanges(vcf_a)
vcf_b <- readVcf("VCF_SRP199233/SRX5884278_svtyper.vcf", "mm10")
rd_b <- rowRanges(vcf_b)
#attempting the new function.
res <- compare_vcf(vcf_a,vcf_b)
#str(res)
#head(res)
Analyzing all 6 Animals together
#reading in the last 2 vcf data.
vcf_d <- readVcf("VCF_SRP199233/SRX5884279_svtyper.vcf", "mm10")
rd_d <- rowRanges(vcf_d)
vcf_e <- readVcf("VCF_SRP199233/SRX5884280_svtyper.vcf", "mm10")
rd_e <- rowRanges(vcf_e)
#analyzing the geno for the 6 animals respectively.
gtx <- geno(vcf_x)$GT
gty <- geno(vcf_y)$GT
gta <- geno(vcf_a)$GT
gtb <- geno(vcf_b)$GT
gtd <- geno(vcf_d)$GT
gte <- geno(vcf_e)$GT
df <- data.frame(rowRanges(vcf_x),info(vcf_x)) #taking a look into the content for one of the vcf file.
head(df)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 19.26 .
## 2 1 39463305 39463305 1 * <NA> N <DEL> 0.00 .
## 3 1 39701420 39701420 1 * <NA> N <DEL> 0.00 .
## 4 1 39747923 39747923 1 * <NA> N <DEL> 0.00 .
## 5 1 39828018 39828018 1 * <NA> N <DEL> 31.46 .
## 6 1 39836674 39836674 1 * <NA> N <DEL> 2.04 .
## SVTYPE SVLEN END STRANDS IMPRECISE CIPOS CIEND CIPOS95 CIEND95
## 1 DUP 2483 26688447 -+:5 TRUE -119, 9 -9, 181 -32, 1 0, 36
## 2 DEL -13 39463318 +-:4 TRUE -10, 11 -12, 9 -2, 10 -12, 1
## 3 DEL -1438 39702858 +-:12 FALSE -4, 4 -10, 9 0, 0 0, 0
## 4 DEL -183 39748106 +-:5 FALSE -10, 9 -10, 9 0, 0 0, 0
## 5 DEL -1935 39829953 +-:5 TRUE -10, 9 -10, 9 -1, 1 -1, 1
## 6 DEL -75 39836749 +-:4 TRUE -10, 73 -20, 9 -3, 34 -16, 2
## MATEID EVENT SECONDARY SU PE SR BD EV PRPOS PREND
## 1 <NA> FALSE 5 5 0 1.63734e.... 2.41925e....
## 2 <NA> FALSE 4 4 0 4.02714e.... 0.035181....
## 3 <NA> FALSE 12 6 6 4.2664e-.... 4.61213e....
## 4 <NA> FALSE 5 0 5 9.80198e.... 9.80198e....
## 5 <NA> FALSE 5 3 2 2.96754e.... 5.80289e....
## 6 <NA> FALSE 4 4 0 1.16408e.... 0.002455....
df <- df[,c("seqnames","start","END","SVTYPE","SVLEN","QUAL")]
gt <- data.frame(gtx,gty,gta,gtb,gtd,gte)
gt <- data.frame(df,gt) # this gives us the quality check of each variants for the 6 animals and from here i will need to copy objects gt into gt2 and then subset gt2 for the variants with highest confidence using the QUAL which should be > 30.
gt2 <- gt
colnames(gt) <- c("x","y","a","b","d","e") #continue working on gt objects to get the true Homozygous variants for all animals and visualize it on a venn diagram
gt <- subset(gt != "./." & gt != "0/1")
head(gt)
## x y a b d e <NA> <NA> <NA> <NA> <NA> <NA>
## 1 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 2 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## 3 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## 5 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
table(gt) #having a tabular look of the TRUE Homozygous variants for all 6 animals.
## gt
## FALSE TRUE
## 5257 23193
a_set <- which(gta == "1/1")
b_set <- which(gtb == "1/1")
x_set <- which(gtx == "1/1")
y_set <- which(gty == "1/1")
d_set <- which(gtd == "1/1")
e_set <- which(gte == "1/1")
v1 <- list("a" = a_set, "b" = b_set, "x" = x_set, "y" = y_set, "d" = d_set, "e" = e_set)
print(plot(euler(v1),quantities = TRUE,main="homozygous variants"))
gt2 <- subset(gt2, gt2$QUAL>30)
gt3 <- apply(gt2[,c(7:12)], 2, function(x){
as.numeric(as.factor(x))-2
})
gt23 <- cbind(gt2,gt3)
head(gt23)
## seqnames start END SVTYPE SVLEN QUAL sample sample.1 sample.2
## 5 1 39828018 39829953 DEL -1935 31.46 1/1 ./. ./.
## 9 1 40565418 40565503 DEL -85 60.25 1/1 1/1 1/1
## 12 1 42646102 42647988 DEL -1886 60.25 1/1 ./. ./.
## 19 1 51544912 51546751 DEL -1839 31.46 1/1 ./. ./.
## 26 1 63886209 63893328 DEL -7119 31.46 1/1 0/0 1/1
## 27 1 64151297 64153186 DEL -1889 31.46 1/1 ./. ./.
## sample.3 sample.4 sample.5 sample sample.1 sample.2 sample.3 sample.4
## 5 0/0 0/0 0/0 0 -1 -1 0 0
## 9 0/0 0/0 1/1 0 2 2 0 0
## 12 ./. 0/0 ./. 0 -1 -1 -1 0
## 19 1/1 ./. 0/0 0 -1 -1 2 -1
## 26 ./. 0/0 ./. 0 0 2 -1 0
## 27 1/1 1/1 ./. 0 -1 -1 2 2
## sample.5
## 5 0
## 9 2
## 12 -1
## 19 0
## 26 -1
## 27 -1
gt2[gt2 == -1] <- NA
library(gplots)
## Registered S3 method overwritten by 'gplots':
## method from
## plot.venn eulerr
##
## Attaching package: 'gplots'
## The following object is masked from 'package:eulerr':
##
## venn
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
rownames(gt3) <- paste(gt2$seqnames,gt2$start,sep=":")
head(rownames(gt3))
## [1] "1:39828018" "1:40565418" "1:42646102" "1:51544912" "1:63886209"
## [6] "1:64151297"
colfunc <- colorRampPalette(c("yellow", "blue", "red"))
heatmap.2(gt3, trace = "none", scale = "none", dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25), margins = c(5,10), cexRow=.8, cexCol=.9, main="Genetic Variation")
#from my little investigation, i could tell that variants with high confidence can only be identify through there quality check
#using the package below to locate and annotate the gene variants without necessary needing the gff file at the moment
library(TxDb.Mmusculus.UCSC.mm10.ensGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
#txdb <- TxDb.Mmusculus.UCSC.mm10.ensGene
#myseqlevels <- gsub("chrUn_","",seqlevels(txdb))
#myseqlevels <- gsub("_random","",myseqlevels)
#strReverse <- function(x) { sapply(lapply(strsplit(x, NULL), rev), paste, collapse="") }
#myseqlevels <- strReverse(myseqlevels)
#myseqlevels <- sapply(strsplit(myseqlevels , "_"),"[[",1)
#myseqlevels <- strReverse(myseqlevels)
#myseqlevels <- gsub("chr","",myseqlevels)
#myseqlevels <- gsub("M","MT",myseqlevels)
#myseqlevels
# why are the variants in rd only 1bp size?
#loc <- locateVariants(rd, txdb, CodingVariants())
#all <- locateVariants(rd, txdb, AllVariants())
gff_file <- "Mus_musculus.GRCm38.98.gff3"
txdb <- makeTxDbFromGFF(gff_file, format="gff3")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .extract_exons_from_GRanges(exon_IDX, gr, mcols0, tx_IDX, feature = "exon", : 6 exons couldn't be linked to a transcript so were dropped:
## seqid start end strand ID Name
## 1 12 113654967 113655244 + <NA> ENSMUSE00000892648
## 2 12 114630680 114630973 + <NA> ENSMUSE00001053179
## 3 6 42980690 42980744 + <NA> ENSMUSE00001000765
## 4 6 42980854 42981150 + <NA> ENSMUSE00000892164
## 5 6 43081518 43081827 - <NA> ENSMUSE00000663119
## 6 6 43082049 43082097 - <NA> ENSMUSE00001046644
## Parent Parent_type
## 1 transcript:ENSMUST00000166255 <NA>
## 2 transcript:ENSMUST00000095364 <NA>
## 3 transcript:ENSMUST00000167638 <NA>
## 4 transcript:ENSMUST00000167638 <NA>
## 5 transcript:ENSMUST00000103301 <NA>
## 6 transcript:ENSMUST00000103301 <NA>
## Warning in .extract_exons_from_GRanges(cds_IDX, gr, mcols0, tx_IDX, feature = "cds", : 6 CDS couldn't be linked to a transcript so were dropped:
## seqid start end strand ID Name
## 1 12 113654967 113655244 + CDS:ENSMUSP00000128354 ENSMUSP00000128354
## 2 12 114630680 114630973 + CDS:ENSMUSP00000093010 ENSMUSP00000093010
## 3 6 42980690 42980744 + CDS:ENSMUSP00000131191 ENSMUSP00000131191
## 4 6 42980854 42981150 + CDS:ENSMUSP00000131191 ENSMUSP00000131191
## 5 6 43081518 43081827 - CDS:ENSMUSP00000100102 ENSMUSP00000100102
## 6 6 43082049 43082097 - CDS:ENSMUSP00000100102 ENSMUSP00000100102
## Parent Parent_type
## 1 transcript:ENSMUST00000166255 <NA>
## 2 transcript:ENSMUST00000095364 <NA>
## 3 transcript:ENSMUST00000167638 <NA>
## 4 transcript:ENSMUST00000167638 <NA>
## 5 transcript:ENSMUST00000103301 <NA>
## 6 transcript:ENSMUST00000103301 <NA>
## OK
vcfd <- vcf_d #copied vcf_d file to vcfd, so as to use vcfd to have a clear look into how to annotate and locate the gene variants without altering the vcf_d files.
rd <- rowRanges(vcfd)
rd
## GRanges object with 2417 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 1 1 26685964 * | NA N
## 2 1 39463305 * | NA N
## 3 1 39701420 * | NA N
## 4 1 39747923 * | NA N
## 5 1 39828018 * | NA N
## ... ... ... ... . ... ...
## 2138_2 GL456392.1 8589 * | NA N
## 2139_1 GL456390.1 23732 * | NA N
## 2139_2 GL456396.1 10046 * | NA N
## 2140_1 GL456392.1 19292 * | NA N
## 2140_2 GL456396.1 9019 * | NA N
## ALT QUAL FILTER
## <CharacterList> <numeric> <character>
## 1 <DUP> 68.74 .
## 2 <DEL> 4.77 .
## 3 <DEL> 4.77 .
## 4 <DEL> 4.77 .
## 5 <DEL> 4.77 .
## ... ... ... ...
## 2138_2 N[GL456390.1:21531[ 0.13 .
## 2139_1 ]GL456396.1:10046]N 41.83 .
## 2139_2 N[GL456390.1:23732[ 41.83 .
## 2140_1 ]GL456396.1:9019]N 110.90 .
## 2140_2 N[GL456392.1:19292[ 110.90 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
rd <- rd[elementMetadata(rd)[,4]>30]
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: Mus_musculus.GRCm38.98.gff3
## # Organism: NA
## # Taxonomy ID: NA
## # miRBase build ID: NA
## # Genome: NA
## # Nb of transcripts: 142442
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2021-08-23 12:59:17 +1000 (Mon, 23 Aug 2021)
## # GenomicFeatures version at creation time: 1.44.0
## # RSQLite version at creation time: 2.2.7
## # DBSCHEMAVERSION: 1.2
seqlevels(vcfd)
## [1] "1" "2"
## [3] "3" "4"
## [5] "5" "6"
## [7] "7" "8"
## [9] "9" "10"
## [11] "11" "12"
## [13] "13" "14"
## [15] "15" "16"
## [17] "17" "18"
## [19] "19" "X"
## [21] "Y" "MT"
## [23] "CHR_MG3251_PATCH" "JH584304.1"
## [25] "GL456216.1" "GL456239.1"
## [27] "GL456389.1" "GL456381.1"
## [29] "GL456387.1" "GL456393.1"
## [31] "CHR_MG4259_PATCH" "GL456394.1"
## [33] "GL456368.1" "JH584292.1"
## [35] "CHR_CAST_EI_MMCHR11_CTG5" "GL456383.1"
## [37] "GL456392.1" "GL456396.1"
## [39] "GL456390.1"
seqlevels(txdb)
## [1] "1" "2" "3" "4" "5"
## [6] "6" "7" "8" "9" "10"
## [11] "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "X"
## [21] "Y" "MT" "GL456210.1" "GL456211.1" "GL456212.1"
## [26] "GL456216.1" "GL456219.1" "GL456221.1" "GL456233.1" "GL456239.1"
## [31] "GL456350.1" "GL456354.1" "GL456372.1" "GL456381.1" "GL456385.1"
## [36] "JH584292.1" "JH584293.1" "JH584294.1" "JH584295.1" "JH584296.1"
## [41] "JH584297.1" "JH584298.1" "JH584299.1" "JH584303.1" "JH584304.1"
intersect(seqlevels(rd),seqlevels(txdb))
## [1] "1" "2" "3" "4" "5"
## [6] "6" "7" "8" "9" "10"
## [11] "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "X"
## [21] "Y" "MT" "JH584304.1" "GL456216.1" "GL456239.1"
## [26] "GL456381.1" "JH584292.1"
tx <- transcripts(txdb)
ol <- findOverlaps(rd,tx)
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': CHR_MG3251_PATCH, GL456389.1, GL456387.1, GL456393.1, CHR_MG4259_PATCH, GL456394.1, GL456368.1, CHR_CAST_EI_MMCHR11_CTG5, GL456383.1, GL456392.1, GL456396.1, GL456390.1
## - in 'y': GL456210.1, GL456211.1, GL456212.1, GL456219.1, GL456221.1, GL456233.1, GL456350.1, GL456354.1, GL456372.1, GL456385.1, JH584293.1, JH584294.1, JH584295.1, JH584296.1, JH584297.1, JH584298.1, JH584299.1, JH584303.1
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
ol
## Hits object with 1086 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 4630
## [2] 6 682
## [3] 16 5642
## [4] 16 5643
## [5] 16 5644
## ... ... ...
## [1082] 784 138837
## [1083] 784 138838
## [1084] 785 134530
## [1085] 786 136646
## [1086] 786 136648
## -------
## queryLength: 792 / subjectLength: 142442
var_ol <- rd[queryHits(ol)]
tx_ol <- tx[subjectHits(ol)]
var_ol
## GRanges object with 1086 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 1 1 26685964 * | NA N
## 14 1 43030800 * | NA N
## 44 1 73977363 * | NA N
## 44 1 73977363 * | NA N
## 44 1 73977363 * | NA N
## ... ... ... ... . ... ...
## 2130_2 X 87238421 * | NA N
## 2130_2 X 87238421 * | NA N
## 2132_1 19 45650267 * | NA N
## 2132_2 X 112028707 * | NA N
## 2132_2 X 112028707 * | NA N
## ALT QUAL FILTER
## <CharacterList> <numeric> <character>
## 1 <DUP> 68.74 .
## 14 <DEL> 60.25 .
## 44 <DEL> 31.46 .
## 44 <DEL> 31.46 .
## 44 <DEL> 31.46 .
## ... ... ... ...
## 2130_2 N]18:85698118] 41.98 .
## 2130_2 N]18:85698118] 41.98 .
## 2132_1 N[X:112028707[ 60.25 .
## 2132_2 ]19:45650267]N 60.25 .
## 2132_2 ]19:45650267]N 60.25 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
tx_ol
## GRanges object with 1086 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] 1 26681814-26687460 - | 4630 ENSMUST00000097801
## [2] 1 42952963-43035456 + | 682 ENSMUST00000179766
## [3] 1 73910231-74124447 - | 5642 ENSMUST00000169786
## [4] 1 73910232-74091351 - | 5643 ENSMUST00000187584
## [5] 1 73910232-74124449 - | 5644 ENSMUST00000191104
## ... ... ... ... . ... ...
## [1082] X 86747242-87890235 - | 138837 ENSMUST00000078875
## [1083] X 86769118-88115645 - | 138838 ENSMUST00000113964
## [1084] 19 45578254-45660312 - | 134530 ENSMUST00000046869
## [1085] X 111891389-112240590 + | 136646 ENSMUST00000207962
## [1086] X 112011007-112093089 + | 136648 ENSMUST00000131304
## -------
## seqinfo: 45 sequences (1 circular) from an unspecified genome; no seqlengths
vardf <- as.data.frame(var_ol,row.names = 1:nrow(as.data.frame(ranges(var_ol,use.mcols = TRUE)))) #read up more on the tab options to get the ALT.
head(vardf)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 68.74 .
## 2 1 43030800 43030800 1 * <NA> N <DEL> 60.25 .
## 3 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 4 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 5 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 6 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
txdf <- as.data.frame(tx_ol,row.names = 1:nrow(as.data.frame(ranges(tx_ol))))
head(txdf)
## seqnames start end width strand tx_id tx_name
## 1 1 26681814 26687460 5647 - 4630 ENSMUST00000097801
## 2 1 42952963 43035456 82494 + 682 ENSMUST00000179766
## 3 1 73910231 74124447 214217 - 5642 ENSMUST00000169786
## 4 1 73910232 74091351 181120 - 5643 ENSMUST00000187584
## 5 1 73910232 74124449 214218 - 5644 ENSMUST00000191104
## 6 1 73914093 73995372 81280 - 5646 ENSMUST00000185702
vardf$tx <- txdf$tx_name
head(vardf)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 68.74 .
## 2 1 43030800 43030800 1 * <NA> N <DEL> 60.25 .
## 3 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 4 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 5 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 6 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## tx
## 1 ENSMUST00000097801
## 2 ENSMUST00000179766
## 3 ENSMUST00000169786
## 4 ENSMUST00000187584
## 5 ENSMUST00000191104
## 6 ENSMUST00000185702
vardf_tx <- vardf
head(vardf_tx)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 68.74 .
## 2 1 43030800 43030800 1 * <NA> N <DEL> 60.25 .
## 3 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 4 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 5 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 6 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## tx
## 1 ENSMUST00000097801
## 2 ENSMUST00000179766
## 3 ENSMUST00000169786
## 4 ENSMUST00000187584
## 5 ENSMUST00000191104
## 6 ENSMUST00000185702
gx <- genes(txdb)
ol <- findOverlaps(rd,gx,select = "all")
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': CHR_MG3251_PATCH, GL456389.1, GL456387.1, GL456393.1, CHR_MG4259_PATCH, GL456394.1, GL456368.1, CHR_CAST_EI_MMCHR11_CTG5, GL456383.1, GL456392.1, GL456396.1, GL456390.1
## - in 'y': GL456210.1, GL456211.1, GL456212.1, GL456219.1, GL456221.1, GL456233.1, GL456350.1, GL456354.1, GL456372.1, GL456385.1, JH584293.1, JH584294.1, JH584295.1, JH584296.1, JH584297.1, JH584298.1, JH584299.1, JH584303.1
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
ol
## Hits object with 374 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 20515
## [2] 6 12401
## [3] 16 15917
## [4] 18 20495
## [5] 19 5445
## ... ... ...
## [370] 775 27968
## [371] 784 15270
## [372] 785 12095
## [373] 786 9717
## [374] 786 46608
## -------
## queryLength: 792 / subjectLength: 55417
var_ol <- rd[queryHits(ol)]
gx_ol <- gx[subjectHits(ol)]
var_ol
## GRanges object with 374 ranges and 5 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## 1 1 26685964 * | NA N
## 14 1 43030800 * | NA N
## 44 1 73977363 * | NA N
## 55 1 79751457 * | NA N
## 56 1 79853964 * | NA N
## ... ... ... ... . ... ...
## 2118_1 16 35981692 * | NA N
## 2130_2 X 87238421 * | NA N
## 2132_1 19 45650267 * | NA N
## 2132_2 X 112028707 * | NA N
## 2132_2 X 112028707 * | NA N
## ALT QUAL FILTER
## <CharacterList> <numeric> <character>
## 1 <DUP> 68.74 .
## 14 <DEL> 60.25 .
## 44 <DEL> 31.46 .
## 55 <DEL> 60.25 .
## 56 <DEL> 60.25 .
## ... ... ... ...
## 2118_1 [17:70963682[N 48.50 .
## 2130_2 N]18:85698118] 41.98 .
## 2132_1 N[X:112028707[ 60.25 .
## 2132_2 ]19:45650267]N 60.25 .
## 2132_2 ]19:45650267]N 60.25 .
## -------
## seqinfo: 39 sequences from mm10 genome; no seqlengths
vardf <- as.data.frame(var_ol,row.names = 1:nrow(as.data.frame(ranges(var_ol))))
vardf$ALT <- as.character(GenomicRanges::elementMetadata(var_ol)$ALT)
head(vardf)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 68.74 .
## 2 1 43030800 43030800 1 * <NA> N <DEL> 60.25 .
## 3 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 4 1 79751457 79751457 1 * <NA> N <DEL> 60.25 .
## 5 1 79853964 79853964 1 * <NA> N <DEL> 60.25 .
## 6 1 80706070 80706070 1 * <NA> N <DEL> 31.46 .
gxdf <- as.data.frame(gx_ol,row.names = 1:nrow(as.data.frame(ranges(gx_ol))))
head(gxdf)
## seqnames start end width strand gene_id
## 1 1 26681814 26687460 5647 - ENSMUSG00000073722
## 2 1 42952872 43035456 82585 + ENSMUSG00000041907
## 3 1 73910231 74124449 214219 - ENSMUSG00000055322
## 4 1 79702262 79776143 73882 - ENSMUSG00000073643
## 5 1 79794197 79861180 66984 - ENSMUSG00000026249
## 6 1 80501073 80758527 257455 - ENSMUSG00000038608
vardf$gene <- gxdf$gene_id
head(vardf)
## seqnames start end width strand paramRangeID REF ALT QUAL FILTER
## 1 1 26685964 26685964 1 * <NA> N <DUP> 68.74 .
## 2 1 43030800 43030800 1 * <NA> N <DEL> 60.25 .
## 3 1 73977363 73977363 1 * <NA> N <DEL> 31.46 .
## 4 1 79751457 79751457 1 * <NA> N <DEL> 60.25 .
## 5 1 79853964 79853964 1 * <NA> N <DEL> 60.25 .
## 6 1 80706070 80706070 1 * <NA> N <DEL> 31.46 .
## gene
## 1 ENSMUSG00000073722
## 2 ENSMUSG00000041907
## 3 ENSMUSG00000055322
## 4 ENSMUSG00000073643
## 5 ENSMUSG00000026249
## 6 ENSMUSG00000038608
genenames <- read.table("Mus_musculus.GRCm38.98.gff3.genenames.tsv", header = FALSE, sep = "\t", quote = "")
colnames(genenames) <- c("Ensembl ID","Gene Name")
head(genenames)
## Ensembl ID Gene Name
## 1 ENSMUSG00000102693 4933401J01Rik
## 2 ENSMUSG00000051951 Xkr4
## 3 ENSMUSG00000103377 Gm37180
## 4 ENSMUSG00000104017 Gm37363
## 5 ENSMUSG00000103025 Gm37686
## 6 ENSMUSG00000103201 Gm37329
gene_variants <- merge(genenames, vardf,by.x="Ensembl ID", by.y="gene")
head(gene_variants)
## Ensembl ID Gene Name seqnames start end width strand
## 1 ENSMUSG00000000197 Nalcn 14 123401210 123401210 1 *
## 2 ENSMUSG00000000275 Trim25 11 89009389 89009389 1 *
## 3 ENSMUSG00000000305 Cdh4 2 179446174 179446174 1 *
## 4 ENSMUSG00000001313 Rnd2 11 101469662 101469662 1 *
## 5 ENSMUSG00000001630 Stk38l 6 146764296 146764296 1 *
## 6 ENSMUSG00000001768 Rin2 2 145691244 145691244 1 *
## paramRangeID REF ALT QUAL FILTER
## 1 <NA> N <DEL> 31.46 .
## 2 <NA> N <DEL> 60.25 .
## 3 <NA> N <DEL> 60.25 .
## 4 <NA> N <DEL> 31.46 .
## 5 <NA> N <DEL> 89.31 .
## 6 <NA> N <DEL> 89.31 .
Subset from the gene variants objects to get the 3 types of variation to show on the circos plot: ie Deletions, Duplications and Break-ends.
library("RCircos")
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] RCircos_1.2.1
## [2] TxDb.Mmusculus.UCSC.mm10.ensGene_3.4.0
## [3] GenomicFeatures_1.44.0
## [4] AnnotationDbi_1.54.1
## [5] gplots_3.1.1
## [6] VariantAnnotation_1.38.0
## [7] Rsamtools_2.8.0
## [8] SummarizedExperiment_1.22.0
## [9] Biobase_2.52.0
## [10] MatrixGenerics_1.4.2
## [11] matrixStats_0.60.0
## [12] eulerr_6.1.0
## [13] Biostrings_2.60.2
## [14] XVector_0.32.0
## [15] R.utils_2.10.1
## [16] R.oo_1.24.0
## [17] R.methodsS3_1.8.1
## [18] GenomicRanges_1.44.0
## [19] GenomeInfoDb_1.28.1
## [20] IRanges_2.26.0
## [21] S4Vectors_0.30.0
## [22] BiocGenerics_0.38.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 bit64_4.0.5 filelock_1.0.2
## [4] progress_1.2.2 httr_1.4.2 tools_4.1.0
## [7] bslib_0.2.5.1 utf8_1.2.2 R6_2.5.0
## [10] KernSmooth_2.23-20 DBI_1.1.1 tidyselect_1.1.1
## [13] prettyunits_1.1.1 bit_4.0.4 curl_4.3.2
## [16] compiler_4.1.0 xml2_1.3.2 DelayedArray_0.18.0
## [19] rtracklayer_1.52.0 sass_0.4.0 caTools_1.18.2
## [22] rappdirs_0.3.3 stringr_1.4.0 digest_0.6.27
## [25] rmarkdown_2.10 pkgconfig_2.0.3 htmltools_0.5.1.1
## [28] dbplyr_2.1.1 fastmap_1.1.0 BSgenome_1.60.0
## [31] highr_0.9 rlang_0.4.11 RSQLite_2.2.7
## [34] jquerylib_0.1.4 BiocIO_1.2.0 generics_0.1.0
## [37] jsonlite_1.7.2 BiocParallel_1.26.1 gtools_3.9.2
## [40] dplyr_1.0.7 RCurl_1.98-1.3 magrittr_2.0.1
## [43] GenomeInfoDbData_1.2.6 Matrix_1.3-4 Rcpp_1.0.7
## [46] fansi_0.5.0 lifecycle_1.0.0 stringi_1.7.3
## [49] yaml_2.2.1 zlibbioc_1.38.0 BiocFileCache_2.0.0
## [52] grid_4.1.0 blob_1.2.2 crayon_1.4.1
## [55] lattice_0.20-44 hms_1.1.0 KEGGREST_1.32.0
## [58] polylabelr_0.2.0 knitr_1.33 pillar_1.6.2
## [61] rjson_0.2.20 biomaRt_2.48.2 XML_3.99-0.6
## [64] glue_1.4.2 evaluate_0.14 png_0.1-7
## [67] vctrs_0.3.8 purrr_0.3.4 polyclip_1.10-0
## [70] assertthat_0.2.1 cachem_1.0.5 xfun_0.25
## [73] restfulr_0.0.13 tibble_3.1.3 GenomicAlignments_1.28.0
## [76] memoise_2.0.0 ellipsis_0.3.2