Intro

Here we’re doing an analysis to identify CNV using normalised reads counts across genomic windows.

library("cobs")
library("quantreg")
library("parallel")
library("gplots")
library("dplyr")
library("kableExtra")
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
interpolate_points<-function(row,dat,curve){
  MY_X=dat[row,1]
  MY_Y=dat[row,2]
  VAL1=tail(which(curve[,1]<MY_X),1)
  VAL2=VAL1+1
  X <- curve[c(VAL1,VAL2),1]
  Y <- curve[c(VAL1,VAL2),2]
  INTERP_Y=approx(X,Y,xout=MY_X)$y
  INTERP_Y
}

1 Mbp bins

x <- read.table("SRP199233.1e6_fmt.tsv",header=T,row.names=1)
x <- x[which(rowSums(x)>=10),]
x <- sweep(x, 2, colSums(x), FUN="/")*1000000
mysd <- apply(x,1,sd)
mean <- apply(x,1,mean)
y <- data.frame(log10(mean),mysd/mean)
colnames(y) = c("logMean","cv")
Rbs.9 <- cobs(y$logMean,y$cv, nknots=10,constraint="none",tau=0.99)
## qbsks2():
##  Performing general knot selection ...
## 
##  Deleting unnecessary knots ...
Rbs.median <- cobs(y$logMean,y$cv,nknots=10,constraint="none",tau=0.5)
## qbsks2():
##  Performing general knot selection ...
## 
##  Deleting unnecessary knots ...
pred <- data.frame(predict(Rbs.9))

res <- mclapply(X=1:nrow(y),function(row) {
  interpolate_points(row,y,pred)
  },mc.cores=8)
y$interpolated <- unlist(res)

y$diff=y$cv-y$interpolated
yy <- y[which(y$diff>0.05),]
yy <- yy[order(-yy$diff),]

write.table(yy,file="SRP199233.1e6_regions.tsv")

yy %>% kbl() %>% kable_paper("hover", full_width = F)
logMean cv interpolated diff
7:32000000-33000000 1.514337 0.3780389 0.1594371 0.2186018
MT:0-16299 1.621759 0.2621939 0.1369478 0.1252461
9:124000000-124595110 1.941474 0.1903983 0.0882176 0.1021807
plot(y$logMean,y$cv,pch=18,cex=0.5,xlab="log10(mean)",ylab="CV")
lines(predict(Rbs.9), col = "red", lwd = 1.0)
lines(predict(Rbs.median), col = "blue", lwd = 1.0)
points(yy$logMean,yy$cv)
text(yy$logMean,yy$cv+0.02,labels=rownames(yy),cex=0.8)

my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 25)
zz<-x[which(rownames(x) %in% rownames(yy)),]
heatmap.2(as.matrix(zz),margin=c(8, 22),cexRow=0.65,trace="none",
  cexCol=0.8,col=my_palette,scale="row")

heatmap.2(cor(t(zz)),trace="none",scale="none",margins=c(12,12),
  cexRow=0.8, cexCol=0.8)

100 kbp bins

x<-read.table("SRP199233.1e5_fmt.tsv",header=T,row.names=1)
x<-x[which(rowSums(x)>=10),]
x<-sweep(x, 2, colSums(x), FUN="/")*1000000
mysd<-apply(x,1,sd)
mean<-apply(x,1,mean)
y<-data.frame(log10(mean),mysd/mean)
colnames(y)=c("logMean","cv")
Rbs.9 <- cobs(y$logMean,y$cv, nknots=10,constraint="none",tau=0.99)
## qbsks2():
##  Performing general knot selection ...
## 
##  WARNING! Since the number of  10  knots selected by  AIC  reached the
##    upper bound during general knot selection, you might want to rerun
##    cobs with a larger number of knots. 
## 
##  Deleting unnecessary knots ...
## 
##  WARNING! Since the number of  10  knots selected by  AIC  reached the
##    upper bound during general knot selection, you might want to rerun
##    cobs with a larger number of knots.
Rbs.median <- cobs(y$logMean,y$cv,nknots=10,constraint="none",tau=0.5)
## qbsks2():
##  Performing general knot selection ...
## 
##  Deleting unnecessary knots ...
pred<-data.frame(predict(Rbs.9))

res <- mclapply(X=1:nrow(y),function(row) {
  interpolate_points(row,y,pred)
  },mc.cores=8)
y$interpolated <- unlist(res)

y$diff=y$cv-y$interpolated
yy <- y[which(y$diff>0.05),]
yy <- yy[order(-yy$diff),]
yy <- head(yy,200)

write.table(yy,file="SRP199233.1e5_regions.tsv")

yy %>% kbl() %>% kable_paper("hover", full_width = F)
logMean cv interpolated diff
7:26000000-26100000 0.7328296 0.8076524 0.3768477 0.4308047
Y:42900000-43000000 -0.7795757 1.8135944 1.4661506 0.3474438
7:26100000-26200000 0.7526029 0.6582584 0.3678549 0.2904035
Y:19400000-19500000 -0.5049940 1.4829307 1.2101645 0.2727662
9:124200000-124300000 1.4183689 0.3858147 0.1432172 0.2425976
7:32300000-32400000 0.8291872 0.5611718 0.3339716 0.2272002
9:123900000-124000000 1.7001917 0.3110071 0.1010840 0.2099232
13:119500000-119600000 0.9692719 0.4388120 0.2774641 0.1613479
MT:0-16299 1.6221543 0.2621907 0.1129953 0.1491954
Y:12200000-12300000 -0.6857873 1.5206623 1.3757258 0.1449365
Y:24700000-24800000 -0.6804049 1.5066416 1.3706704 0.1359712
7:32200000-32300000 0.6757996 0.5376975 0.4036827 0.1340149
7:31700000-31800000 0.7891310 0.4692208 0.3514426 0.1177782
7:32500000-32600000 0.5227468 0.5736655 0.4813127 0.0923529
14:6900000-7000000 1.5638312 0.1921610 0.1208570 0.0713040
7:38300000-38400000 1.8168547 0.1634987 0.1030466 0.0604521
7:39100000-39200000 0.9358565 0.3461525 0.2902758 0.0558766
14:5900000-6000000 1.5713234 0.1755526 0.1200704 0.0554822
4:6200000-6300000 1.5866750 0.1712088 0.1184586 0.0527503
plot(y$logMean,y$cv,pch=18,cex=0.5,xlab="log10(mean)",ylab="CV")
lines(predict(Rbs.9), col = "red", lwd = 1.0)
lines(predict(Rbs.median), col = "blue", lwd = 1.0)
points(yy$logMean,yy$cv)
text(yy$logMean,yy$cv+0.02,labels=rownames(yy),cex=0.8)

my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 25)
zz<-x[which(rownames(x) %in% rownames(yy)),]
heatmap.2(as.matrix(zz),margin=c(8, 22),cexRow=0.65,trace="none",
  cexCol=0.8,col=my_palette,scale="row")

heatmap.2(cor(t(zz)),trace="none",scale="none",margins=c(12,12),
  cexRow=0.8, cexCol=0.8)

10 kbp bins

x<-read.table("SRP199233.1e4_fmt.tsv",header=T,row.names=1)
x<-x[which(rowSums(x)>=10),]
x<-sweep(x, 2, colSums(x), FUN="/")*1000000
mysd<-apply(x,1,sd)
mean<-apply(x,1,mean)
y<-data.frame(log10(mean),mysd/mean)
colnames(y)=c("logMean","cv")
Rbs.9 <- cobs(y$logMean,y$cv, nknots=10,constraint="none",tau=0.99)
## qbsks2():
##  Performing general knot selection ...
## 
##  Deleting unnecessary knots ...
## Warning in cobs(y$logMean, y$cv, nknots = 10, constraint = "none", tau = 0.99):
## The algorithm has not converged after 100 iterations
Rbs.median <- cobs(y$logMean,y$cv,nknots=10,constraint="none",tau=0.5)
## qbsks2():
##  Performing general knot selection ...
## 
##  Deleting unnecessary knots ...
pred<-data.frame(predict(Rbs.9))

res <- mclapply(X=1:nrow(y),function(row) {
  interpolate_points(row,y,pred)
  },mc.cores=8)
y$interpolated <- unlist(res)

y$diff=y$cv-y$interpolated
yy <- y[which(y$diff>0.2),]
yy <- yy[order(-yy$diff),]
yy <- head(yy,200)

write.table(yy,file="SRP199233.1e4_regions.tsv")

yy %>% kbl() %>% kable_paper("hover", full_width = F)
logMean cv interpolated diff
7:32240000-32250000 -0.2373112 1.4993115 0.9505398 0.5487717
7:7960000-7970000 -0.4359363 1.6231229 1.1993986 0.4237243
X:30040000-30050000 -0.7175903 2.0028778 1.6273673 0.3755105
9:123930000-123940000 1.2312546 0.6430982 0.2923801 0.3507181
12:114070000-114080000 -0.3445578 1.3963290 1.0795868 0.3167422
8:52960000-52970000 -0.5796499 1.7137573 1.4067886 0.3069688
7:32540000-32550000 -0.6211188 1.7778557 1.4710081 0.3068476
16:6120000-6130000 0.3082189 0.7689337 0.4908537 0.2780800
X:90380000-90390000 -0.1537834 1.1316620 0.8585531 0.2731089
9:123920000-123930000 1.0217671 0.5278489 0.2644314 0.2634176
X:37530000-37540000 -0.7138833 1.8795928 1.6211113 0.2584815
2:177610000-177620000 -0.7695387 1.9725124 1.7158171 0.2566953
2:71360000-71370000 0.4755420 0.6526598 0.4100564 0.2426034
7:26160000-26170000 -0.2149329 1.1674789 0.9250583 0.2424206
X:25410000-25420000 0.1645395 0.8147801 0.5800721 0.2347080
9:80520000-80530000 0.1234258 0.8395154 0.6094926 0.2300228
X:144590000-144600000 0.0714536 0.8733448 0.6496522 0.2236926
2:50990000-51000000 -0.1363275 1.0625373 0.8405647 0.2219725
X:70050000-70060000 0.0597184 0.8794734 0.6593997 0.2200736
X:82820000-82830000 0.1297576 0.8240179 0.6048271 0.2191908
7:26060000-26070000 -0.2734820 1.2111514 0.9925381 0.2186134
3:64530000-64540000 0.2562980 0.7382075 0.5205607 0.2176468
11:83130000-83140000 -0.5466603 1.5692902 1.3571514 0.2121388
1:66640000-66650000 0.6777967 0.5315521 0.3264069 0.2051452
X:34280000-34290000 -1.0512068 2.4494897 2.2476713 0.2018185
3:22150000-22160000 0.5698086 0.5720654 0.3702702 0.2017951
Y:84510000-84520000 -0.8097790 1.9868289 1.7861816 0.2006473
plot(y$logMean,y$cv,pch=18,cex=0.5,xlab="log10(mean)",ylab="CV")
lines(predict(Rbs.9), col = "red", lwd = 1.0)
lines(predict(Rbs.median), col = "blue", lwd = 1.0)
points(yy$logMean,yy$cv)
text(yy$logMean,yy$cv+0.02,labels=rownames(yy),cex=0.8)

my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 25)
zz<-x[which(rownames(x) %in% rownames(yy)),]
heatmap.2(as.matrix(zz),margin=c(8, 22),cexRow=0.65,trace="none",
  cexCol=0.8,col=my_palette,scale="row")

heatmap.2(cor(t(zz)),trace="none",scale="none",margins=c(12,12),
  cexRow=0.8, cexCol=0.8)

1 kbp bins

#```{r,1kbp1, fig.width = 8 ,fig.height = 8}

#x<-read.table(“SRP199233.1e3_fmt.tsv”,header=T,row.names=1) #x<-x[which(rowSums(x)>=10),] #x<-sweep(x, 2, colSums(x), FUN=“/”)*1000000 #mysd<-apply(x,1,sd) #mean<-apply(x,1,mean) #y<-data.frame(log10(mean),mysd/mean) #colnames(y)=c(“logMean”,“cv”) #Rbs.9 <- cobs(y\(logMean,y\)cv, nknots=10,constraint=“none”,tau=0.99) #Rbs.median <- cobs(y\(logMean,y\)cv,nknots=10,constraint=“none”,tau=0.5) #pred<-data.frame(predict(Rbs.9))

#res <- mclapply(X=1:nrow(y),function(row) { # interpolate_points(row,y,pred) # },mc.cores=8) #y$interpolated <- unlist(res)

#y\(diff=y\)cv-y\(interpolated #yy <- y[which(y\)diff>0.2),] #yy <- yy[order(-yy$diff),] #yy <- head(yy,200)

#write.table(yy,file=“SRP199233.1e3_regions.tsv”)

#plot(y\(logMean,y\)cv,pch=18,cex=0.5,xlab=“log10(mean)”,ylab=“CV”) #lines(predict(Rbs.9), col = “red”, lwd = 1.0) #lines(predict(Rbs.median), col = “blue”, lwd = 1.0) #points(yy\(logMean,yy\)cv) #text(yy\(logMean,yy\)cv+0.02,labels=rownames(yy),cex=0.8)

#my_palette <- colorRampPalette(c(“blue”, “white”, “red”))(n = 25) #zz<-x[which(rownames(x) %in% rownames(yy)),] #heatmap.2(as.matrix(zz),margin=c(8, 22),cexRow=0.65,trace=“none”, # cexCol=0.8,col=my_palette,scale=“row”)

#heatmap.2(cor(t(zz)),trace=“none”,scale=“none”,margins=c(12,12), # cexRow=0.8, cexCol=0.8)

#```

Session information

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] kableExtra_1.3.4 dplyr_1.0.7      quantreg_5.86    SparseM_1.81    
## [5] cobs_1.3-4       gplots_3.1.1    
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.9.2       tidyselect_1.1.1   xfun_0.25          bslib_0.2.5.1     
##  [5] purrr_0.3.4        splines_4.1.1      lattice_0.20-44    colorspace_2.0-2  
##  [9] vctrs_0.3.8        generics_0.1.0     viridisLite_0.4.0  htmltools_0.5.1.1 
## [13] yaml_2.2.1         utf8_1.2.2         rlang_0.4.11       jquerylib_0.1.4   
## [17] pillar_1.6.2       glue_1.4.2         DBI_1.1.1          matrixStats_0.60.0
## [21] lifecycle_1.0.0    stringr_1.4.0      MatrixModels_0.5-0 munsell_0.5.0     
## [25] rvest_1.0.1        caTools_1.18.2     evaluate_0.14      knitr_1.33        
## [29] fansi_0.5.0        highr_0.9          Rcpp_1.0.7         KernSmooth_2.23-20
## [33] conquer_1.0.2      scales_1.1.1       webshot_0.5.2      jsonlite_1.7.2    
## [37] systemfonts_1.0.2  digest_0.6.27      stringi_1.7.3      grid_4.1.1        
## [41] tools_4.1.1        bitops_1.0-7       magrittr_2.0.1     sass_0.4.0        
## [45] tibble_3.1.3       crayon_1.4.1       pkgconfig_2.0.3    ellipsis_0.3.2    
## [49] Matrix_1.3-4       xml2_1.3.2         assertthat_0.2.1   rmarkdown_2.10    
## [53] svglite_2.0.0      httr_1.4.2         rstudioapi_0.13    R6_2.5.0          
## [57] compiler_4.1.1