This repository includes data and scripts to analyze structural variations of 320 Chinese samples which includes 119 Tibetan samples and 201 Han samples using nanopore sequencing.
If you use the methods or pipeline in this repository, please cite our paper. Also, you can get detailed information about these methods in the supplementary method of our paper.
In the bash file pipeline.sv.calling.sh, we use sample data to demonstrate the complete process of detecting structural variations based on nanopore sequencing technology.
Firstly, The FASTQ files were aligned to the GRCh38 human reference genome available from NCBI, and a BAM format alignment file was produced using minimap2(Li 2021) with -t 30 --MD -Y -L -a -x map-ont
parameter..
Then Variants were called using Sniffles(Sedlazeck et al. 2018) with -s 2 --ignore_sd -l 50
parameter, cuteSV(Jiang et al. 2020) (parameter: --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 -s 2 -l 50 --genotype
) and nanovar(Tham et al. 2020) (parameter: -x ont -l 50 -s 10
).
The VCF format files per sample were merged using SURVIVOR if an SV was supported by at least two callers or flagged by cuteSV as high-quality with a priority of cuteSV>sniffles>nanovar to generate the genotypes.
DbSVmerge was used to obtain the nonredundant SV set for all high-quality SVs from 320 samples. The merging strategy was as follows: the distance of the variant coordinates between any two SVs must be less than 1 kb; for deletion-, duplication- and inversion-type SVs, at least 40% of the region should overlap; and for insertion SVs, the difference in length between insertions should be less than twice the length of both insertions. The numbers and lengths of SVs of different types (insertions, duplications, deletions and inversions) were counted.
High-quality SVs with upstream and downstream genes were annotated in the segdup, rmsk, dgv, 1KGP, gnomAD(Collins et al. 2020), dbvar, DGV, Decipher, OMIM, and ANNOVAR databases.
args = commandArgs(TRUE)
input = args[1]
output = args[2]
df = read.table(input,sep="\t",header=T)
EAS = df[,5]
AT = df[,2]
AL = df[,3]
HT = df[,4]
c1 = cor.test(EAS,AT);R1 = (c1$estimate)^2;p1=c1$p.value
c2 = cor.test(EAS,AL);R2 = (c2$estimate)^2;p2=c2$p.value
c3 = cor.test(EAS,HT);R3 = (c3$estimate)^2;p3=c3$p.value
p1 = ifelse(p1==0,"<2.2e-16",paste("=",p1))
p2 = ifelse(p2==0,"<2.2e-16",paste("=",p2))
p3 = ifelse(p3==0,"<2.2e-16",paste("=",p3))
col.raw = c("#00AFBB", "purple", "#FC4E07")
col = apply(col2rgb(col.raw)/255,2,function(x)rgb(x[1],x[2],x[3],0.8))
pdf(output,width=13,height=15)
mat = matrix(c(2,1,4,0,3,0),nc=2)
layout(mat=mat,width=c(10,2.8),height=c(2,10,3))
# main
par(mar=c(0.1,8,0.5,0.1),cex.lab=2.5,cex.axis=2.2,mgp=c(3.5,1,0),las=1,tck=-0.01,cex.main=2.5,font.lab=2,mgp=c(4,1,0))
plot(EAS,AT,type="n",ylab="Tibetan and/or Han AF",xlab="",ylim=c(0,1),xlim=c(0,1),xaxt="n",main="") # EAS AF
segments(0,0,1,1,lwd=1.2)
## AL
points(EAS,AL,cex=1.2,col=col[2],pch=18)
## HT
points(EAS,HT,cex=1.2,col=col[3],pch=20)
# ALL
points(EAS,AT,cex=1.2,col=col[1],pch=16)
abline(h=1.04)
axis(1)
# up EAS
par(mar=c(0.1,8,3,0.1),mgp=c(3,0.5,0))
hist(EAS,col="grey60",border="grey60",xaxt="n",xlab="",ylab="",breaks=seq(0,1,by=0.025),main="",axes=F)
legend('center',legend="EAS",bty="n",cex=2.5)
a2=axis(2,tick=F,labels=F)
axis(2,a2,prettyNum(a2,","))
#axis(1)
## right AL-HT
par(mar=c(0.1,0.1,2,1),mgp=c(3,0.5,0))
h1 = hist(AT,breaks=seq(0,1,by=0.05),plot=F)
h2 = hist(AL,breaks=seq(0,1,by=0.05),plot=F)
h3 = hist(HT,breaks=seq(0,1,by=0.05),plot=F)
wid = 0.014
par(las=3)
plot(1,xlab="AL-HT",ylab="",yaxt="n",xaxt="n",col=col[1],type="n",axes=F,xlim=c(0,max(h1$counts,h2$counts,h3$counts)),ylim=c(0,1))
for(i in 1:(length(h1$breaks)-1)){
rect(0,h1$breaks[i],h1$counts[i],h1$breaks[i]+wid,col=col.raw[1],border=NA)
rect(0,h1$breaks[i]+wid,h2$counts[i],h1$breaks[i]+wid*2,col=col.raw[2],border=NA)
rect(0,h1$breaks[i]+wid*2,h3$counts[i],h1$breaks[i]+wid*3,col=col.raw[3],border=NA)
}
legend('right',legend=c('Tibetan&Han','Tibetan','Han'),col=col.raw[1:3],pch=15,bty="n",cex=2.5)
a3=axis(3,tick=F,labels=F)
axis(3,a3,prettyNum(a3,","))
## bottom
par(mar=c(0.1,8,1,0.1),las=1,mgp=c(3,0.5,0))
plot(1,type="n",ylab="",xlab="",ylim=c(0,1),xlim=c(0,1),xaxt="n",axes=F)
legends <- vector('expression',3)
legends[1] <- substitute(expression(paste("EAS--Tibetan&Han: ",R^2 ," = ",rR1,", Pvalue ",p1)),list(rR1=round(R1,4),p1=p1))[2]
legends[2] <- substitute(expression(paste("EAS--Tibetan: ",R^2," = ", rR2,", Pvalue ",p2)),list(rR2=round(R2,4),p2=p2))[2]
legends[3] <- substitute(expression(paste("EAS--Han: ",R^2 ," = ", rR3,", Pvalue ",p3)),list(rR3=round(R3,4),p3=p3))[2]
legend('left',legend=legends,bty="n",col=col.raw,pch=c(16,18,20),cex=2.2)
dev.off()
ref=read.table("windows.bed.1K.gc.txt",header=F,sep="\t")
ref[,3] = ref[,3]-1 ## end
#ref[,1] = paste0('chr',ref[,1]) ## chrom
colnames(ref) = c('chr','start','end','gc')
ref = ref[,-3]
AL.DEL = read.table("splide/DEL.splide.counts.xls",header=T,sep="\t")
AL.DEL = AL.DEL[,-3]
mer = merge(ref,AL.DEL,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'AL-DEL'
AL.INS = read.table("splide/INS.splide.counts.xls",header=T,sep="\t")
AL.INS = AL.INS[,-3]
mer = merge(mer,AL.INS,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'AL-INS'
EAS.DEL = read.table("EAS/DEL.splide.counts.xls",header=T,sep="\t")
EAS.DEL = EAS.DEL[,-3]
mer = merge(mer,EAS.DEL,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'EAS-DEL'
EAS.INS = read.table("EAS/INS.splide.counts.xls",header=T,sep="\t")
EAS.INS = EAS.INS[,-3]
mer = merge(mer,EAS.INS,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'EAS-INS'
mer[is.na(mer)] = 0
out = mer[,-c(1:2)]
class = cut(out$gc,breaks=seq(0,1,by=0.01))
out$gc = as.character(class)
library(reshape2)
mel = melt(out,id="gc")
mel[is.na(mel)]="(0,0.01]"
dca=dcast(mel,gc~variable,sum) ## col=5
dca$ratio <- apply(dca[,2:3],1,sum)/apply(dca[,2:5],1,sum)
dca[is.na(dca)]=0.0
nr = nrow(dca)
rect.width = 0.25
Col2rgb=function(a){
apply(col2rgb(a)/255,2,function(x)rgb(x[1],x[2],x[3],0.7))
}
#col = Col2rgb(c("darkred","darkgreen","#00AFBB","#E7B800"))
col = Col2rgb(c("#E64B35","#F39B7F","#4DBBD5","#00A087"))
pdf('DEL--INS.splide.gc.distribution.line.num.pdf')
plot(1,type="n",xlab="GC content",ylab="The number of SV",xlim=c(0,nrow(dca)+1),ylim=c(0,max(dca[,2])+100),bty="l",xaxt="n",main="")
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,2],0),border="grey80",col=Col2rgb("darkred"))
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,3],0),border="grey80",col=Col2rgb("darkgreen"))
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,4],0),border="grey80",col=Col2rgb("#00AFBB")) ## c("#00AFBB", "#E7B800", "#FC4E07")
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,5],0),border="grey80",col=Col2rgb("#E7B800"))
polygon(x=c(1,1:nr,nr),y=c(0,dca[,2],0),border="grey80",col=Col2rgb("#E64B35"))
polygon(x=c(1,1:nr,nr),y=c(0,dca[,3],0),border="grey80",col=Col2rgb("#F39B7F"))
polygon(x=c(1,1:nr,nr),y=c(0,dca[,4],0),border="grey80",col=Col2rgb("#4DBBD5")) ## c("#00AFBB", "#E7B800", "#FC4E07")
polygon(x=c(1,1:nr,nr),y=c(0,dca[,5],0),border="grey80",col=Col2rgb("#00A087"))
v1 = c(0,max(dca[,2])+100)
lines(rep(55,length(v1)),v1,col="black")
lines(rep(25, length(v1)), v1, col="black")
#axis(1,1:nr,gsub('\\(','',sapply(strsplit(dca[,1],","),'[',1)))
axis(1,seq(0,nr,10),c('0%','10%','20%','30%','40%','50%','60%','70%','80%'))
legend("topleft",legend=c('DEL (Tibetan & Han)','INS (Tibetan & Han)','DEL (EAS)','INS (EAS)'),col=col,bty="n",pch=15,cex=1.2)
dev.off()
sv_sum = c(sum(dca[,5]), sum(dca[,4]), sum(dca[,3]), sum(dca[,2]))
sv_sum_30 = c(sum(dca[1:25,5]),sum(dca[1:25,4]),sum(dca[1:25,3]),sum(dca[1:25,2]))
sv_sum_55 = c(sum(dca[55:nr,5]),sum(dca[55:nr,4]),sum(dca[55:nr,3]),sum(dca[55:nr,2]))
sv_ratio_30 = sv_sum_30/sv_sum
sv_ratio_55 = sv_sum_55/sv_sum
pdf('bar.pdf')
col2 = Col2rgb(c("#00A087", "#4DBBD5", "#F39B7F", "#E64B35"))
plot(1,type="n",xlab="",ylab="",xlim=c(0,4),ylim=c(0,0.03),bty="l",xaxt="n", yaxt="n", main="25% GC content")
rect(0.5:3.5-rect.width,0,0.5:3.5+rect.width,sv_ratio_30,col=col2,border=NA)
#axis(1,0.5:3.5, c(1,2,3,4))
#axis(2, 0:0.03)
dev.off()
pdf('bar2.pdf')
col2 = Col2rgb(c("#00A087", "#4DBBD5","#F39B7F", "#E64B35"))
plot(1,type="n",xlab="",ylab="",xlim=c(0,4),ylim=c(0,0.3),bty="l",xaxt="n", yaxt="n", main="55% GC content")
rect(0.5:3.5-rect.width,0,0.5:3.5+rect.width,sv_ratio_55,col=col2,border=NA)
#axis(1,0.5:3.5, c(1,2,3,4))
#axis(2, 0:0.2)
dev.off()
dca[,-1] = apply(dca[,-1],2,function(x)(x-mean(x))/sd(x))
pdf('DEL--INS.splide.gc.distribution.line.scale.pdf')
plot(1,type="n",xlab="",ylab="sv number(scale)",xlim=c(0,nrow(dca)+1),ylim=c(min(apply(dca[,2:3],1,sum)),max(apply(dca[,2:3],1,sum))),bty="l",xaxt="n")
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,2],min(dca[,2:5])),border="grey80",col=Col2rgb("darkred"))
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,3],min(dca[,2:5])),border="grey80",col=Col2rgb("darkgreen"))
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,4],min(dca[,2:5])),border="grey80",col=Col2rgb("#00AFBB")) ## c("#00AFBB", "#E7B800", "#FC4E07")
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,5],min(dca[,2:5])),border="grey80",col=Col2rgb("#E7B800"))
axis(1,1:nr,gsub('\\(','',sapply(strsplit(dca[,1],","),'[',1)))
legend("topleft",legend=c('Tibetan-Han_DEL','Tibetan-Han_INS','EAS-DEL','EAS-INS'),col=col,bty="n",pch=15,cex=1.5)
dev.off()
library(VennDiagram)
library(data.table)
args = commandArgs(TRUE)
mergefile = args[1]
df = fread(mergefile,sep="\t",header=F)
df = as.data.frame(df)
df = df[df[,7]>=50,,drop=F]
tab = table(df[,1])
oneID = names(tab[tab==1])
term = unique(df[,8])
for (i in term){
if (grepl('Ebert32',i))
{
Ebert32 = unique(df[df[,8]==i,1])
}
else if (grepl('Han405',i))
{
Han405 = unique(df[df[,8]==i,1])
}
else if (grepl('Audano15',i))
{
Audano15 = unique(df[df[,8]==i,1])
}
else if (grepl('Icelander3622',i))
{
Icelander3622 = unique(df[df[,8]==i,1])
}
else
{
TH = unique(df[df[,8]==i,1])
}
}
allother=unique(c(Ebert32,Han405,Audano15,Icelander3622))
x = list("Tibetan-Han"=TH,"Ebert32,Han405,Audano15,Icelander3622"=allother)
p = venn.diagram(x=x,col="transparent", fill=c('cornflowerblue','yellow'), alpha=0.6, cex=1, cat.cex=1,label.col="black",fontfamily = "serif", fontface = "bold",filename=NULL,output=TRUE)
pdf("figure.S1F.TGS.v2.pdf")
grid.draw(p)
dev.off()
library(ggplot2)
library(dplyr)
# Create Data
data <- data.frame(
group=c("Repeat Region","Nonrepeat"),
value=c(8134,1866)
)
# Compute the position of labels
data <- data %>%
arrange(desc(group)) %>%
mutate(prop = value / sum(data$value) *100) %>%
mutate(ypos = cumsum(prop)- 0.5*prop )
# Basic piechart
#pie color c("#00AFBB", "#E7B800")
pie=ggplot(data, aes(x="", y=prop, fill=group)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void() +
theme(legend.position="none") +
geom_text(aes(y = ypos, label = prop), color = "white", size=5) +
scale_fill_brewer(palette = "Set3")
pdf("pie-new.pdf",width = 4, height = 4)
print(pie)
dev.off()
condition <- c("SINE","Simple Repeat","LINE" ,"LTR", "Other","Satellite","Low complexity","Seg Dup","Nonrepeat")
value <- c(0.1459,0.212,0.0836,0.0449,0.0492,0.0446,0.0171,0.0696,0.3331)
data <- data.frame(condition,value)
data$condition <- factor(data$condition, levels=condition, ordered=TRUE)
library(ggplot2)
library(scales)
library(ggsci)
library(ggplot2)
mypal=pal_npg("nrc",alpha=1)(9)
p=ggplot(data,aes(data$condition,data$value,fill=data$condition))+
geom_bar(stat="identity",fill=mypal,width = 0.7)+
scale_y_continuous(expand = c(0,0),limits=c(0,1.1*max(data$value)))+
xlab("Genomic elements")+
ylab("Frequency")+
theme_bw()+
theme(panel.grid.major =element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
legend.position = "none",
axis.line = element_line(colour = "black"),
axis.text.x=element_text(angle=0,hjust = 1,size = 10),
axis.text.y = element_text(size = 10),
plot.title=element_text(hjust=0.7),
axis.title.y = element_text(size = 15))
pdf("barplot-new.pdf",width =10,height = 6)
print(p)
dev.off()
args =commandArgs(TRUE)
file = args[1]
outpdf = args[2]
#file = "data.input.nogap.xls"
library(circlize)
data = read.table(file,sep="\t",header=T)
main.data = data[,5:8]
#main.data = apply(main.data,2,function(x)(x-mean(x))/sd(x))
## hg19
plotcytoBand = function(chrom,bm=1000000,start,d=0.15,D=0.5,cex=1.6,par=T,y1,y2){
#file = system.file(package = "circlize","extdata", "cytoBand.txt")
file = "cytoBand.txt"
options(stringsAsFactors=F)
cyto = read.table(file,header=F,sep="\t")
colnames(cyto)<-c("chr","start","end","name","type")
#cyto$start = cyto$start / bm
#cyto$end = cyto$end / bm
cyto$Color = "white" # white
cyto[cyto$type=="gneg",]$Color<-rgb(255,255,255, maxColorValue=255)
cyto[cyto$type=="gpos25",]$Color<-rgb(200,200,200, maxColorValue=255)
cyto[cyto$type=="gpos50",]$Color<-rgb(200,200,200, maxColorValue=255)
cyto[cyto$type=="gpos75",]$Color<-rgb(130,130,130, maxColorValue=255)
cyto[cyto$type=="gpos100",]$Color<-rgb(200,200,200, maxColorValue=255)
cyto[cyto$type=="acen",]$Color<-"yellow" # red centromere
cyto[cyto$type=="stalk",]$Color<-rgb(100,127,164, maxColorValue=255) # repeat regions
cyto[cyto$type=="gvar",]$Color<-rgb(220,220,220, maxColorValue=255) # indented region
cyto$chr = gsub('chr','',cyto$chr)
cyto = cyto[cyto$chr==as.character(chrom),,drop=F]
bei = max(cyto$end)
cyto$start = cyto$start/bei*bm+start
cyto$end = cyto$end/bei*bm+start
acen.w = which(cyto$type=="acen")
cyto.up = cyto[1:(acen.w[1]-1),,drop=F]
cyto.acen1 = cyto[acen.w[1],,drop=F]
cyto.acen2 = cyto[acen.w[2],,drop=F]
cyto.down = cyto[(acen.w[2]+1):nrow(cyto),,drop=F]
rect(min(cyto.up$start),y1,max(cyto.up$end),y2,col=NA,border="black",lwd=2)
rect(cyto.up$start,y1,cyto.up$end,y2,col=cyto.up$Color,border=NA)
rect(min(cyto.down$start),y1,max(cyto.down$end),y2,col=NA,border="black",lwd=2)
rect(cyto.down$start,y1,cyto.down$end,y2,col=cyto.down$Color,border=NA)
polygon(x=c(cyto.acen1$start[1],cyto.acen1$end[1],cyto.acen1$start[1],cyto.acen1$start[1]),y=c(y1,(y1+y2)/2,y2,y1),col=2,border=2)
polygon(x=c(cyto.acen2$end[1],cyto.acen2$start[1],cyto.acen2$end[1],cyto.acen2$end[1]),y=c(y1,(y1+y2)/2,y2,y1),col=2,border=2)
}
rmsk.max = max(data[,5])
SINE.genome = data[,5]/rmsk.max
LINE.genome = data[,6]/rmsk.max
LTR.genome = data[,7]/rmsk.max
SD.genome = data[,8]/rmsk.max
SD.genome[SD.genome>1]=1
chromlen = table(data[,1])
fenmu = min(max(data[,9]),max(data[,10]),max(data[,11]),max(data[,12]))
sine.sv = data[,9]/fenmu
sine.sv[sine.sv>1]=1
line.sv = data[,10]/fenmu
line.sv[line.sv>1]=1
ltr.sv = data[,11]/fenmu
ltr.sv[ltr.sv>1]=1
sd.sv=data[,12]/fenmu
sd.sv[sd.sv>1]=1
allsv = data[,4]/fenmu
allsv[allsv>1]=1
gene.density = data[,13]/max(data[,13])
svtype.data = data[,14:18]
svtype.data = svtype.data/fenmu
del.data = ifelse(svtype.data[,1]>1,1,svtype.data[,1])
dup.data = ifelse(svtype.data[,2]>1,1,svtype.data[,2])
ins.data = ifelse(svtype.data[,3]>1,1,svtype.data[,3])
inv.data = ifelse(svtype.data[,4]>1,1,svtype.data[,4])
tra.data = ifelse(svtype.data[,5]>1,1,svtype.data[,5])
els.data = data[,19:20]
els.data[,1] = els.data[,1]/max(els.data[,1])
els.data[,2] = els.data[,2]/max(els.data[,2])
hicdamin.data = data[,21:23]
hicdamin.data[,1] = hicdamin.data[,1]/max(hicdamin.data[,1])
hicdamin.data[,2] = hicdamin.data[,2]/max(hicdamin.data[,2])
hicdamin.data[,3] = hicdamin.data[,3]/max(hicdamin.data[,3])
##heatmap.col = c(rgb(31/255,49/255,94/255),rgb(34/255,103/255,179/255),rgb(63/255,148/255,198/255),rgb(148/255,197/255,224/255),rgb(212/255,229/255,245/255),rgb(251/255,250/255,251/255),rgb(252/255,220/255,200/255),rgb(247/255,165/255,130/255),rgb(219/255,95/255,79/255),rgb(180/255,30/255,48/255),rgb(106/255,9/255,36/255))
col.breaks = c(min(main.data),-2,-1.5,-1,-0.5,0,0.5,1,1.5,2,max(main.data))
"11"="#ada397","12"='#66c2a5',"13"='#fc8d62',"14"='#8da0cb',"15"='#e78ac3',"16"='#a6d854',"17"='#ffd92f',"18"='#e5c494',"19"='#b3b3b3',"20"="#20854E","21"="#7876B1","22"="#6F99AD","23"="#FFDC91","24"="#EE4C97")
chrom_col=c("1"="#a2b9bc","2"="#b2ad7f","3"="#878f99","4"="#6b5b95","5"="#feb236","6"="#d64161","7"="#ff7b25","8"="#d6cbd3","9"="#eca1a6","10"="#bdcebe","11"="#ada397","12"='#66c2a5',"13"='#fc8d62',"14"='#8da0cb',"15"='#e78ac3',"16"='#a6d854',"17"='#ffd92f',"18"='#e5c494',"19"='#b3b3b3',"20"="#20854E","21"="#7876B1","22"="#6F99AD","23"="#FFDC91","24"="#EE4C97")
pdf(outpdf,width=30,height=20)
layout(mat=matrix(c(1,2,3),nc=1),height=c(1.5,9,2))
par(mar=c(1,2,5,35)) # 1,8
plot(1,type="n",xlim=c(0.5,nrow(main.data)+10.5),ylim=c(0,2),xlab="",ylab="",axes=F,xaxs="i")
#chr.uniq = unique(data[,1])
start = 0.5
start.index = 1
k = 22
chrom.seq = c()
bgdata = matrix(0,nc=2,nr=11)
for(i in 1:k){
cluster.num = i
nx = sum(data[,1]==cluster.num)
chrom.seq = c(chrom.seq,start+nx)
#rect(start,0.1,start+nx,1.9,col=chrom_col[i],border=NA)
if(i%%2==1) {
bgdata[(i+1)/2,1] = start
bgdata[(i+1)/2,2] = start+nx
rect(start,1.1,start+nx,1.9,col='grey80',border=NA)
text((start+start+nx)/2,1.5,cluster.num,col="black",cex=3)
}else{
rect(start,1.1,start+nx,1.9,col='grey30',border=NA)
text((start+start+nx)/2,1.5,cluster.num,col="white",cex=3)
}
plotcytoBand(chrom=i,bm=nx,start,d=0.15,D=0.5,cex=1.6,par=T,y1=0.1,y2=0.9)
start = start+nx
start.index = start+0.5
}
require("RColorBrewer")
line.col = colorRampPalette(brewer.pal(9, "Paired"))(8)
line.col[1] = 'purple'
line.col[4] = 'Cyan1'
par(mar=c(8,2,0,35),cex.axis=5,las=1,mgp=c(0.5,0.1,0),cex.lab=5,font.lab=2)
y2 = 7+4+2+3+4
plot(1,type="n",xlim=c(0.5,nrow(main.data)+13.5),ylim=c(-0.5,y2),ylab="",axes=F,xlab="",xaxs="i",yaxs="i")
rect(bgdata[,1],-0.5,bgdata[,2],y2,col=rgb(204/255,204/255,204/255,0.6),border=NA)
lines(1:nrow(data),y2-1+gene.density,col = line.col[1])
lines(1:nrow(data),y2-2+SD.genome,col = line.col[2])
lines(1:nrow(data),y2-3+LTR.genome,col = line.col[2])
lines(1:nrow(data),y2-4+LINE.genome,col = line.col[2])
lines(1:nrow(data),y2-5+SINE.genome,col = line.col[2])
lines(1:nrow(data),y2-6+sd.sv,col = line.col[3])
lines(1:nrow(data),y2-7+ltr.sv,col = line.col[3])
lines(1:nrow(data),y2-8+line.sv,col = line.col[3])
lines(1:nrow(data),y2-9+sine.sv,col = line.col[3])
lines(1:nrow(data),y2-10+allsv,col = line.col[4])
lines(1:nrow(data),y2-11+del.data,col = line.col[4]) # DEL
lines(1:nrow(data),y2-12+dup.data,col = line.col[4]) # DUP
lines(1:nrow(data),y2-13+ins.data,col = line.col[4]) # INS
lines(1:nrow(data),y2-14+inv.data,col = line.col[4]) # INV
lines(1:nrow(data),y2-15+tra.data,col = line.col[4]) # TRA
lines(1:nrow(data),y2-16+els.data[,1],col = line.col[5])
lines(1:nrow(data),y2-17+els.data[,2],col = line.col[6])
lines(1:nrow(data),y2-18+hicdamin.data[,1],col = line.col[7])
lines(1:nrow(data),y2-19+hicdamin.data[,2],col = line.col[8])
lines(1:nrow(data),y2-20+hicdamin.data[,3],col = line.col[8])
#chrom.seq = chrom.seq[-length(chrom.seq)]
#abline(v=chrom.seq,col="black",lwd=0.1)
par(xpd=T)
axis(4,at=(y2-1):0+0.5,c('Gene Density','SD(genome)','LTR(genome)','LINE(genome)','SINE(genome)','SD(SV)','LTR(SV)','LINE(SV)','SINE(SV)','ALL SV Density','DEL Density','DUP Density','INS Density','INV Density','TRA Density','Enhancer','Promoter','HiC(boundary)','LAMIN(boundary)','LAMIN(domains)'),tick=F)
par(mar=c(0,1,2.5,35))
plot(1,type="n",axes=F,xlab="",ylab="")
legend('top',legend=c('Gene Density','Repeat Coverage(genome)','Repeat (SV Density)','SV Density','Enhancer','Promoter','HiC-boundary','LAMIN'),col=line.col,bty="n",ncol=4,lty=1,cex=3,lwd=4)
dev.off()
library(corrplot)
library(psych)
library(RColorBrewer)
args = commandArgs(TRUE)
input = args[1]
outpdf = args[2]
df = read.table(input,header=T,check.names=F)
df = df[c(13,8,7,6,5,12,11,10,9,4,14,15,16,17,18,19,20,21,22,23)]
colnames(df) = c('Gene Density','SD(genome)','LTR(genome)','LINE(genome)','SINE(genome)','SD(SV)','LTR(SV)','LINE(SV)','SINE(SV)','ALL SV Density','DEL Density','DUP Density','INS Density','INV Density','TRA Density','Enhancer','Promoter','HiC(boundary)','LAMIN(boundary)','LAMIN(domains)')
cor = cor(df)
cor_p = corr.p(cor,n=nrow(df),alpha=.05)
#cor[lower.tri(cor_p$p)] = cor_p$p[lower.tri(cor_p$p)]
#write.table(cor,"TE-SV.corAndP.spearman.xls",col.names=T,row.names=T,quote=F,sep="\t")
res1 = cor.mtest(df,conf.level=.95)
pdf(outpdf,width=15,height=15)
#corrplot.mixed(cor,p.mat=res1$p,lower = "number",upper = "circle",
# tl.pos = "lt",lower.col = "black",insig='label_sig',sig.level=c(.001,.01,.05),pch.cex=0.6)
diag(cor)=1
diag(res1$p)=1
col1 = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
col = colorRampPalette(c(rgb(0,0,0.8),rgb(0.8,0,0)))(100)
corrplot(cor, p.mat=res1$p,method = "circle", type = "upper", tl.pos = "lt",insig='label_sig',sig.level=c(.001,.01,.05),pch.cex=0.6,diag=TRUE,col=col,pch.col="white")
corrplot(cor, add = TRUE, type = "lower", method = "number", diag = FALSE, tl.pos = "n", cl.pos = "n",col=col)
dev.off()
args = commandArgs(TRUE)
input = args[1]
#input = "ALL.pca.info.xls"
outpdf = args[2]
library(data.table)
data = fread(input,sep="\t",header=T)
data = as.data.frame(data)
group_col = c("#009CFF", "#FF5500", "#763400","#9E819B","#D20000", "#FFC600","#54FFAA")
#group_col = apply(col2rgb(group_col)/255,2,function(x)rgb(x[1],x[2],x[3],0.6))
#Group = data$Group
data$Group = as.factor(data$Group)
group.levels = levels(data$Group)
sample.col = group_col[as.numeric(data$Group)]
names(group_col) = group.levels
draw.data = data[,-c(1:2)]
pc.rowdata = fread("usandgenotypes.1000genomeintersect.noXY.matrix.filter",header=T,sep="\t")
pc.rowdata = as.data.frame(pc.rowdata)
pc.rowdata = pc.rowdata[,-1]
MAF = apply(pc.rowdata,1,function(x)min(table(x)/length(x)))
pc.rowdata = pc.rowdata[MAF>1/ncol(pc.rowdata),]
deig=eigen(cor(scale(pc.rowdata)))
k = sum(deig$values)
pc1_rate=sprintf("%2.2f%%",deig$values[1]/k*100)
pc2_rate=sprintf("%2.2f%%",deig$values[2]/k*100)
pc3_rate=sprintf("%2.2f%%",deig$values[3]/k*100)
pc4_rate=sprintf("%2.2f%%",deig$values[4]/k*100)
pc5_rate=sprintf("%2.2f%%",deig$values[5]/k*100)
pc6_rate=sprintf("%2.2f%%",deig$values[6]/k*100)
lab1 = paste("PC1,",pc1_rate)
lab2 = paste("PC2,",pc2_rate)
lab3 = paste("PC3,",pc3_rate)
lab4 = paste("PC4,",pc4_rate)
lab5 = paste("PC5,",pc5_rate)
lab6 = paste("PC6,",pc6_rate)
PClabel = c(lab1,lab2,lab3,lab4,lab5,lab6)
plot1 =function(x,group){
group.data = split(x,group)
grouphistinfo = lapply(group.data,hist,plot=F,breaks=10)
y.max = max(unlist(lapply(grouphistinfo,function(x)max(x$density))))
plot(1,xlim=c(min(x),max(x)),ylim=c(0,y.max),type="n",xlab="",ylab="",xaxt="n",yaxt="n")
grid()
lapply(1:length(grouphistinfo),function(x){
groupi = names(grouphistinfo)[x]
grouphistinfoi = grouphistinfo[[x]]
polygon(x=c(grouphistinfoi$breaks,grouphistinfoi$breaks[length(grouphistinfoi$breaks)]),
y=c(0,grouphistinfoi$density,0),col = group_col[groupi],border=NA)
})
}
plot2 = function(x,y,col){
plot(x,y,xaxt="n",yaxt="n",xlab="",ylab="",type="n")
grid()
points(x,y,col=col,pch=16,cex=0.9)
}
main.plot = function(mar=mar){
for(i in 1:6){
for(j in i:6){
par(mar=mar)
if(i==j){
plot1(draw.data[,i],group=data$Group)
}else{
plot2(x=draw.data[,i],y=draw.data[,j],col=sample.col)
}
}
}
}
#outpdf = "g1000.PC1-PC6.pdf"
png(outpdf,width=15,height=15,units="cm",res=400)
##pdf(outpdf,width=15,height=15)
#mat = matrix(c(22,1:6,23,0,7:11,24,0,0,12:15,25,0,0,0,16:18,26,0,0,0,0,19:20,27,0,0,0,0,0,21,0,28,29,30,31,32,33),nc=7)
mat = matrix(c(22,1:6,23,0,7:11,24,0,0,12:15,25,0,0,0,16:18,26,34,34,34,0,19:20,27,34,34,34,0,0,21,0,28,29,30,31,32,33),nc=7)
layout(mat=mat,height = c(2,5,5,5,5,5,5),width=c(5,5,5,5,5,5,2))
# main
mar = c(0.5,0.5,0.5,0.5)
main.plot(mar=mar)
# col pc1 ---pc6 label
for(i in 1:6){
par(mar=mar)
plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
rect(0,0,1,1,col="grey80",border="grey80")
text(0.5,0.5,PClabel[i],cex=1)
}
# row PC1--PC6 label
for(i in 1:6){
par(mar=mar,srt=90)
plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
rect(0,0,1,1,col="grey80",border="grey80")
text(0.5,0.5,PClabel[i],cex=1)
}
par(srt=0)
# legend
plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
legend('center',legend=group.levels,col=group_col,bty="n",pch=16,cex=2.5)
dev.off()
readvcf <- function(file){
library(data.table)
df <- fread(file)
df <- as.data.frame(df)
df <- df[,-c(3:9)]
colnames(df)[1]="CHROM"
colnames(df)=gsub('(.*).ngmlr.*','\\1',colnames(df))
df[,-(1:2)] = apply(df[,-(1:2)],1:2, gsub ,pattern=':.*',replacement='')
dfdata <- df[,-(1:2)]
dfdata <- ifelse(dfdata=="1/1"|dfdata=="0/1",1,0)
data = data.frame(df[,1:2],dfdata,check.names=F)
return(data)
}
library(data.table)
vcfFile = "Final.dbsvmerge.Tibetan-Han.matrix.xls"
samplelist="samplelist"
outtre="Tibetan_Han.tre"
vcf = fread(vcfFile,header=T)
vcf = as.data.frame(vcf)
data = vcf[,-1]
MAF = apply(data,1,sum)
data = data[MAF>1,]
# "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
# "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid"
# "euclidean", "maximum", "manhattan", "canberra", "binary", "pearson", "abspearson", "correlation", "abscorrelation", "spearman" or "kendall".
hc = hclust(amap::Dist(t(data),method = "euclidean", nbproc=15), method = "ward.D")
merge = hc$merge
labels = hc$labels
n = nrow(merge)
list = list()
for(i in 1:n){
if(merge[i,1]<0&merge[i,2]<0)list[[i]]=paste("(",labels[-1*merge[i,1]],",",labels[-1*merge[i,2]],")",sep="");
if(merge[i,1]<0&merge[i,2]>0)list[[i]]=paste("(",labels[-1*merge[i,1]],",",list[[merge[i,2]]],")",sep="");
if(merge[i,1]>0&merge[i,2]<0)list[[i]]=paste("(",list[[merge[i,1]]],",",labels[-1*merge[i,2]],")",sep="");
if(merge[i,1]>0&merge[i,2]>0)list[[i]]=paste("(",list[[merge[i,1]]],",",list[[merge[i,2]]],")",sep="")
}
tre = paste(list[[n]],";",sep="")
write.table(tre,outtre,col.names=F,row.names=F,quote=F)
options(stringsAsFactors=F)
library(ggtree)
pdf(paste0(outtre, date(),'.pdf'),width=10,height=10)
tree<-read.tree(outtre)
samplegroup = read.table(samplelist,sep="\t",header=F)
samplegroup = samplegroup[match(tree$tip.label,samplegroup[,1]),]
group = samplegroup[,2]
cls = split(samplegroup[,1],group)
tree = groupOTU(tree,cls)
print(tree$group)
ggtree(tree,layout="fan",ladderize=T,branch.length="none",aes(color=group))+
geom_tiplab(aes(angle=angle),size=2)+theme(legend.position='right')
dev.off()
library(data.table)
library(reshape2)
args = commandArgs(TRUE)
outpdf = args[1]
chromfile = "input/chrom.len"
svfile = "input/sv.fst.fordraw.xls"
snpfile = "input/snp.fst.fordraw.01.xls"
indelfile = "input/indel.fst.fordraw.01.xls"
genegfffile = "input/hg38.gtf.bed.gz"
genegff = gettextf("gzip -dc %s",genegfffile)
posfile = "pos.txt"
options(stringsAsFactors=F)
chrom.data = read.table(chromfile,header=F,sep="\t")
data1 = fread(svfile,header=T,sep="\t")
data1 = as.data.frame(data1)
data2 = fread(snpfile,header=T,sep="\t")
data2 = as.data.frame(data2)
data3 = fread(indelfile,header=T,sep="\t")
data3 = as.data.frame(data3)
gene.data = fread(genegff,header=F,sep="\t")
gene.data = as.data.frame(gene.data)
gene.data[,1] = gsub('chr','',gene.data[,1])
pos.data = read.table(posfile,header=F,sep="\t",fill=F)
var.color =c("#E64B35","#00A087","#3C5488")
pch = 15:17
if(grepl('.png$',outpdf)) png(outpdf,width=30,height=20, units = "in",res=300)
if(grepl('.pdf$',outpdf)) pdf(outpdf,width=30,height=20)
# up
par(mar=c(3,5,3,4))
up.manhattan(chrom.data,sv.data=data1,snp.data=data2,indel.data=data3,col=var.color,pos.data=pos.data)
dev.off()
library(data.table)
library(reshape2)
#output = "Manhattan.VAR.png"
up.manhattan = function(chrom.data,sv.data,snp.data,indel.data,col,pos.data){
# chrom.data
th = 0.1
chrom.len = chrom.data[,2]
chrom.label = chrom.data[,1]
x.cumsum = cumsum(as.numeric(chrom.len))
x.axis = x.cumsum-chrom.len/2
x.label = chrom.label
data1 = sv.data
data2 = snp.data
data3 = indel.data
# for line 1M one points
data1.max = max1Mfst(data1)
data2.max = max1Mfst(data2)
data3.max = max1Mfst(data3)
pos.data = pos.data[,1:3]
for(k in 1:nrow(pos.data)){
match.chr = match(pos.data[k,1],chrom.label)
addnum = ifelse(match.chr==1,0,x.cumsum[match.chr-1])
pos.data[k,2] = pos.data[k,2]+addnum
pos.data[k,3] = pos.data[k,3]+addnum
}
# draw
#col = c("#E64B35","#00A087","#3C5488")
par(las=1,cex.lab=2,cex.main=4,cex.axis=1.5,mgp=c(2,1,0),lab=c(5,3,0),mar=c(3,12,0.5,0.5))
cex.p = 1
plot(1,xlab="",ylab="Fst",xaxt="n",yaxt="n",ylim=c(-0.2,0.9),type="n",xlim=c(0,max(x.cumsum)))
rect(0,0,max(x.cumsum),th,col="grey90",border="grey90")
rect(pos.data[,2],-0.2,pos.data[,3],0.9,col="#FFFF00B2",border="#FFFF00B2")
# rect(0,0,max(x.cumsum),th,col="grey90",border="grey90")
#
segments(x.cumsum,0,x.cumsum,0.01,col="white",lty=1,lwd=1)
points(data2$x,data2[,3],cex=cex.p,col = data2$color,pch=data2$pch)
points(data3$x,data3[,3],cex=cex.p,col = data3$color,pch=data3$pch)
points(data1$x,data1[,3],cex=cex.p+0.1,col = data1$color,pch=data1$pch)
abline(h=th,lty=1,col="red")
lines(data1.max$x,data1.max[,2]-0.2,col=col[1],lwd=2)
lines(data2.max$x,data2.max[,2]-0.15,col=col[2],lwd=2)
lines(data3.max$x,data3.max[,2]-0.1,col=col[3],lwd=2)
abline(h=c(-0.1,-0.15,-0.2),lty=1,col="grey60")
axis(1,x.axis,x.label,tick=F)
segments(x.cumsum,-0.3,x.cumsum,-0.23,col="black",lty=1,lwd=2)
segments(0,-0.3,0,-0.23,col="black",lty=1,lwd=2)
# new line
# segments(x.cumsum,-0.2,x.cumsum,0.01,col="white",lty=1,lwd=1)
axis(2,th,th,lwd=2)
axis(2,at=c(0,0.2,0.4,0.6,0.8))
axis(2,0,0)
axis(2,0.8,0.8)
# gene name
# data1 = data1[data1[,4]!="False",,drop=F]
# text(data1$x,data1[,3],data1[,4],pos=4,cex=2,font=4)
#legend('top',legend=c('SV','SNP','InDel'),col=col,pch=15:17,cex=2,bty="n",ncol=3,lty=1,lwd=2)
legend('topleft',legend=c('SV','SNP','InDel'),col=col,pch=15:17,cex=2,bty="n")
}
max1Mfst = function(data){
data.deal = data[,c('WEIR_AND_COCKERHAM_FST','x')]
data.deal$x = ceiling(data.deal$x/1000000)
melt.data = melt(data.deal,id="x")
melt.data.max = dcast(melt.data,x~variable,max)
melt.data.max$x = (melt.data.max$x-0.5)*1000000
melt.data.max[,2] = melt.data.max[,2]/max(melt.data.max[,2])*0.05
return(melt.data.max) ## x fst // 2col
}
indel=read.csv("indel.fst01.gene.xls",header = T,sep = "\t")
indel_02=indel[indel[,3]>0.2,]
indel_02_gene=unique(unlist(strsplit(indel_02$Gene.ensGene,",")))
snp=read.csv("snp.fst01.gene.xls",header = T,sep = "\t")
snp_02=snp[snp[,3]>0.2,]
snp_02_gene=unique(unlist(strsplit(snp_02$Gene.ensGene,",")))
sv=read.csv("SV.fst01.table.xls",header = T,sep = "\t")
sv_02=sv[sv[,1]>0.2,]
sv_02_gene=unique(unlist(strsplit(sv_02$genecode.Gene.ensGene,";")))
venn_3=Vennerable::Venn(list("Indel"=indel_02_gene,"SNP"=snp_02_gene,"SV"=sv_02_gene))
pdf("VEEN_sv_indel_snp_02.pdf",width = 10, height = 10)
plot(venn_3,doWeight=T)
dev.off()
All supplementary figure code can be found in pipeline.supplementary.plot.sh
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。