Han-Tibetan-ONT-SV.code

This repository includes data and scripts to analyze structural variations of 320 Chinese samples which includes 119 Tibetan samples and 201 Han samples using nanopore sequencing.

Cite Information

If you use the methods or pipeline in this repository, please cite our paper. Also, you can get detailed information about these methods in the supplementary method of our paper.

Pipeline for Multi-sample SV-calling and annotation

Script: pipeline.sv.calling.sh

Requirements

Summary

In the bash file pipeline.sv.calling.sh, we use sample data to demonstrate the complete process of detecting structural variations based on nanopore sequencing technology.

Firstly, The FASTQ files were aligned to the GRCh38 human reference genome available from NCBI, and a BAM format alignment file was produced using minimap2(Li 2021) with -t 30 --MD -Y -L -a -x map-ont parameter..
Then Variants were called using Sniffles(Sedlazeck et al. 2018) with -s 2 --ignore_sd -l 50 parameter, cuteSV(Jiang et al. 2020) (parameter: --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 -s 2 -l 50 --genotype) and nanovar(Tham et al. 2020) (parameter: -x ont -l 50 -s 10).
The VCF format files per sample were merged using SURVIVOR if an SV was supported by at least two callers or flagged by cuteSV as high-quality with a priority of cuteSV>sniffles>nanovar to generate the genotypes.
DbSVmerge was used to obtain the nonredundant SV set for all high-quality SVs from 320 samples. The merging strategy was as follows: the distance of the variant coordinates between any two SVs must be less than 1 kb; for deletion-, duplication- and inversion-type SVs, at least 40% of the region should overlap; and for insertion SVs, the difference in length between insertions should be less than twice the length of both insertions. The numbers and lengths of SVs of different types (insertions, duplications, deletions and inversions) were counted.
High-quality SVs with upstream and downstream genes were annotated in the segdup, rmsk, dgv, 1KGP, gnomAD(Collins et al. 2020), dbvar, DGV, Decipher, OMIM, and ANNOVAR databases.

Visualization for chracteristics of SVs

Script: pipeline.plot.sh

Examples:

Fig1 c

args = commandArgs(TRUE)
input = args[1]
output = args[2]
df = read.table(input,sep="\t",header=T)
EAS = df[,5]
AT = df[,2]
AL = df[,3]
HT = df[,4]
c1 = cor.test(EAS,AT);R1 = (c1$estimate)^2;p1=c1$p.value
c2 = cor.test(EAS,AL);R2 = (c2$estimate)^2;p2=c2$p.value
c3 = cor.test(EAS,HT);R3 = (c3$estimate)^2;p3=c3$p.value
p1 = ifelse(p1==0,"<2.2e-16",paste("=",p1))
p2 = ifelse(p2==0,"<2.2e-16",paste("=",p2))
p3 = ifelse(p3==0,"<2.2e-16",paste("=",p3))

col.raw = c("#00AFBB", "purple", "#FC4E07")
col = apply(col2rgb(col.raw)/255,2,function(x)rgb(x[1],x[2],x[3],0.8))
pdf(output,width=13,height=15)
mat = matrix(c(2,1,4,0,3,0),nc=2)
layout(mat=mat,width=c(10,2.8),height=c(2,10,3))

# main
par(mar=c(0.1,8,0.5,0.1),cex.lab=2.5,cex.axis=2.2,mgp=c(3.5,1,0),las=1,tck=-0.01,cex.main=2.5,font.lab=2,mgp=c(4,1,0))
plot(EAS,AT,type="n",ylab="Tibetan and/or Han AF",xlab="",ylim=c(0,1),xlim=c(0,1),xaxt="n",main="") # EAS AF
segments(0,0,1,1,lwd=1.2)
## AL
points(EAS,AL,cex=1.2,col=col[2],pch=18)

## HT
points(EAS,HT,cex=1.2,col=col[3],pch=20)

# ALL
points(EAS,AT,cex=1.2,col=col[1],pch=16)
abline(h=1.04)
axis(1)

#  up EAS
par(mar=c(0.1,8,3,0.1),mgp=c(3,0.5,0))
hist(EAS,col="grey60",border="grey60",xaxt="n",xlab="",ylab="",breaks=seq(0,1,by=0.025),main="",axes=F)
legend('center',legend="EAS",bty="n",cex=2.5)
a2=axis(2,tick=F,labels=F)
axis(2,a2,prettyNum(a2,","))

#axis(1)
## right AL-HT
par(mar=c(0.1,0.1,2,1),mgp=c(3,0.5,0))
h1 = hist(AT,breaks=seq(0,1,by=0.05),plot=F)
h2 = hist(AL,breaks=seq(0,1,by=0.05),plot=F)
h3 = hist(HT,breaks=seq(0,1,by=0.05),plot=F)
wid = 0.014

par(las=3)
plot(1,xlab="AL-HT",ylab="",yaxt="n",xaxt="n",col=col[1],type="n",axes=F,xlim=c(0,max(h1$counts,h2$counts,h3$counts)),ylim=c(0,1))

for(i in 1:(length(h1$breaks)-1)){
	rect(0,h1$breaks[i],h1$counts[i],h1$breaks[i]+wid,col=col.raw[1],border=NA)
	rect(0,h1$breaks[i]+wid,h2$counts[i],h1$breaks[i]+wid*2,col=col.raw[2],border=NA)
	rect(0,h1$breaks[i]+wid*2,h3$counts[i],h1$breaks[i]+wid*3,col=col.raw[3],border=NA)
}
legend('right',legend=c('Tibetan&Han','Tibetan','Han'),col=col.raw[1:3],pch=15,bty="n",cex=2.5)
a3=axis(3,tick=F,labels=F)
axis(3,a3,prettyNum(a3,","))
## bottom
par(mar=c(0.1,8,1,0.1),las=1,mgp=c(3,0.5,0))
plot(1,type="n",ylab="",xlab="",ylim=c(0,1),xlim=c(0,1),xaxt="n",axes=F)
legends <- vector('expression',3)
legends[1] <- substitute(expression(paste("EAS--Tibetan&Han: ",R^2 ," = ",rR1,", Pvalue ",p1)),list(rR1=round(R1,4),p1=p1))[2]
legends[2] <- substitute(expression(paste("EAS--Tibetan: ",R^2," = ", rR2,", Pvalue ",p2)),list(rR2=round(R2,4),p2=p2))[2]
legends[3] <- substitute(expression(paste("EAS--Han: ",R^2 ," = ", rR3,", Pvalue ",p3)),list(rR3=round(R3,4),p3=p3))[2]
legend('left',legend=legends,bty="n",col=col.raw,pch=c(16,18,20),cex=2.2)
dev.off()

Fig1c

Fig 1d

ref=read.table("windows.bed.1K.gc.txt",header=F,sep="\t")
ref[,3] = ref[,3]-1  ## end
#ref[,1] = paste0('chr',ref[,1])  ## chrom
colnames(ref) = c('chr','start','end','gc')
ref = ref[,-3]
AL.DEL = read.table("splide/DEL.splide.counts.xls",header=T,sep="\t")
AL.DEL = AL.DEL[,-3]
mer = merge(ref,AL.DEL,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'AL-DEL'

AL.INS = read.table("splide/INS.splide.counts.xls",header=T,sep="\t")
AL.INS = AL.INS[,-3]
mer = merge(mer,AL.INS,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'AL-INS'

EAS.DEL = read.table("EAS/DEL.splide.counts.xls",header=T,sep="\t")
EAS.DEL = EAS.DEL[,-3]
mer = merge(mer,EAS.DEL,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'EAS-DEL'

EAS.INS = read.table("EAS/INS.splide.counts.xls",header=T,sep="\t")
EAS.INS = EAS.INS[,-3]
mer = merge(mer,EAS.INS,by=c("chr",'start'),all=T)
colnames(mer)[ncol(mer)] = 'EAS-INS'

mer[is.na(mer)] = 0

out = mer[,-c(1:2)]
class = cut(out$gc,breaks=seq(0,1,by=0.01))
out$gc = as.character(class)
library(reshape2)
mel = melt(out,id="gc")
mel[is.na(mel)]="(0,0.01]"
dca=dcast(mel,gc~variable,sum)   ## col=5

dca$ratio <- apply(dca[,2:3],1,sum)/apply(dca[,2:5],1,sum)
dca[is.na(dca)]=0.0

nr = nrow(dca)
rect.width = 0.25

Col2rgb=function(a){
apply(col2rgb(a)/255,2,function(x)rgb(x[1],x[2],x[3],0.7))
}

#col = Col2rgb(c("darkred","darkgreen","#00AFBB","#E7B800"))
col = Col2rgb(c("#E64B35","#F39B7F","#4DBBD5","#00A087"))
pdf('DEL--INS.splide.gc.distribution.line.num.pdf')
plot(1,type="n",xlab="GC content",ylab="The number of SV",xlim=c(0,nrow(dca)+1),ylim=c(0,max(dca[,2])+100),bty="l",xaxt="n",main="")

#polygon(x=c(1,1:nr,nr),y=c(0,dca[,2],0),border="grey80",col=Col2rgb("darkred"))
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,3],0),border="grey80",col=Col2rgb("darkgreen"))
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,4],0),border="grey80",col=Col2rgb("#00AFBB"))  ## c("#00AFBB", "#E7B800", "#FC4E07")
#polygon(x=c(1,1:nr,nr),y=c(0,dca[,5],0),border="grey80",col=Col2rgb("#E7B800"))

polygon(x=c(1,1:nr,nr),y=c(0,dca[,2],0),border="grey80",col=Col2rgb("#E64B35"))
polygon(x=c(1,1:nr,nr),y=c(0,dca[,3],0),border="grey80",col=Col2rgb("#F39B7F"))
polygon(x=c(1,1:nr,nr),y=c(0,dca[,4],0),border="grey80",col=Col2rgb("#4DBBD5"))  ## c("#00AFBB", "#E7B800", "#FC4E07")
polygon(x=c(1,1:nr,nr),y=c(0,dca[,5],0),border="grey80",col=Col2rgb("#00A087"))
v1 = c(0,max(dca[,2])+100)
lines(rep(55,length(v1)),v1,col="black")
lines(rep(25, length(v1)), v1, col="black")
#axis(1,1:nr,gsub('\\(','',sapply(strsplit(dca[,1],","),'[',1)))
axis(1,seq(0,nr,10),c('0%','10%','20%','30%','40%','50%','60%','70%','80%'))
legend("topleft",legend=c('DEL (Tibetan & Han)','INS (Tibetan & Han)','DEL (EAS)','INS (EAS)'),col=col,bty="n",pch=15,cex=1.2)
dev.off()

sv_sum = c(sum(dca[,5]), sum(dca[,4]), sum(dca[,3]), sum(dca[,2]))
sv_sum_30 = c(sum(dca[1:25,5]),sum(dca[1:25,4]),sum(dca[1:25,3]),sum(dca[1:25,2]))
sv_sum_55 = c(sum(dca[55:nr,5]),sum(dca[55:nr,4]),sum(dca[55:nr,3]),sum(dca[55:nr,2]))
sv_ratio_30 = sv_sum_30/sv_sum
sv_ratio_55 = sv_sum_55/sv_sum

pdf('bar.pdf')
col2 = Col2rgb(c("#00A087", "#4DBBD5", "#F39B7F", "#E64B35"))
plot(1,type="n",xlab="",ylab="",xlim=c(0,4),ylim=c(0,0.03),bty="l",xaxt="n", yaxt="n", main="25% GC content")
rect(0.5:3.5-rect.width,0,0.5:3.5+rect.width,sv_ratio_30,col=col2,border=NA)
#axis(1,0.5:3.5, c(1,2,3,4))
#axis(2, 0:0.03)
dev.off()

pdf('bar2.pdf')
col2 = Col2rgb(c("#00A087", "#4DBBD5","#F39B7F", "#E64B35"))
plot(1,type="n",xlab="",ylab="",xlim=c(0,4),ylim=c(0,0.3),bty="l",xaxt="n", yaxt="n", main="55% GC content")
rect(0.5:3.5-rect.width,0,0.5:3.5+rect.width,sv_ratio_55,col=col2,border=NA)
#axis(1,0.5:3.5, c(1,2,3,4))
#axis(2, 0:0.2)
dev.off()

dca[,-1] = apply(dca[,-1],2,function(x)(x-mean(x))/sd(x))
pdf('DEL--INS.splide.gc.distribution.line.scale.pdf')
plot(1,type="n",xlab="",ylab="sv number(scale)",xlim=c(0,nrow(dca)+1),ylim=c(min(apply(dca[,2:3],1,sum)),max(apply(dca[,2:3],1,sum))),bty="l",xaxt="n")
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,2],min(dca[,2:5])),border="grey80",col=Col2rgb("darkred"))
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,3],min(dca[,2:5])),border="grey80",col=Col2rgb("darkgreen"))
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,4],min(dca[,2:5])),border="grey80",col=Col2rgb("#00AFBB"))  ## c("#00AFBB", "#E7B800", "#FC4E07")
polygon(x=c(1,1:nr,nr),y=c(min(dca[,2:5]),dca[,5],min(dca[,2:5])),border="grey80",col=Col2rgb("#E7B800"))

axis(1,1:nr,gsub('\\(','',sapply(strsplit(dca[,1],","),'[',1)))
legend("topleft",legend=c('Tibetan-Han_DEL','Tibetan-Han_INS','EAS-DEL','EAS-INS'),col=col,bty="n",pch=15,cex=1.5)
dev.off()

Fig1D

Fig 1f

library(VennDiagram)
library(data.table)
args = commandArgs(TRUE)
mergefile = args[1]

df = fread(mergefile,sep="\t",header=F)
df = as.data.frame(df)
df = df[df[,7]>=50,,drop=F]
tab = table(df[,1])
oneID = names(tab[tab==1])

term = unique(df[,8])
for (i in term){
	if (grepl('Ebert32',i))
	{
		Ebert32 = unique(df[df[,8]==i,1])
	}
	else if (grepl('Han405',i))
	{
		Han405 = unique(df[df[,8]==i,1])
	}
	else if (grepl('Audano15',i))
	{
		Audano15 = unique(df[df[,8]==i,1])
	}
	else if (grepl('Icelander3622',i))
	{
		Icelander3622 = unique(df[df[,8]==i,1])
	}
	else
	{	
		TH = unique(df[df[,8]==i,1])
	}
}

allother=unique(c(Ebert32,Han405,Audano15,Icelander3622))
x = list("Tibetan-Han"=TH,"Ebert32,Han405,Audano15,Icelander3622"=allother)
p = venn.diagram(x=x,col="transparent", fill=c('cornflowerblue','yellow'), alpha=0.6, cex=1, cat.cex=1,label.col="black",fontfamily = "serif", fontface = "bold",filename=NULL,output=TRUE)
pdf("figure.S1F.TGS.v2.pdf")
grid.draw(p)
dev.off()

Fig1f

Fig 2a

library(ggplot2)
library(dplyr)

# Create Data
data <- data.frame(
  group=c("Repeat Region","Nonrepeat"),
  value=c(8134,1866)
)

# Compute the position of labels
data <- data %>% 
  arrange(desc(group)) %>%
  mutate(prop = value / sum(data$value) *100) %>%
  mutate(ypos = cumsum(prop)- 0.5*prop )

# Basic piechart
#pie color c("#00AFBB", "#E7B800")
pie=ggplot(data, aes(x="", y=prop, fill=group)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) +
  theme_void() + 
  theme(legend.position="none") +
  
  geom_text(aes(y = ypos, label = prop), color = "white", size=5) +
  scale_fill_brewer(palette = "Set3")


pdf("pie-new.pdf",width = 4, height = 4)
print(pie)
dev.off()

Fig2a-pie

condition <- c("SINE","Simple Repeat","LINE" ,"LTR", "Other","Satellite","Low complexity","Seg Dup","Nonrepeat")
value <- c(0.1459,0.212,0.0836,0.0449,0.0492,0.0446,0.0171,0.0696,0.3331)

data <- data.frame(condition,value)
data$condition <- factor(data$condition, levels=condition, ordered=TRUE)
library(ggplot2)
library(scales)
library(ggsci)
library(ggplot2)
mypal=pal_npg("nrc",alpha=1)(9)
p=ggplot(data,aes(data$condition,data$value,fill=data$condition))+
  geom_bar(stat="identity",fill=mypal,width = 0.7)+
  scale_y_continuous(expand = c(0,0),limits=c(0,1.1*max(data$value)))+
  xlab("Genomic elements")+
  ylab("Frequency")+
  theme_bw()+
  theme(panel.grid.major =element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "none",
        axis.line = element_line(colour = "black"),
        axis.text.x=element_text(angle=0,hjust = 1,size = 10),
        axis.text.y = element_text(size = 10),
        plot.title=element_text(hjust=0.7),
        axis.title.y = element_text(size = 15))

pdf("barplot-new.pdf",width =10,height = 6)
print(p)
dev.off()

Fig2a-barplot

Fig 2b

args =commandArgs(TRUE)
file = args[1]
outpdf = args[2]
#file = "data.input.nogap.xls"
library(circlize)
data = read.table(file,sep="\t",header=T)
main.data = data[,5:8]
#main.data = apply(main.data,2,function(x)(x-mean(x))/sd(x))

## hg19
plotcytoBand = function(chrom,bm=1000000,start,d=0.15,D=0.5,cex=1.6,par=T,y1,y2){
        #file = system.file(package = "circlize","extdata", "cytoBand.txt")
	file = "cytoBand.txt"
        options(stringsAsFactors=F)
        cyto = read.table(file,header=F,sep="\t")
        colnames(cyto)<-c("chr","start","end","name","type")
        #cyto$start = cyto$start / bm
        #cyto$end = cyto$end / bm
        cyto$Color = "white"  # white
        cyto[cyto$type=="gneg",]$Color<-rgb(255,255,255, maxColorValue=255)
        cyto[cyto$type=="gpos25",]$Color<-rgb(200,200,200, maxColorValue=255)
        cyto[cyto$type=="gpos50",]$Color<-rgb(200,200,200, maxColorValue=255)
        cyto[cyto$type=="gpos75",]$Color<-rgb(130,130,130, maxColorValue=255)
        cyto[cyto$type=="gpos100",]$Color<-rgb(200,200,200, maxColorValue=255)
        cyto[cyto$type=="acen",]$Color<-"yellow" # red centromere 
        cyto[cyto$type=="stalk",]$Color<-rgb(100,127,164, maxColorValue=255) # repeat regions 
        cyto[cyto$type=="gvar",]$Color<-rgb(220,220,220, maxColorValue=255) # indented region
	
	cyto$chr = gsub('chr','',cyto$chr)
	cyto = cyto[cyto$chr==as.character(chrom),,drop=F]
	bei = max(cyto$end)
	cyto$start = cyto$start/bei*bm+start
	cyto$end = cyto$end/bei*bm+start

	acen.w = which(cyto$type=="acen")
	cyto.up = cyto[1:(acen.w[1]-1),,drop=F]
	cyto.acen1 = cyto[acen.w[1],,drop=F]
	cyto.acen2 = cyto[acen.w[2],,drop=F]
	cyto.down = cyto[(acen.w[2]+1):nrow(cyto),,drop=F]
	rect(min(cyto.up$start),y1,max(cyto.up$end),y2,col=NA,border="black",lwd=2)
	rect(cyto.up$start,y1,cyto.up$end,y2,col=cyto.up$Color,border=NA)
	
	rect(min(cyto.down$start),y1,max(cyto.down$end),y2,col=NA,border="black",lwd=2)
	rect(cyto.down$start,y1,cyto.down$end,y2,col=cyto.down$Color,border=NA)

	polygon(x=c(cyto.acen1$start[1],cyto.acen1$end[1],cyto.acen1$start[1],cyto.acen1$start[1]),y=c(y1,(y1+y2)/2,y2,y1),col=2,border=2)
	polygon(x=c(cyto.acen2$end[1],cyto.acen2$start[1],cyto.acen2$end[1],cyto.acen2$end[1]),y=c(y1,(y1+y2)/2,y2,y1),col=2,border=2)

}

rmsk.max = max(data[,5])
SINE.genome =  data[,5]/rmsk.max
LINE.genome = data[,6]/rmsk.max
LTR.genome = data[,7]/rmsk.max
SD.genome = data[,8]/rmsk.max
SD.genome[SD.genome>1]=1

chromlen = table(data[,1])
fenmu = min(max(data[,9]),max(data[,10]),max(data[,11]),max(data[,12]))
sine.sv = data[,9]/fenmu
sine.sv[sine.sv>1]=1
line.sv = data[,10]/fenmu
line.sv[line.sv>1]=1
ltr.sv = data[,11]/fenmu
ltr.sv[ltr.sv>1]=1
sd.sv=data[,12]/fenmu
sd.sv[sd.sv>1]=1
allsv = data[,4]/fenmu
allsv[allsv>1]=1
gene.density = data[,13]/max(data[,13])

svtype.data = data[,14:18]
svtype.data = svtype.data/fenmu
del.data = ifelse(svtype.data[,1]>1,1,svtype.data[,1])
dup.data = ifelse(svtype.data[,2]>1,1,svtype.data[,2])
ins.data = ifelse(svtype.data[,3]>1,1,svtype.data[,3])
inv.data = ifelse(svtype.data[,4]>1,1,svtype.data[,4])
tra.data = ifelse(svtype.data[,5]>1,1,svtype.data[,5])

els.data = data[,19:20]
els.data[,1] = els.data[,1]/max(els.data[,1])
els.data[,2] = els.data[,2]/max(els.data[,2])
hicdamin.data = data[,21:23]
hicdamin.data[,1] = hicdamin.data[,1]/max(hicdamin.data[,1])
hicdamin.data[,2] = hicdamin.data[,2]/max(hicdamin.data[,2])
hicdamin.data[,3] = hicdamin.data[,3]/max(hicdamin.data[,3])
##heatmap.col = c(rgb(31/255,49/255,94/255),rgb(34/255,103/255,179/255),rgb(63/255,148/255,198/255),rgb(148/255,197/255,224/255),rgb(212/255,229/255,245/255),rgb(251/255,250/255,251/255),rgb(252/255,220/255,200/255),rgb(247/255,165/255,130/255),rgb(219/255,95/255,79/255),rgb(180/255,30/255,48/255),rgb(106/255,9/255,36/255))
col.breaks = c(min(main.data),-2,-1.5,-1,-0.5,0,0.5,1,1.5,2,max(main.data))
"11"="#ada397","12"='#66c2a5',"13"='#fc8d62',"14"='#8da0cb',"15"='#e78ac3',"16"='#a6d854',"17"='#ffd92f',"18"='#e5c494',"19"='#b3b3b3',"20"="#20854E","21"="#7876B1","22"="#6F99AD","23"="#FFDC91","24"="#EE4C97")
chrom_col=c("1"="#a2b9bc","2"="#b2ad7f","3"="#878f99","4"="#6b5b95","5"="#feb236","6"="#d64161","7"="#ff7b25","8"="#d6cbd3","9"="#eca1a6","10"="#bdcebe","11"="#ada397","12"='#66c2a5',"13"='#fc8d62',"14"='#8da0cb',"15"='#e78ac3',"16"='#a6d854',"17"='#ffd92f',"18"='#e5c494',"19"='#b3b3b3',"20"="#20854E","21"="#7876B1","22"="#6F99AD","23"="#FFDC91","24"="#EE4C97")

pdf(outpdf,width=30,height=20)
layout(mat=matrix(c(1,2,3),nc=1),height=c(1.5,9,2))

par(mar=c(1,2,5,35)) # 1,8
plot(1,type="n",xlim=c(0.5,nrow(main.data)+10.5),ylim=c(0,2),xlab="",ylab="",axes=F,xaxs="i")
#chr.uniq = unique(data[,1])
start = 0.5
start.index = 1
k = 22
chrom.seq = c()
bgdata = matrix(0,nc=2,nr=11)
for(i in 1:k){
	cluster.num = i
	nx = sum(data[,1]==cluster.num)
	chrom.seq = c(chrom.seq,start+nx)
	#rect(start,0.1,start+nx,1.9,col=chrom_col[i],border=NA)
	if(i%%2==1) {
		bgdata[(i+1)/2,1] = start
		bgdata[(i+1)/2,2] = start+nx
		rect(start,1.1,start+nx,1.9,col='grey80',border=NA)
		text((start+start+nx)/2,1.5,cluster.num,col="black",cex=3)
	}else{
		rect(start,1.1,start+nx,1.9,col='grey30',border=NA)
		text((start+start+nx)/2,1.5,cluster.num,col="white",cex=3)
	}
	
	plotcytoBand(chrom=i,bm=nx,start,d=0.15,D=0.5,cex=1.6,par=T,y1=0.1,y2=0.9)

	start = start+nx
	start.index = start+0.5
}
require("RColorBrewer")
line.col = colorRampPalette(brewer.pal(9, "Paired"))(8)
line.col[1] = 'purple'
line.col[4] = 'Cyan1'
par(mar=c(8,2,0,35),cex.axis=5,las=1,mgp=c(0.5,0.1,0),cex.lab=5,font.lab=2)
y2 = 7+4+2+3+4
plot(1,type="n",xlim=c(0.5,nrow(main.data)+13.5),ylim=c(-0.5,y2),ylab="",axes=F,xlab="",xaxs="i",yaxs="i")
rect(bgdata[,1],-0.5,bgdata[,2],y2,col=rgb(204/255,204/255,204/255,0.6),border=NA)
lines(1:nrow(data),y2-1+gene.density,col = line.col[1])
lines(1:nrow(data),y2-2+SD.genome,col = line.col[2])
lines(1:nrow(data),y2-3+LTR.genome,col = line.col[2])
lines(1:nrow(data),y2-4+LINE.genome,col = line.col[2])
lines(1:nrow(data),y2-5+SINE.genome,col = line.col[2])
lines(1:nrow(data),y2-6+sd.sv,col = line.col[3])
lines(1:nrow(data),y2-7+ltr.sv,col = line.col[3])
lines(1:nrow(data),y2-8+line.sv,col = line.col[3])
lines(1:nrow(data),y2-9+sine.sv,col = line.col[3])
lines(1:nrow(data),y2-10+allsv,col = line.col[4])
lines(1:nrow(data),y2-11+del.data,col = line.col[4]) # DEL
lines(1:nrow(data),y2-12+dup.data,col = line.col[4]) # DUP
lines(1:nrow(data),y2-13+ins.data,col = line.col[4]) # INS
lines(1:nrow(data),y2-14+inv.data,col = line.col[4]) # INV
lines(1:nrow(data),y2-15+tra.data,col = line.col[4]) # TRA
lines(1:nrow(data),y2-16+els.data[,1],col = line.col[5])
lines(1:nrow(data),y2-17+els.data[,2],col = line.col[6])
lines(1:nrow(data),y2-18+hicdamin.data[,1],col = line.col[7])
lines(1:nrow(data),y2-19+hicdamin.data[,2],col = line.col[8])
lines(1:nrow(data),y2-20+hicdamin.data[,3],col = line.col[8])
#chrom.seq = chrom.seq[-length(chrom.seq)]
#abline(v=chrom.seq,col="black",lwd=0.1)
par(xpd=T)
axis(4,at=(y2-1):0+0.5,c('Gene Density','SD(genome)','LTR(genome)','LINE(genome)','SINE(genome)','SD(SV)','LTR(SV)','LINE(SV)','SINE(SV)','ALL SV Density','DEL Density','DUP Density','INS Density','INV Density','TRA Density','Enhancer','Promoter','HiC(boundary)','LAMIN(boundary)','LAMIN(domains)'),tick=F)

par(mar=c(0,1,2.5,35))
plot(1,type="n",axes=F,xlab="",ylab="")
legend('top',legend=c('Gene Density','Repeat Coverage(genome)','Repeat (SV Density)','SV Density','Enhancer','Promoter','HiC-boundary','LAMIN'),col=line.col,bty="n",ncol=4,lty=1,cex=3,lwd=4)
dev.off()

Fig 2b

Fig 2c

library(corrplot)
library(psych)
library(RColorBrewer)
args = commandArgs(TRUE)
input = args[1]
outpdf = args[2]
df = read.table(input,header=T,check.names=F)
df = df[c(13,8,7,6,5,12,11,10,9,4,14,15,16,17,18,19,20,21,22,23)]
colnames(df) = c('Gene Density','SD(genome)','LTR(genome)','LINE(genome)','SINE(genome)','SD(SV)','LTR(SV)','LINE(SV)','SINE(SV)','ALL SV Density','DEL Density','DUP Density','INS Density','INV Density','TRA Density','Enhancer','Promoter','HiC(boundary)','LAMIN(boundary)','LAMIN(domains)')
cor = cor(df)
cor_p = corr.p(cor,n=nrow(df),alpha=.05)
#cor[lower.tri(cor_p$p)] = cor_p$p[lower.tri(cor_p$p)]
#write.table(cor,"TE-SV.corAndP.spearman.xls",col.names=T,row.names=T,quote=F,sep="\t")
res1 = cor.mtest(df,conf.level=.95)
pdf(outpdf,width=15,height=15)
#corrplot.mixed(cor,p.mat=res1$p,lower = "number",upper = "circle",
#               tl.pos = "lt",lower.col = "black",insig='label_sig',sig.level=c(.001,.01,.05),pch.cex=0.6)
diag(cor)=1
diag(res1$p)=1
col1 = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
col = colorRampPalette(c(rgb(0,0,0.8),rgb(0.8,0,0)))(100)

corrplot(cor, p.mat=res1$p,method = "circle", type = "upper", tl.pos = "lt",insig='label_sig',sig.level=c(.001,.01,.05),pch.cex=0.6,diag=TRUE,col=col,pch.col="white")
corrplot(cor, add = TRUE, type = "lower", method = "number", diag = FALSE, tl.pos = "n", cl.pos = "n",col=col)
dev.off()

Fig 2c

Figure 3a

args = commandArgs(TRUE)
input = args[1]
#input = "ALL.pca.info.xls"
outpdf = args[2]
library(data.table)
data = fread(input,sep="\t",header=T)
data = as.data.frame(data)
group_col = c("#009CFF", "#FF5500", "#763400","#9E819B","#D20000", "#FFC600","#54FFAA")
#group_col = apply(col2rgb(group_col)/255,2,function(x)rgb(x[1],x[2],x[3],0.6))
#Group = data$Group
data$Group = as.factor(data$Group)
group.levels = levels(data$Group)
sample.col = group_col[as.numeric(data$Group)]
names(group_col) = group.levels
draw.data = data[,-c(1:2)]

pc.rowdata = fread("usandgenotypes.1000genomeintersect.noXY.matrix.filter",header=T,sep="\t")
pc.rowdata = as.data.frame(pc.rowdata)
pc.rowdata = pc.rowdata[,-1]
MAF = apply(pc.rowdata,1,function(x)min(table(x)/length(x)))
pc.rowdata = pc.rowdata[MAF>1/ncol(pc.rowdata),]
deig=eigen(cor(scale(pc.rowdata)))
k = sum(deig$values)
pc1_rate=sprintf("%2.2f%%",deig$values[1]/k*100)
pc2_rate=sprintf("%2.2f%%",deig$values[2]/k*100)
pc3_rate=sprintf("%2.2f%%",deig$values[3]/k*100)
pc4_rate=sprintf("%2.2f%%",deig$values[4]/k*100)
pc5_rate=sprintf("%2.2f%%",deig$values[5]/k*100)
pc6_rate=sprintf("%2.2f%%",deig$values[6]/k*100)
lab1 = paste("PC1,",pc1_rate)
lab2 = paste("PC2,",pc2_rate)
lab3 = paste("PC3,",pc3_rate)
lab4 = paste("PC4,",pc4_rate)
lab5 = paste("PC5,",pc5_rate)
lab6 = paste("PC6,",pc6_rate)
PClabel = c(lab1,lab2,lab3,lab4,lab5,lab6)

plot1 =function(x,group){
	group.data = split(x,group)
	grouphistinfo = lapply(group.data,hist,plot=F,breaks=10)
	y.max = max(unlist(lapply(grouphistinfo,function(x)max(x$density))))
	plot(1,xlim=c(min(x),max(x)),ylim=c(0,y.max),type="n",xlab="",ylab="",xaxt="n",yaxt="n")
	grid()
	lapply(1:length(grouphistinfo),function(x){
		groupi = names(grouphistinfo)[x]
		grouphistinfoi = grouphistinfo[[x]]
		polygon(x=c(grouphistinfoi$breaks,grouphistinfoi$breaks[length(grouphistinfoi$breaks)]),
			y=c(0,grouphistinfoi$density,0),col = group_col[groupi],border=NA)
	})
}

plot2 = function(x,y,col){
	plot(x,y,xaxt="n",yaxt="n",xlab="",ylab="",type="n")
	grid()
	points(x,y,col=col,pch=16,cex=0.9)
}

main.plot = function(mar=mar){
	for(i in 1:6){
		for(j in i:6){
			par(mar=mar)
		if(i==j){
			plot1(draw.data[,i],group=data$Group)
		}else{
			plot2(x=draw.data[,i],y=draw.data[,j],col=sample.col)
		}
	}
}
}
#outpdf = "g1000.PC1-PC6.pdf"
png(outpdf,width=15,height=15,units="cm",res=400)
##pdf(outpdf,width=15,height=15)
#mat = matrix(c(22,1:6,23,0,7:11,24,0,0,12:15,25,0,0,0,16:18,26,0,0,0,0,19:20,27,0,0,0,0,0,21,0,28,29,30,31,32,33),nc=7)
mat = matrix(c(22,1:6,23,0,7:11,24,0,0,12:15,25,0,0,0,16:18,26,34,34,34,0,19:20,27,34,34,34,0,0,21,0,28,29,30,31,32,33),nc=7)
layout(mat=mat,height = c(2,5,5,5,5,5,5),width=c(5,5,5,5,5,5,2))
# main
mar = c(0.5,0.5,0.5,0.5)
main.plot(mar=mar)
# col pc1 ---pc6  label
for(i in 1:6){
	par(mar=mar)
	plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
	rect(0,0,1,1,col="grey80",border="grey80")
	text(0.5,0.5,PClabel[i],cex=1)
}
# row PC1--PC6 label
for(i in 1:6){
        par(mar=mar,srt=90)
        plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
        rect(0,0,1,1,col="grey80",border="grey80")
        text(0.5,0.5,PClabel[i],cex=1)
}
par(srt=0)
# legend 
plot(1,xlim=c(0,1),ylim=c(0,1),type="n",axes=F,xlab="",ylab="",xaxt="n",yaxt="n")
legend('center',legend=group.levels,col=group_col,bty="n",pch=16,cex=2.5)

dev.off()

Fig 3a

Fig 3c

readvcf <- function(file){
	library(data.table)
	df <- fread(file)
	df <- as.data.frame(df)
	df <- df[,-c(3:9)]
	colnames(df)[1]="CHROM"
	colnames(df)=gsub('(.*).ngmlr.*','\\1',colnames(df))
	df[,-(1:2)] = apply(df[,-(1:2)],1:2, gsub ,pattern=':.*',replacement='')
	dfdata <- df[,-(1:2)]
	dfdata <- ifelse(dfdata=="1/1"|dfdata=="0/1",1,0)
	data = data.frame(df[,1:2],dfdata,check.names=F)
	return(data)
}
library(data.table)

vcfFile = "Final.dbsvmerge.Tibetan-Han.matrix.xls"
samplelist="samplelist"
outtre="Tibetan_Han.tre"
vcf = fread(vcfFile,header=T)
vcf = as.data.frame(vcf)
data = vcf[,-1]
MAF = apply(data,1,sum)
data = data[MAF>1,]
#  "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
#  "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" 

#  "euclidean", "maximum", "manhattan", "canberra", "binary", "pearson", "abspearson", "correlation", "abscorrelation", "spearman" or "kendall".
hc = hclust(amap::Dist(t(data),method = "euclidean", nbproc=15), method = "ward.D")

merge = hc$merge
labels = hc$labels
n = nrow(merge)
list = list()
for(i in 1:n){
	if(merge[i,1]<0&merge[i,2]<0)list[[i]]=paste("(",labels[-1*merge[i,1]],",",labels[-1*merge[i,2]],")",sep="");
	if(merge[i,1]<0&merge[i,2]>0)list[[i]]=paste("(",labels[-1*merge[i,1]],",",list[[merge[i,2]]],")",sep="");
	if(merge[i,1]>0&merge[i,2]<0)list[[i]]=paste("(",list[[merge[i,1]]],",",labels[-1*merge[i,2]],")",sep="");
	if(merge[i,1]>0&merge[i,2]>0)list[[i]]=paste("(",list[[merge[i,1]]],",",list[[merge[i,2]]],")",sep="")
}
tre = paste(list[[n]],";",sep="")
write.table(tre,outtre,col.names=F,row.names=F,quote=F)

options(stringsAsFactors=F)
library(ggtree)
pdf(paste0(outtre, date(),'.pdf'),width=10,height=10)
tree<-read.tree(outtre)
samplegroup = read.table(samplelist,sep="\t",header=F)
samplegroup = samplegroup[match(tree$tip.label,samplegroup[,1]),]
group = samplegroup[,2]
cls = split(samplegroup[,1],group)
tree = groupOTU(tree,cls)
print(tree$group)
ggtree(tree,layout="fan",ladderize=T,branch.length="none",aes(color=group))+
    geom_tiplab(aes(angle=angle),size=2)+theme(legend.position='right')
dev.off()

Fig3c

Figure 4a

library(data.table)
library(reshape2)
args = commandArgs(TRUE)
outpdf = args[1]

chromfile = "input/chrom.len"
svfile = "input/sv.fst.fordraw.xls"
snpfile = "input/snp.fst.fordraw.01.xls"
indelfile = "input/indel.fst.fordraw.01.xls"
genegfffile = "input/hg38.gtf.bed.gz"
genegff = gettextf("gzip -dc %s",genegfffile)
posfile = "pos.txt"

options(stringsAsFactors=F)
chrom.data = read.table(chromfile,header=F,sep="\t")
data1 = fread(svfile,header=T,sep="\t")
data1 = as.data.frame(data1)
data2 = fread(snpfile,header=T,sep="\t")
data2 = as.data.frame(data2)
data3 = fread(indelfile,header=T,sep="\t")
data3 = as.data.frame(data3)
gene.data = fread(genegff,header=F,sep="\t")
gene.data = as.data.frame(gene.data)
gene.data[,1] = gsub('chr','',gene.data[,1])

pos.data = read.table(posfile,header=F,sep="\t",fill=F)

var.color =c("#E64B35","#00A087","#3C5488")
pch = 15:17


if(grepl('.png$',outpdf)) png(outpdf,width=30,height=20, units = "in",res=300)
if(grepl('.pdf$',outpdf)) pdf(outpdf,width=30,height=20)


# up 

par(mar=c(3,5,3,4))
up.manhattan(chrom.data,sv.data=data1,snp.data=data2,indel.data=data3,col=var.color,pos.data=pos.data)

dev.off()
library(data.table)
library(reshape2)
#output = "Manhattan.VAR.png"

up.manhattan = function(chrom.data,sv.data,snp.data,indel.data,col,pos.data){
	# chrom.data 
	th = 0.1
	chrom.len = chrom.data[,2]
	chrom.label = chrom.data[,1]
	x.cumsum = cumsum(as.numeric(chrom.len))
	x.axis = x.cumsum-chrom.len/2
	x.label = chrom.label
	data1 = sv.data
	data2 = snp.data
	data3 = indel.data
	# for line 1M one points
	data1.max = max1Mfst(data1)
	data2.max = max1Mfst(data2)
	data3.max = max1Mfst(data3)
	pos.data = pos.data[,1:3]
	for(k in 1:nrow(pos.data)){
		match.chr = match(pos.data[k,1],chrom.label)
		addnum = ifelse(match.chr==1,0,x.cumsum[match.chr-1])
		pos.data[k,2] = pos.data[k,2]+addnum
		pos.data[k,3] = pos.data[k,3]+addnum
	}
	
	# draw
	#col = c("#E64B35","#00A087","#3C5488")
	par(las=1,cex.lab=2,cex.main=4,cex.axis=1.5,mgp=c(2,1,0),lab=c(5,3,0),mar=c(3,12,0.5,0.5))
	cex.p = 1
	plot(1,xlab="",ylab="Fst",xaxt="n",yaxt="n",ylim=c(-0.2,0.9),type="n",xlim=c(0,max(x.cumsum)))
	rect(0,0,max(x.cumsum),th,col="grey90",border="grey90")
	rect(pos.data[,2],-0.2,pos.data[,3],0.9,col="#FFFF00B2",border="#FFFF00B2")
#	rect(0,0,max(x.cumsum),th,col="grey90",border="grey90")
	# 
	segments(x.cumsum,0,x.cumsum,0.01,col="white",lty=1,lwd=1)
	
	points(data2$x,data2[,3],cex=cex.p,col = data2$color,pch=data2$pch)
	points(data3$x,data3[,3],cex=cex.p,col = data3$color,pch=data3$pch)
	points(data1$x,data1[,3],cex=cex.p+0.1,col = data1$color,pch=data1$pch)
	abline(h=th,lty=1,col="red")
	lines(data1.max$x,data1.max[,2]-0.2,col=col[1],lwd=2)
	lines(data2.max$x,data2.max[,2]-0.15,col=col[2],lwd=2)
	lines(data3.max$x,data3.max[,2]-0.1,col=col[3],lwd=2)
	abline(h=c(-0.1,-0.15,-0.2),lty=1,col="grey60")
	axis(1,x.axis,x.label,tick=F)
	segments(x.cumsum,-0.3,x.cumsum,-0.23,col="black",lty=1,lwd=2)
	segments(0,-0.3,0,-0.23,col="black",lty=1,lwd=2)
	# new line
#	segments(x.cumsum,-0.2,x.cumsum,0.01,col="white",lty=1,lwd=1)
	axis(2,th,th,lwd=2)
	axis(2,at=c(0,0.2,0.4,0.6,0.8))
	axis(2,0,0)
	axis(2,0.8,0.8)
	# gene name
#	data1 = data1[data1[,4]!="False",,drop=F]
#	text(data1$x,data1[,3],data1[,4],pos=4,cex=2,font=4)
	#legend('top',legend=c('SV','SNP','InDel'),col=col,pch=15:17,cex=2,bty="n",ncol=3,lty=1,lwd=2)
	legend('topleft',legend=c('SV','SNP','InDel'),col=col,pch=15:17,cex=2,bty="n")
}

max1Mfst = function(data){
	data.deal = data[,c('WEIR_AND_COCKERHAM_FST','x')]
	data.deal$x = ceiling(data.deal$x/1000000)
	melt.data = melt(data.deal,id="x")
	melt.data.max = dcast(melt.data,x~variable,max)
	melt.data.max$x = (melt.data.max$x-0.5)*1000000
	melt.data.max[,2] = melt.data.max[,2]/max(melt.data.max[,2])*0.05
	return(melt.data.max)  ## x fst // 2col
}

Fig4a

Fig 5c

indel=read.csv("indel.fst01.gene.xls",header = T,sep = "\t")
indel_02=indel[indel[,3]>0.2,]
indel_02_gene=unique(unlist(strsplit(indel_02$Gene.ensGene,",")))
snp=read.csv("snp.fst01.gene.xls",header = T,sep = "\t")
snp_02=snp[snp[,3]>0.2,]
snp_02_gene=unique(unlist(strsplit(snp_02$Gene.ensGene,",")))
sv=read.csv("SV.fst01.table.xls",header = T,sep = "\t")               
sv_02=sv[sv[,1]>0.2,]
sv_02_gene=unique(unlist(strsplit(sv_02$genecode.Gene.ensGene,";")))

venn_3=Vennerable::Venn(list("Indel"=indel_02_gene,"SNP"=snp_02_gene,"SV"=sv_02_gene))
pdf("VEEN_sv_indel_snp_02.pdf",width = 10, height = 10)
plot(venn_3,doWeight=T)
dev.off()

Fig5c

Script: pipeline.supplementary.plot.sh

All supplementary figure code can be found in pipeline.supplementary.plot.sh

jinlong / Han-Tibetan-ONT-SV

Han-Tibetan-ONT-SV.code

Cite Information

Pipeline for Multi-sample SV-calling and annotation

Script: pipeline.sv.calling.sh

Requirements

Summary

Visualization for chracteristics of SVs

Script: pipeline.plot.sh

Examples:

Script: pipeline.supplementary.plot.sh

简介

发行版

贡献者

近期动态

jinlong / Han-Tibetan-ONT-SV .gitee-modal { width: 500px !important; }

Han-Tibetan-ONT-SV.code

Cite Information

Pipeline for Multi-sample SV-calling and annotation

Script: pipeline.sv.calling.sh

Requirements

Summary

Visualization for chracteristics of SVs

Script: pipeline.plot.sh

Examples:

Script: pipeline.supplementary.plot.sh

简介

发行版

贡献者

近期动态

搜索帮助

jinlong / Han-Tibetan-ONT-SV