1. QC
- 利用
bcftools
将原vcf文件中含有字母的染色体名称转换为1-26的编号;
- 忽略scaffold,只选取26条染色体上的SNP进行下一步分析;
- 忽略名称为M1,M2,M3,M4的无关个体;
- 利用
plink
--geno
和--maf
命令分别进行缺失率和MAF的质控;
- 利用
plink
--set-missing-var-ids @:#
以“染色体:位置”作为SNP名称,确保其不为缺失;
# QC and format details
bcftools annotate --rename-chrs chr_name_change.txt filted.SNP.vcf --output filted.SNP2.vcf
plink --vcf filted.SNP2.vcf --chr 1-26 --make-bed --out P302
plink --bfile P302 --keep keepid.txt --make-bed --out P302_1 --chr-set 26
plink --bfile P302_1 --geno 0.2 --make-bed --out P302_2 --chr-set 26
plink --bfile P302_2 --mind 0.2 --make-bed --out P302_3 --chr-set 26
plink --bfile P302_2 --geno 0.05 --make-bed --out P302_3 --chr-set 26
plink --bfile P302_3 --maf 0.01 --make-bed --out P302_4 --chr-set 26
plink --bfile P302_4 --set-missing-var-ids @:# --make-bed --out P302_4_raw
plink --bfile P302_4_raw --maf 0.05 --make-bed --out plink --chr-set 26
(1.1 Crop-specific QC)
- 考虑到实验材料为植物,可以根据群体类型对位点进行质控。
- 利用
plink
--hardy
计算各标记位点杂合数;- 利用
hwe.R
绘制哈温平衡概率分布图,利用plink
--hwe
对p值>1e-6
的位点进行质控;
- 利用
inbred_filter.R
绘制近交系数分布图,并对inbred coefficient<0.8
的位点进行质控;
- 利用
plink --bfile P302_4_raw --hardy --out HWE_check --chr-set 26
# HWE(不做)
Rscript --no-save ./script/hwe.R
plink --bfile P302_4_raw --hwe 1e-6 --make-bed --out P302_4_hwe --chr-set 26
# FIT
Rscript --no-save ./script/inbred_filter.R
sed 's/"// g' fail-fit-site.txt | awk '{print$2}' > fail-fit-list.txt
plink --bfile P302_4_raw --exclude fail-fit-list.txt --make-bed --out P302_4_fit --chr-set 26
hwe.R
inbred_filter.R
hwe<-read.table (file="HWE_check.hwe", header=TRUE)
pdf("hist_F(IT).pdf")
F_IT<-1-hwe[,7]/hwe[,8]
hist(F_IT,main="Histogram F(IT)")
dev.off()
pdf("hist_F(IT)_zoom.pdf")
hist(F_IT[which(F_IT>0.5)],main="Histogram F(IT) zoom")
dev.off()
write.table(hwe[which(F_IT<0.8),2],file = "fail-fit-site.txt",col.names = F)
2. SNP summary
- 2.1 统计各染色体长度、SNP个数和SNP密度分布图
- 绘制SNP密度分布图
- 2.2 计算LD衰减值
- 统计各染色体LD衰减值=0.2时的距离
2.1 SNP number and density
# chr length and SNP number per chr
head -n200 filted.SNP.vcf | grep "##contig=<ID=Ghir_" | sed 's/^.*length=//g' | sed 's/>//g'
cut -f1 P302_4_raw.bim | uniq -c
# SNP density plot
vcftools --vcf P302_4_raw.vcf --SNPdensity 100000 --out SNPden_raw
vcftools --vcf P302_4_fit.vcf --SNPdensity 100000 --out SNPden_fit
Rscript SNPdensity_plot.R
SNPdensity_plot.R
library(circlize)
library(viridis)
layout(matrix(1:2,1,2))
df1<-read.table("SNPden_raw.snpden",sep="\t",header=T)
df2<-read.table("SNPden_fit.snpden",sep="\t",header=T)
circle_plot <- function(df)
{
df<-df[,c(1,2,4)]
colnames(df)<-c("Chr","X","Y")
df$Chr<-as.factor(df$Chr)
levels(df$Chr)<-c(as.character(c(1:13)),rev(as.character(c(14:26))))
df$X<-df$X/1000000
options(scipen=999)
col<-c(viridis(13,0.2,0.4,alpha=0.8),rev(viridis(13,0.8,1,alpha=0.8)))
circos.initialize(factors=df$Chr,x=df$X)
circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.05)
for(i in 1:26){
highlight.sector(sector.index = as.character(i),col=col[i])
if(i<=13){
circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))),
mean(get.cell.meta.data("ylim")),
labels = paste0("A",i),
sector.index = as.character(i),cex=0.8)
}
else{
circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))),
mean(get.cell.meta.data("ylim")),
labels = paste0("D",i-13),
sector.index = as.character(i),cex=0.8)
}
}
col<-c(viridis(13,0.2,0.4,alpha=0.5),rev(viridis(13,0.8,1,alpha=0.5)))
circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.05)
for(i in 1:26){
highlight.sector(sector.index = as.character(i),track.index = 2,col=col[i])
if(i<=13){
circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))),
mean(get.cell.meta.data("ylim")),
labels = i,
sector.index = as.character(i),cex=0.8)
}
else{
circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))),
mean(get.cell.meta.data("ylim")),
labels = i,
sector.index = as.character(i),cex=0.8)
}
}
col<-c(viridis(13,0.2,0.4,alpha=0.8),viridis(13,0.8,1,alpha=0.8))
circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.3, ylim = c(0,40))
circos.trackLines(df$Chr,df$X,df$Y,col=col)
draw.sector(get.cell.meta.data("cell.start.degree", sector.index = "1"),
get.cell.meta.data("cell.end.degree", sector.index = "13"),
rou1 = get.cell.meta.data("cell.bottom.radius", track.index = 3), col = col[6])
draw.sector(get.cell.meta.data("cell.end.degree", sector.index = "13"),
get.cell.meta.data("cell.start.degree", sector.index = "1"),
rou1 = get.cell.meta.data("cell.bottom.radius", track.index = 3), col = col[19])
circos.clear()
}
circle_plot(df1)
circle_plot(df2)
2.2 LD decay
- 方法一:利用plink
--r2
命令可以得到一定大小窗口下标记之间的LD值。再根据标记之间距离大小的不同设置一组bin,对各bin内所有\(r^2\)求平均值,并绘制LD衰减图;
- 方法二:借助PopLDdecay和
ld_chr_plot.sh
,分染色体绘制LD衰减图;
- 主要使用方法二
# Method I
plink --bfile P302_4_raw --chr 1 --thin 0.05 --r2 --ld-window-r2 0 --ld-window 99999 --ld-window-kb 1000
Rscript --no-save LD_decay_plot.R
# Method II
nohup sh ld_all_plot.sh P302_4_raw &
# ld decay = 0.2
nohup sh ld_chr_plot.sh P302_4_raw &
for i in `seq 26`; do Rscript --no-save cal_ld_thre.R chr$i.bin ; done
ld_decay_plot.R
library(viridis)
arg <- commandArgs(T)
filename <- arg[1]
picname <- paste0(filename,".tiff")
dt <- read.table(filename)
dt <- dt[,1:2]
dt[,1] <- dt[,1]/1000
tiff(picname, width = 1500, height = 1550,
units = "px", res = 300, compression="lzw")
plot(dt, xlab = "Distance (Kb)", ylab = expression(LD~(r^{2})),
main = "LD decay", sub = filename, type = "l", col=viridis(1,0.8), lwd=2)
abline(h=0.2,col="grey40",lty=2,lwd=0.7)
abline(h=0.1,col="grey40",lty=2,lwd=0.7)
dev.off()
ld_all_plot.sh
#!/bin/bash
file=$1
mkdir ld_$file
cd ld_$file/
for i in `seq 26`
do
{
plink --bfile ../$file --chr $i --chr-set 26 --recode vcf-iid --out chr$i
gzip chr$i.vcf
PopLDdecay -InVCF chr$i.vcf.gz -MaxDist 10000 -OutStat chr$i
} &
done
wait
ls chr*.stat.gz > all_list.txt
perl ~/software/PopLDdecay/bin/Plot_OnePop.pl -inList all_list.txt -bin1 500 -bin2 50000 -output all
cd ..
gzip -d all.bin.gz
Rscript --no-save ../script/ld_decay_plot.R all.bin
ld_chr_plot.sh
#!/bin/bash
file=$1
mkdir ld_$file
cd ld_$file/
for i in `seq 26`
do
{
plink --bfile ../$file --chr $i --chr-set 26 --recode vcf-iid --out chr$i
gzip chr$i.vcf
PopLDdecay -InVCF chr$i.vcf.gz -MaxDist 10000 -OutStat chr$i
perl ~/software/PopLDdecay/bin/Plot_OnePop.pl -inFile chr$i.stat.gz -bin1 500 -bin2 50000 -output chr$i
gzip -d chr$i.bin.gz
} &
done
cd ..
3. Population Stratification
3.1 PCA
- 利用
plink
--pca
命令得到.eigenval
和.eigenvec
文件
- 利用
EIGENSOFT
twstats 方法,计算显著的eigenval个数
PCA_plot.R
library(ggplot2)
file="indep"
pca_num=5
vec_file=paste0(file,".eigenvec")
val_file=paste0(file,".eigenval")
vec <- read.table(vec_file,header = FALSE)
colnames(vec)<-c("FID","IID",paste0(rep("PC",pca_num),c(1:pca_num)))
ggplot(vec,aes(x=PC1,y=PC2))+geom_point()+labs(title=vec_file)+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
# percentage of explained variance
val <- read.table(val_file,header = FALSE)
per <- val[,1]/sum(val[which(val[,1]>1),1])
plot(per[1:30],type = "b", pch=18, cex=0.8,
xlab = "Principal Component",
ylab = "Proportion of Variance Explained")
plot(cumsum(per[1:30]),type = "b", pch=18, cex=0.8,
xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained")
3.2 EigenGWAS
P302_4_raw/_fit.bed/.bim/.fam
to do eigenGWAS.
3.3 fastStructure
- 根据LD decay值,进行适当的LD pruning,得到相互独立的SNP,利用这些SNP做structure分析;
- 下载
fastStructure
,建议在conda中配置环境。
- 下载
fastStructure
后直接运行会报错,需要根据Link对distruct.py进行修改。
- 再借助R包
pophelper
绘图
# LD pruning 1
plink --bfile P302_4_raw --indep-pairwise 300 5 0.2 --out indepSNP_w300 --chr-set 26
plink --bfile P302_4_raw --extract indepSNP_w300.prune.in --make-bed --out P302_5_raw --chr-set 26
# LD pruning 2 (after filtering by FIT)
plink --bfile P302_4_fit --indep-pairwise 100 5 0.2 --out indepSNP_w100_fit --chr-set 26
plink --bfile P302_4_fit --extract indepSNP_w100_fit.prune.in --make-bed --out P302_5_fit --chr-set 26
# Asubgenome (3000kb 5 0.2)
# Dsubgenome (1500kb 5 0.2)
plink --bfile plink --indep-pairwise 3000kb 5 0.2 --chr 1-13 --out indepA --chr-set 26
plink --bfile plink --indep-pairwise 1500kb 5 0.2 --chr 14-26 --out indepD --chr-set 26
cat indepA.prune.in >> indepAD.prune.in
cat indepD.prune.in >> indepAD.prune.in
plink --bfile plink --extract indepAD.prune.in --make-bed --out indepAD --chr-set 26
conda activate fStructure
python structure.py -K 3 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output
python chooseK.py --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output
python distruct.py -K 3 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output_distruct.svg
nohup python structure.py -K 2 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &
nohup python structure.py -K 4 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &
nohup python structure.py -K 5 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &
structure_plot.R
library(pophelper)
structure_plot<-function(file,k1,k2)
{
phe.lst<-as.data.frame(read.csv("names.csv",header = F))
phe.lst$V1<-as.character(phe.lst$V1)
ffiles<-paste0(file,".",c(k1:k2),".meanQ")
flist <- readQ(files=ffiles)
rownames(flist[[1]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
rownames(flist[[2]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
rownames(flist[[3]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
rownames(flist[[4]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
#jpeg(paste(file,"_fStructure.jpg",sep=""))
p<-plotQ(flist, imgoutput = "join", sortind = "all", sharedindlab=F, panelspacer = 0.3,
showindlab=T, useindlab=T,indlabsize=1, indlabangle = 45, indlabspacer = -2,
splabsize=5,
showlegend=T, legendpos="right",legendkeysize=10, legendtextsize=5,
returnplot=T, exportplot=F)
p$plot[[1]]
#dev.off()
}
file="indepA_Q"
file="indepD_Q"
file="indepAD_Q"
structure_plot(file,2,4)
3.4 phylogenetic tree
- 下载并安装
phylip
;
- 同样也需要用LD pruning之后的数据;
- 考虑到phylip要求的个体名称需要有十个字符,可以使用R脚本
id_fill.R
来对原个体名称进行修改,得到文件id4phylip.txt
;
- 使用
vcf2phylip.py
将vcf文件格式转化为phylip文件格式;得到P302_5_hwe.min4.phy文件,其中第一行为构建进化树的样品数以及每个样品使用的snp数目,第二行及以下为每个样品的名称及snp的具体内容。 - 运行
phylip
(失败ing)
# install Phylip
tar -zxvf phylip-3.697.tar.gz
cd phylip-3.697/src/
cp Makefile.unx Makefile
make install
echo export LD_LIBRARY_PATH=/home/qxzhang/software/phylip-3.697/exe:$LOAD_LIBRARY_PATH >> ~/.bashrc
# provide individual ID for phylip(ID name length=10)
Rscript --no-save ./3_Script/id_fill.R
# convert plink files to phylip files
plink --bfile P302_5_hwe --update-ids id4phylip.txt --recode vcf-iid --out P302_5_hwe --chr-set 26
# download code from https://github.com/edgardomortiz/vcf2phylip
unzip vcf2phylip-master.zip
python ~/software/vcf2phylip.py --input P302_5_hwe.vcf
# run phylip
echo -ne "P302_5_hwe.min4.phy\nr\n100\ny\n9\n" > seqboot.par
echo -ne "seqboot.out\nt\n2.3628\nm\nd\n100\n2\ny\n" > dnadist.par
echo -ne "dnadist.out\nm\n100\n9\ny\n" > neighbor.par
echo -ne "nei.tree y" > consense.par
seqboot<seqboot.par && mv outfile seqboot.out
dnadist<dnadist.par && mv outfile dnadist.out
neighbor<neighbor.par && mv outfile nei.out && mv outtree nei.tree
consense<consense.par && mv outfile cons.out && mv outtree constree
4. Association(P model)
- 在进行GWAS之前可以对SNP进行一个高LD值的质控。
- 注意:对应的QQ图和曼哈顿图的版本需要对应plink的版本。
plink --bfile plink --indep-pairwise 500kb 5 0.9 --out indep --chr-set 26
# plink1.9
plink --bfile P302_4_fit --linear --pheno blup.phe --all-pheno --covar P302_4_fit.eigenvec --covar-number 1-5 --out P302_4_fit_blup_pca --allow-no-sex --chr-set 26
# plink2.0
plink2 --bfile indep --glm allow-no-covars --pheno blup.phe --out indep-nocov --chr-set 26
plink2 --bfile indep --glm --covar indep.eigenvec --covar-col-nums 3 --pheno blup.phe --out indep-1pca --chr-set 26
# QQ plot and Manhattan plot
Rscript --no-save ./script/QQ_plot.R [filepath] [filename]
Rscript --no-save ./script/Manhattan.R [filepath] [filename]
QQ_plot.R
# for plink1.9
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".QQ.tiff")
tiff(filename = pic_file, width = 1500, height = 1500,
units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
z = qnorm(assoc$P/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
qq(assoc$P, main = file, sub=paste("lamda=",lambda),cex=0.2)
# for plink2.0
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".QQ.tiff")
tiff(filename = pic_file, width = 1500, height = 1500,
units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
z = qnorm(assoc[,4]/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
qq(assoc[,4], main = file, sub=paste("lamda=",lambda),cex=0.2)
Manhattan.R
# for plink1.9
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".Mht.tiff")
tiff(filename = pic_file, width = 3000, height = 1500,
units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
#assoc<-assoc[-which(is.na(assoc$P)),]
colnames(assoc)<-c("CHR","BP","SNP","P")
manhattan(assoc,chr="CHR",bp="BP",p="P",snp="SNP", main = file, cex=0.2)
dev.off()
# for plink2.0
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".Mht.tiff")
tiff(filename = pic_file, width = 3000, height = 1500,
units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=FALSE, comment.char = "")
colnames(assoc)<-c("SNP","CHR","BP","P")
manhattan(assoc,chr="CHR",bp="BP",p="P",snp="SNP", main = file, cex=0.2)
5. Association(P+K model)
- 需要借助
TASSEL
计算亲缘关系矩阵,并放入GWAS模型中;
- 绘制QQ图和曼哈顿图;
# indep.map indep.ped
plink --bfile indep --recode --out indep --chr-set 26
# get kinship matrix(indep.kinship.txt)
run_pipeline.pl -Xmx24g -plink -ped indep.ped -map indep.map -KinshipPlugin -method Centered_IBS -endPlugin -ExportPlugin -saveAs indep.kinship -format SqrMatrix
# run GWAS P+K model
sh gwasPK.sh indep BN
gwasPK.sh
#!bin/bash
############# Parameter ################
pca_num=5
phe_num=9
trt=(BN BW LP LY FL FU MC FS FE)
file=$1
phe_name=$2
pca_file_o=${file}.eigenvec
pca_file=${file}.pca${pca_num}.txt
phe_file_o=${file}.phe
phe_file=${file}.phenotype.txt
########## input covariate file ###########
# pca_file
echo -ne "<Covariate>\n<Trait>" >> ${pca_file}
for id in $(seq 1 $pca_num)
do
echo -ne "\tPC$id" >> ${pca_file}
done
echo -ne "\n" >> ${pca_file}
cut -d " " -f2-$(($pca_num+2)) ${pca_file_o} >> ${pca_file}
echo "tassel covariate file made!"
########## input phenotype file ###########
# phe_file
echo -ne "<Phenotype>\n" > ${phe_file}
echo -ne "taxa\t" >> ${phe_file}
for id in `seq $phe_num`
do
echo -ne "data\t" >> ${phe_file}
done
echo -ne "\n" >> ${phe_file}
cut --complement -d " " -f2 ${phe_file_o} >> ${phe_file}
sed -i 's/FID/Taxa/g' ${phe_file}
echo "tassel phenotype file made!"
############# run GWAS model ##################
# "P+K" model
run_pipeline.pl -Xmx24g -fork1 -plink -ped ${file}.ped -map ${file}.map -fork2 -r ${file}.phenotype.txt -fork3 -k ${file}.kinship.txt -fork4 -q $pca_file -combine5 -input1 -input2 -input3 -input4 -intersect -FixedEffectLMPlugin -endPlugin -export ${file}_p${pca_num}k_glm
# QQ_plot
for i in `seq ${phe_num}`
do
{
awk '{if($1=="'${phe_name}'")print $2,$3,$4,$6}' ${file}_p${pca_num}k_glm1.txt > ${file}_p${pca_num}k_glm.${phe_name}.txt
Rscript --no-save QQ_plot.R ./ ${file}_p${pca_num}k_glm.${phe_name}.txt
Rscript --no-save Manhattan.R ./ ${file}_p${pca_num}k_glm.${phe_name}.txt
} &
done
wait
rm $pca_file
rm ${file}_p${pca_num}k_glm.${phe_name}.txt
6. GMDR
- GPU服务器运行
QTXNetwork
中GMDR模块;
- 由于GMDR要求基因型文件和表型文件的个体数一致,因此在keepid.txt中去掉缺少表型的个体;
- 由于GMDR对基因型文件与plink不同,利用脚本
plk2gmdr.sh
进行格式转化得到GMDR基因型文件,此外对表型文件也根据要求做简单调整即可;
plk2gmdr.sh
plk2gmdr_bychr.sh
#!/bin/bash
#plink -> gmdr
file=$1
for i in `seq 26`
do
{
plink --silent --bfile $file --chr $i --keep keepid.txt --recode A --out $file.$i --chr-set 26
cut --complement -d " " -f2-6 $file.$i.raw > $file.$i.gen
sed -i 's/FID/#Ind/g' $file.$i.gen
}&
done
wait
rm $file.*.nosex
rm $file.*.log
rm $file.*.raw
7. QTS
gmdr2qts.sh
#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
line=(`cut -f1 $file.out | grep Trait -n | cut -d ":" -f1`)
line[trt_num]=`awk 'END{print NR}' $file.out`
for i in `seq $trt_num`
do
{
#seperate into traits
awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.out > $file.${trt[i-1]}.out
#substract SNP ids
awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }' $file.${trt[i-1]}.out | sort -u > $file.0.${trt[i-1]}.list
sed -i 's/_[ATCG]//g' $file.0.${trt[i-1]}.list
#recode with GMDR significant SNP only
plink --silent --bfile ../indep --extract $file.0.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
#recode 0/1/2 to A/H/B
cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
rm $file.*.out
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
n=$[$n-1]
m=$[$m-1]
echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
echo -e "_Observations\t${n}" >> $file.${trt[i-1]}.phe
echo -e "_Environments\tno" >> $file.${trt[i-1]}.phe
echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
echo -e "Geno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
awk -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print $1,$a,";"}' $file.phe >> $file.${trt[i-1]}.phe
echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done
gmdr2qts_bychr.sh
- 注意:还需要包括不分染色体时的结果(\(file.0.\){trt[i-1]}.list)
#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
for c in `seq 26`
do
{
line=(`cut -f1 $file.$c.out | grep Trait -n | cut -d ":" -f1`)
line[trt_num]=`awk 'END{print NR}' $file.$c.out`
for i in `seq $trt_num`
do
{
#seperate into traits
awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.$c.out > $file.$c.${trt[i-1]}.out
#substract SNP ids
awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }' $file.$c.${trt[i-1]}.out | sort -u > $file.$c.${trt[i-1]}.list
sed -i 's/_[ATCG]//g' $file.$c.${trt[i-1]}.list
}
done
} &
done
wait
echo "list files produced!"
for i in `seq $trt_num`
do
{
#combine all lists(26+1) by traits
cat `ls $file.*.${trt[i-1]}.list` | sort -u > $file.${trt[i-1]}.list
#recode with GMDR significant SNP only
plink --silent --bfile ../indep --extract $file.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
#recode 0/1/2 to A/H/B
cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
echo "QTS (.gen) files prepared!"
rm $file.*.*.out
rm $file.*.list
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
n=$[$n-1]
m=$[$m-1]
echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
echo -e "_Observations\t${n}" >> $file.${trt[i-1]}.phe
echo -e "_Environments\tno" >> $file.${trt[i-1]}.phe
echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
echo -e "Geno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
awk -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print $1,$a,";"}' $file.phe >> $file.${trt[i-1]}.phe
echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done
wait
echo "QTS (.phe) files prepared!"
gmdr2qts_byenv.sh
#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
for e in `seq 7`
do
{
line=(`cut -f1 $file.e$e.out | grep Trait -n | cut -d ":" -f1`)
line[trt_num]=`awk 'END{print NR}' $file.e$e.out`
for i in `seq $trt_num`
do
{
#seperate into traits
awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.e$e.out > $file.e$e.${trt[i-1]}.out
#substract SNP ids
awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }' $file.e$e.${trt[i-1]}.out | sort -u > $file.e$e.${trt[i-1]}.list
sed -i 's/_[ATCG]//g' $file.e$e.${trt[i-1]}.list
}
done
} &
done
wait
echo "list files produced!"
for i in `seq $trt_num`
do
{
#combine all environments by traits
cat `ls $file.*.${trt[i-1]}.list` | sort -u > $file.${trt[i-1]}.list
#recode with GMDR significant SNP only
plink --silent --bfile ../indep --extract $file.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
#recode 0/1/2 to A/H/B
cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
echo "QTS (.gen) files prepared!"
rm $file.*.*.out
rm $file.*.*.list
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
n=$[$n-1]
m=$[$m-1]
echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
echo -e "_Observations\t$[$n*7]" >> $file.${trt[i-1]}.phe
echo -e "_Environments\tyes" >> $file.${trt[i-1]}.phe
echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
echo -e "Env#\tGeno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
for e in `seq 7`
do
{
awk -ve=$e -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print e,$1,$a,";"}' $file.e$e.phe >> $file.${trt[i-1]}.phe
}
done
echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done
wait
echo "QTS (.phe) files prepared!"