1. QC

  • 利用 bcftools 将原vcf文件中含有字母的染色体名称转换为1-26的编号;
  • 忽略scaffold,只选取26条染色体上的SNP进行下一步分析;
  • 忽略名称为M1,M2,M3,M4的无关个体;
  • 利用 plink --geno--maf 命令分别进行缺失率和MAF的质控;
  • 利用 plink --set-missing-var-ids @:# 以“染色体:位置”作为SNP名称,确保其不为缺失;
# QC and format details
bcftools annotate --rename-chrs chr_name_change.txt filted.SNP.vcf --output filted.SNP2.vcf 
plink --vcf filted.SNP2.vcf --chr 1-26 --make-bed --out P302
plink --bfile P302 --keep keepid.txt --make-bed --out P302_1 --chr-set 26
plink --bfile P302_1 --geno 0.2 --make-bed --out P302_2 --chr-set 26
plink --bfile P302_2 --mind 0.2 --make-bed --out P302_3 --chr-set 26
plink --bfile P302_2 --geno 0.05 --make-bed --out P302_3 --chr-set 26
plink --bfile P302_3 --maf 0.01 --make-bed --out P302_4 --chr-set 26
plink --bfile P302_4 --set-missing-var-ids @:# --make-bed --out P302_4_raw

plink --bfile P302_4_raw --maf 0.05 --make-bed --out plink --chr-set 26

(1.1 Crop-specific QC)

  • 考虑到实验材料为植物,可以根据群体类型对位点进行质控。
  • 利用 plink --hardy 计算各标记位点杂合数;
    • 利用 hwe.R 绘制哈温平衡概率分布图,利用 plink --hwe 对p值>1e-6的位点进行质控;
    • 利用 inbred_filter.R 绘制近交系数分布图,并对inbred coefficient<0.8的位点进行质控;
plink --bfile P302_4_raw --hardy --out HWE_check --chr-set 26
# HWE(不做)
Rscript --no-save ./script/hwe.R
plink --bfile P302_4_raw --hwe 1e-6 --make-bed --out P302_4_hwe --chr-set 26
# FIT
Rscript --no-save ./script/inbred_filter.R
sed 's/"// g' fail-fit-site.txt | awk '{print$2}' > fail-fit-list.txt 
plink --bfile P302_4_raw --exclude fail-fit-list.txt --make-bed --out P302_4_fit --chr-set 26

hwe.R

hwe<-read.table (file="HWE_check.hwe", header=TRUE)
pdf("histhwe.pdf")
hist(hwe[,9],main="Histogram HWE")
dev.off()
hwe_zoom<-read.table (file="HWE_checkzoom.hwe", header=TRUE)
pdf("histhwe_below_threshold.pdf")
hist(hwe_zoom[,9],main="Histogram HWE: strongly deviating SNPs only")
dev.off()

inbred_filter.R

hwe<-read.table (file="HWE_check.hwe", header=TRUE)
pdf("hist_F(IT).pdf")
F_IT<-1-hwe[,7]/hwe[,8]
hist(F_IT,main="Histogram F(IT)")
dev.off()
pdf("hist_F(IT)_zoom.pdf")
hist(F_IT[which(F_IT>0.5)],main="Histogram F(IT) zoom")
dev.off()
write.table(hwe[which(F_IT<0.8),2],file = "fail-fit-site.txt",col.names = F)

2. SNP summary

  • 2.1 统计各染色体长度、SNP个数和SNP密度分布图
    • 绘制SNP密度分布图
  • 2.2 计算LD衰减值
    • 统计各染色体LD衰减值=0.2时的距离

2.1 SNP number and density

# chr length and SNP number per chr
head -n200 filted.SNP.vcf | grep "##contig=<ID=Ghir_" | sed 's/^.*length=//g' | sed 's/>//g'
cut -f1 P302_4_raw.bim | uniq -c

# SNP density plot
vcftools --vcf P302_4_raw.vcf --SNPdensity 100000 --out SNPden_raw
vcftools --vcf P302_4_fit.vcf --SNPdensity 100000 --out SNPden_fit
Rscript SNPdensity_plot.R

SNPdensity_plot.R

library(circlize)
library(viridis)

layout(matrix(1:2,1,2))

df1<-read.table("SNPden_raw.snpden",sep="\t",header=T)
df2<-read.table("SNPden_fit.snpden",sep="\t",header=T)

circle_plot <- function(df)
{
  df<-df[,c(1,2,4)]
  colnames(df)<-c("Chr","X","Y")
  df$Chr<-as.factor(df$Chr)
  levels(df$Chr)<-c(as.character(c(1:13)),rev(as.character(c(14:26))))
  df$X<-df$X/1000000
  options(scipen=999)
  
  col<-c(viridis(13,0.2,0.4,alpha=0.8),rev(viridis(13,0.8,1,alpha=0.8)))
  circos.initialize(factors=df$Chr,x=df$X)
  circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.05)
  for(i in 1:26){
    highlight.sector(sector.index = as.character(i),col=col[i])
    if(i<=13){
      circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))), 
                  mean(get.cell.meta.data("ylim")),
                  labels = paste0("A",i),
                  sector.index = as.character(i),cex=0.8)
    }
    else{
      circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))), 
                  mean(get.cell.meta.data("ylim")),
                  labels = paste0("D",i-13),
                  sector.index = as.character(i),cex=0.8)
    }
  }
  col<-c(viridis(13,0.2,0.4,alpha=0.5),rev(viridis(13,0.8,1,alpha=0.5)))
  circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.05)
  for(i in 1:26){
    highlight.sector(sector.index = as.character(i),track.index = 2,col=col[i])
    if(i<=13){
      circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))), 
                  mean(get.cell.meta.data("ylim")),
                  labels = i,
                  sector.index = as.character(i),cex=0.8)
    }
    else{
      circos.text(mean(get.cell.meta.data("xlim",sector.index = as.character(i))), 
                  mean(get.cell.meta.data("ylim")),
                  labels = i,
                  sector.index = as.character(i),cex=0.8)
    }
  }
  col<-c(viridis(13,0.2,0.4,alpha=0.8),viridis(13,0.8,1,alpha=0.8))
  circos.trackPlotRegion(factors=df$Chr,y=df$Y,track.height = 0.3, ylim = c(0,40))
  circos.trackLines(df$Chr,df$X,df$Y,col=col)
  
  draw.sector(get.cell.meta.data("cell.start.degree", sector.index = "1"), 
              get.cell.meta.data("cell.end.degree", sector.index = "13"), 
              rou1 = get.cell.meta.data("cell.bottom.radius", track.index = 3), col = col[6])
  draw.sector(get.cell.meta.data("cell.end.degree", sector.index = "13"), 
              get.cell.meta.data("cell.start.degree", sector.index = "1"), 
              rou1 = get.cell.meta.data("cell.bottom.radius", track.index = 3), col = col[19])
  circos.clear()
}
circle_plot(df1)
circle_plot(df2)

2.2 LD decay

  • 方法一:利用plink --r2 命令可以得到一定大小窗口下标记之间的LD值。再根据标记之间距离大小的不同设置一组bin,对各bin内所有\(r^2\)求平均值,并绘制LD衰减图;
  • 方法二:借助PopLDdecay和 ld_chr_plot.sh ,分染色体绘制LD衰减图;
  • 主要使用方法二
# Method I
plink --bfile P302_4_raw --chr 1 --thin 0.05 --r2 --ld-window-r2 0 --ld-window 99999 --ld-window-kb 1000
Rscript --no-save LD_decay_plot.R
# Method II
nohup sh ld_all_plot.sh P302_4_raw &
# ld decay = 0.2
nohup sh ld_chr_plot.sh P302_4_raw &
for i in `seq 26`; do Rscript --no-save cal_ld_thre.R  chr$i.bin ; done

ld_decay_plot.R

library(viridis)
arg <- commandArgs(T)
filename <- arg[1]
picname <- paste0(filename,".tiff")
dt <- read.table(filename)
dt <- dt[,1:2]
dt[,1] <- dt[,1]/1000
tiff(picname, width = 1500, height = 1550, 
     units = "px", res = 300, compression="lzw")
plot(dt, xlab = "Distance (Kb)", ylab = expression(LD~(r^{2})), 
     main = "LD decay", sub = filename, type = "l", col=viridis(1,0.8), lwd=2)
abline(h=0.2,col="grey40",lty=2,lwd=0.7)
abline(h=0.1,col="grey40",lty=2,lwd=0.7)
dev.off()

ld_all_plot.sh

#!/bin/bash
file=$1
mkdir ld_$file
cd ld_$file/
for i in `seq 26`
do
{
  plink --bfile ../$file --chr $i --chr-set 26 --recode vcf-iid --out chr$i
  gzip chr$i.vcf
  PopLDdecay -InVCF chr$i.vcf.gz -MaxDist 10000 -OutStat chr$i
} &
done
wait
ls chr*.stat.gz > all_list.txt
perl ~/software/PopLDdecay/bin/Plot_OnePop.pl -inList all_list.txt -bin1 500 -bin2 50000 -output all
cd ..

gzip -d all.bin.gz
Rscript --no-save ../script/ld_decay_plot.R all.bin

ld_chr_plot.sh

#!/bin/bash
file=$1
mkdir ld_$file
cd ld_$file/
for i in `seq 26`
do
{
  plink --bfile ../$file --chr $i --chr-set 26 --recode vcf-iid --out chr$i
  gzip chr$i.vcf
  PopLDdecay -InVCF chr$i.vcf.gz -MaxDist 10000 -OutStat chr$i
  perl ~/software/PopLDdecay/bin/Plot_OnePop.pl -inFile chr$i.stat.gz -bin1 500 -bin2 50000 -output chr$i
  gzip -d chr$i.bin.gz
} &
done
cd ..

cal_ld_thre.R

file<-commandArgs(T)
data<-read.table(file)
thred<-which(data[,2]<0.2)[1]
print(paste(data[thred-1,1]/1000000,"-",data[thred,1]/1000000,sep=""))

3. Population Stratification

3.1 PCA

  • 利用 plink --pca 命令得到.eigenval.eigenvec文件
  • 利用 EIGENSOFT twstats 方法,计算显著的eigenval个数
# PCA 2 (after filtering by FIT)
plink --bfile P302_4_fit --pca 20 --out P302_4_fit --chr-set 26
plink --bfile P302_5_fit --pca 20 --out P302_5_fit --chr-set 26
twstats -t twtable -i P302_4_fit.eigenval -o test
twstats -t twtable -i P302_5_fit.eigenval -o test

PCA_plot.R

library(ggplot2)
file="indep"
pca_num=5
vec_file=paste0(file,".eigenvec")
val_file=paste0(file,".eigenval")
vec <- read.table(vec_file,header = FALSE)
colnames(vec)<-c("FID","IID",paste0(rep("PC",pca_num),c(1:pca_num)))
ggplot(vec,aes(x=PC1,y=PC2))+geom_point()+labs(title=vec_file)+
  theme_classic()+theme(plot.title = element_text(hjust = 0.5)) 

# percentage of explained variance
val <- read.table(val_file,header = FALSE)
per <- val[,1]/sum(val[which(val[,1]>1),1])
plot(per[1:30],type = "b", pch=18, cex=0.8,
     xlab = "Principal Component",
     ylab = "Proportion of Variance Explained")
plot(cumsum(per[1:30]),type = "b", pch=18, cex=0.8,
     xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained")

3.2 EigenGWAS

  • P302_4_raw/_fit.bed/.bim/.fam to do eigenGWAS.

3.3 fastStructure

  • 根据LD decay值,进行适当的LD pruning,得到相互独立的SNP,利用这些SNP做structure分析;
  • 下载fastStructure,建议在conda中配置环境。
  • 下载fastStructure后直接运行会报错,需要根据Link对distruct.py进行修改。
  • 再借助R包pophelper绘图
# LD pruning 1
plink --bfile P302_4_raw --indep-pairwise 300 5 0.2 --out indepSNP_w300 --chr-set 26
plink --bfile P302_4_raw --extract indepSNP_w300.prune.in --make-bed --out P302_5_raw --chr-set 26

# LD pruning 2 (after filtering by FIT)
plink --bfile P302_4_fit --indep-pairwise 100 5 0.2 --out indepSNP_w100_fit --chr-set 26
plink --bfile P302_4_fit --extract indepSNP_w100_fit.prune.in --make-bed --out P302_5_fit --chr-set 26

# Asubgenome (3000kb 5 0.2) 
# Dsubgenome (1500kb 5 0.2)
plink --bfile plink --indep-pairwise 3000kb 5 0.2 --chr 1-13 --out indepA --chr-set 26
plink --bfile plink --indep-pairwise 1500kb 5 0.2 --chr 14-26 --out indepD --chr-set 26
cat indepA.prune.in >> indepAD.prune.in
cat indepD.prune.in >> indepAD.prune.in
plink --bfile plink --extract indepAD.prune.in --make-bed --out indepAD --chr-set 26
conda activate fStructure
python structure.py -K 3 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output 
python chooseK.py --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output 
python distruct.py -K 3 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output_distruct.svg

nohup python structure.py -K 2 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &
nohup python structure.py -K 4 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &
nohup python structure.py -K 5 --input=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe --output=/public3/zqx/Cotton/1_SNP/3_fastStructure/P302_5_hwe_output &

structure_plot.R

library(pophelper)
structure_plot<-function(file,k1,k2)
{
  phe.lst<-as.data.frame(read.csv("names.csv",header = F))
  phe.lst$V1<-as.character(phe.lst$V1)
  ffiles<-paste0(file,".",c(k1:k2),".meanQ")
  flist <- readQ(files=ffiles)
  
  rownames(flist[[1]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
  rownames(flist[[2]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
  rownames(flist[[3]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
  rownames(flist[[4]])<-phe.lst[sort(phe.lst$V1,index.return=TRUE)$ix,5]
  
  #jpeg(paste(file,"_fStructure.jpg",sep=""))
  p<-plotQ(flist, imgoutput = "join", sortind = "all", sharedindlab=F, panelspacer = 0.3,
           showindlab=T, useindlab=T,indlabsize=1, indlabangle = 45, indlabspacer = -2, 
           splabsize=5, 
           showlegend=T, legendpos="right",legendkeysize=10, legendtextsize=5,
           returnplot=T, exportplot=F)
  p$plot[[1]]
  #dev.off()
}
file="indepA_Q"
file="indepD_Q"
file="indepAD_Q"
structure_plot(file,2,4)

3.4 phylogenetic tree

  • 下载并安装 phylip;
  • 同样也需要用LD pruning之后的数据;
  • 考虑到phylip要求的个体名称需要有十个字符,可以使用R脚本id_fill.R来对原个体名称进行修改,得到文件id4phylip.txt
  • 使用vcf2phylip.py将vcf文件格式转化为phylip文件格式;得到P302_5_hwe.min4.phy文件,其中第一行为构建进化树的样品数以及每个样品使用的snp数目,第二行及以下为每个样品的名称及snp的具体内容。
  • 运行phylip(失败ing)
# install Phylip
tar -zxvf phylip-3.697.tar.gz
cd phylip-3.697/src/
cp Makefile.unx Makefile
make install
echo export LD_LIBRARY_PATH=/home/qxzhang/software/phylip-3.697/exe:$LOAD_LIBRARY_PATH >> ~/.bashrc
# provide individual ID for phylip(ID name length=10)
Rscript --no-save ./3_Script/id_fill.R
# convert plink files to phylip files
plink --bfile P302_5_hwe --update-ids id4phylip.txt --recode vcf-iid --out P302_5_hwe --chr-set 26
# download code from https://github.com/edgardomortiz/vcf2phylip
unzip vcf2phylip-master.zip
python ~/software/vcf2phylip.py --input P302_5_hwe.vcf
# run phylip
echo -ne "P302_5_hwe.min4.phy\nr\n100\ny\n9\n" > seqboot.par
echo -ne "seqboot.out\nt\n2.3628\nm\nd\n100\n2\ny\n" > dnadist.par
echo -ne "dnadist.out\nm\n100\n9\ny\n" > neighbor.par
echo -ne "nei.tree y" > consense.par
seqboot<seqboot.par && mv outfile seqboot.out
dnadist<dnadist.par && mv outfile dnadist.out
neighbor<neighbor.par && mv outfile nei.out && mv outtree nei.tree
consense<consense.par && mv outfile cons.out && mv outtree constree

4. Association(P model)

  • 在进行GWAS之前可以对SNP进行一个高LD值的质控。
  • 注意:对应的QQ图和曼哈顿图的版本需要对应plink的版本。
plink --bfile plink --indep-pairwise 500kb 5 0.9 --out indep --chr-set 26
# plink1.9
plink --bfile P302_4_fit --linear --pheno blup.phe --all-pheno --covar P302_4_fit.eigenvec --covar-number 1-5 --out P302_4_fit_blup_pca --allow-no-sex --chr-set 26
# plink2.0 
plink2 --bfile indep --glm allow-no-covars --pheno blup.phe --out indep-nocov --chr-set 26
plink2 --bfile indep --glm --covar indep.eigenvec --covar-col-nums 3 --pheno blup.phe --out indep-1pca --chr-set 26 
# QQ plot and Manhattan plot
Rscript --no-save ./script/QQ_plot.R [filepath] [filename]
Rscript --no-save ./script/Manhattan.R [filepath] [filename]

QQ_plot.R

# for plink1.9
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".QQ.tiff")
tiff(filename = pic_file, width = 1500, height = 1500, 
     units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
z = qnorm(assoc$P/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
qq(assoc$P, main = file, sub=paste("lamda=",lambda),cex=0.2)

# for plink2.0
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".QQ.tiff")
tiff(filename = pic_file, width = 1500, height = 1500, 
     units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
z = qnorm(assoc[,4]/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
qq(assoc[,4], main = file, sub=paste("lamda=",lambda),cex=0.2)

Manhattan.R

# for plink1.9
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".Mht.tiff")
tiff(filename = pic_file, width = 3000, height = 1500, 
     units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=TRUE, comment.char = "")
#assoc<-assoc[-which(is.na(assoc$P)),]
colnames(assoc)<-c("CHR","BP","SNP","P")
manhattan(assoc,chr="CHR",bp="BP",p="P",snp="SNP", main = file, cex=0.2)
dev.off()

# for plink2.0
library(qqman)
arg<-commandArgs(T)
path<-arg[1]
file<-arg[2]
setwd(path)
pic_file <- paste0(file, ".Mht.tiff")
tiff(filename = pic_file, width = 3000, height = 1500, 
     units = "px", res = 300, compression="lzw")
assoc <- read.table(file, head=FALSE, comment.char = "")
colnames(assoc)<-c("SNP","CHR","BP","P")
manhattan(assoc,chr="CHR",bp="BP",p="P",snp="SNP", main = file, cex=0.2)

5. Association(P+K model)

  • 需要借助TASSEL计算亲缘关系矩阵,并放入GWAS模型中;
  • 绘制QQ图和曼哈顿图;
# indep.map indep.ped
plink --bfile indep --recode --out indep --chr-set 26
# get kinship matrix(indep.kinship.txt)
run_pipeline.pl -Xmx24g -plink -ped indep.ped -map indep.map -KinshipPlugin -method Centered_IBS -endPlugin -ExportPlugin -saveAs indep.kinship -format SqrMatrix

# run GWAS P+K model
sh gwasPK.sh indep BN

gwasPK.sh

#!bin/bash
############# Parameter ################
pca_num=5
phe_num=9
trt=(BN BW LP LY FL FU MC FS FE)
file=$1
phe_name=$2
pca_file_o=${file}.eigenvec
pca_file=${file}.pca${pca_num}.txt
phe_file_o=${file}.phe
phe_file=${file}.phenotype.txt
########## input covariate file ###########
# pca_file
echo -ne "<Covariate>\n<Trait>" >> ${pca_file}
for id in $(seq 1 $pca_num)
do
  echo -ne "\tPC$id" >> ${pca_file}
done
echo -ne "\n" >> ${pca_file}
cut -d " " -f2-$(($pca_num+2)) ${pca_file_o} >> ${pca_file}
echo "tassel covariate file made!"
########## input phenotype file ###########
# phe_file
echo -ne "<Phenotype>\n" > ${phe_file}
echo -ne "taxa\t" >> ${phe_file}
for id in `seq $phe_num`
do
  echo -ne "data\t" >> ${phe_file}
done
echo -ne "\n" >> ${phe_file}
cut --complement -d " " -f2 ${phe_file_o} >> ${phe_file}
sed -i 's/FID/Taxa/g' ${phe_file}
echo "tassel phenotype file made!"
############# run GWAS model ##################
# "P+K" model
run_pipeline.pl -Xmx24g -fork1 -plink -ped ${file}.ped -map ${file}.map -fork2 -r ${file}.phenotype.txt -fork3 -k ${file}.kinship.txt -fork4 -q $pca_file -combine5 -input1 -input2 -input3 -input4 -intersect -FixedEffectLMPlugin -endPlugin -export ${file}_p${pca_num}k_glm

# QQ_plot
for i in `seq ${phe_num}`
do
{
  awk '{if($1=="'${phe_name}'")print $2,$3,$4,$6}' ${file}_p${pca_num}k_glm1.txt > ${file}_p${pca_num}k_glm.${phe_name}.txt
  Rscript --no-save QQ_plot.R ./ ${file}_p${pca_num}k_glm.${phe_name}.txt
  Rscript --no-save Manhattan.R ./ ${file}_p${pca_num}k_glm.${phe_name}.txt
} &
done
wait

rm $pca_file
rm ${file}_p${pca_num}k_glm.${phe_name}.txt

6. GMDR

  • GPU服务器运行QTXNetwork中GMDR模块;
  • 由于GMDR要求基因型文件和表型文件的个体数一致,因此在keepid.txt中去掉缺少表型的个体;
  • 由于GMDR对基因型文件与plink不同,利用脚本plk2gmdr.sh进行格式转化得到GMDR基因型文件,此外对表型文件也根据要求做简单调整即可;

plk2gmdr.sh

#!/bin/bash
#plink -> gmdr
file=$1
plink --bfile $file --keep keepid.txt --recode A --out $file --chr-set 26
cut --complement -d " " -f2-6 $file.raw | sed -i 's/FID/#Ind/g' $file.gen

plk2gmdr_bychr.sh

#!/bin/bash
#plink -> gmdr
file=$1
for i in `seq 26`
do
{
  plink --silent --bfile $file --chr $i --keep keepid.txt --recode A --out $file.$i --chr-set 26
  cut --complement -d " " -f2-6 $file.$i.raw > $file.$i.gen
  sed -i 's/FID/#Ind/g' $file.$i.gen
}&
done
wait
rm $file.*.nosex
rm $file.*.log
rm $file.*.raw

7. QTS

gmdr2qts.sh

#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
line=(`cut -f1 $file.out | grep Trait -n | cut -d ":" -f1`)
line[trt_num]=`awk 'END{print NR}' $file.out`
for i in `seq $trt_num`
do
{
    #seperate into traits
    awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.out > $file.${trt[i-1]}.out
    #substract SNP ids
    awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }'  $file.${trt[i-1]}.out | sort -u > $file.0.${trt[i-1]}.list
    sed -i 's/_[ATCG]//g' $file.0.${trt[i-1]}.list
    #recode with GMDR significant SNP only
    plink --silent --bfile ../indep --extract $file.0.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
    #recode 0/1/2 to A/H/B
    cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
    sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
rm $file.*.out
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
  n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
  m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
  n=$[$n-1]
  m=$[$m-1]
  echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
  echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
  echo -e "_Observations\t${n}" >> $file.${trt[i-1]}.phe
  echo -e "_Environments\tno" >> $file.${trt[i-1]}.phe
  echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
  echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
  echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
  echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
  echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
  echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
  echo -e "Geno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
  awk -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print $1,$a,";"}' $file.phe >> $file.${trt[i-1]}.phe
  echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done

gmdr2qts_bychr.sh

  • 注意:还需要包括不分染色体时的结果(\(file.0.\){trt[i-1]}.list)
#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
for c in `seq 26`
do 
{
  line=(`cut -f1 $file.$c.out | grep Trait -n | cut -d ":" -f1`)
  line[trt_num]=`awk 'END{print NR}' $file.$c.out`
  for i in `seq $trt_num`
  do
  {
    #seperate into traits
    awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.$c.out > $file.$c.${trt[i-1]}.out
    #substract SNP ids
    awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }'  $file.$c.${trt[i-1]}.out | sort -u > $file.$c.${trt[i-1]}.list
    sed -i 's/_[ATCG]//g' $file.$c.${trt[i-1]}.list
  }
  done
} &
done
wait
echo "list files produced!"
for i in `seq $trt_num`
do
{
  #combine all lists(26+1) by traits
  cat `ls $file.*.${trt[i-1]}.list` | sort -u > $file.${trt[i-1]}.list
  #recode with GMDR significant SNP only
  plink --silent --bfile ../indep --extract $file.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
  #recode 0/1/2 to A/H/B
  cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
  sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
echo "QTS (.gen) files prepared!"

rm $file.*.*.out
rm $file.*.list
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
  n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
  m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
  n=$[$n-1]
  m=$[$m-1]
  echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
  echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
  echo -e "_Observations\t${n}" >> $file.${trt[i-1]}.phe
  echo -e "_Environments\tno" >> $file.${trt[i-1]}.phe
  echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
  echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
  echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
  echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
  echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
  echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
  echo -e "Geno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
  
  awk -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print $1,$a,";"}' $file.phe >> $file.${trt[i-1]}.phe
  
  echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done
wait
echo "QTS (.phe) files prepared!"

gmdr2qts_byenv.sh

#!/bin/bash
file=$1
trt=(BN BW LP LY FL FU MC FS FE)
trt_num=${#trt[@]}
#--------------------------------
#gmdr.out -> qts.gen
for e in `seq 7`
do 
{
  line=(`cut -f1 $file.e$e.out | grep Trait -n | cut -d ":" -f1`)
  line[trt_num]=`awk 'END{print NR}' $file.e$e.out`
  for i in `seq $trt_num`
  do
  {
    #seperate into traits
    awk -va=${line[i-1]} -vb=${line[i]} '(NR>=a&&NR<b){print $0}' $file.e$e.out > $file.e$e.${trt[i-1]}.out
    #substract SNP ids
    awk '{for(i=1;i<=NF;i++){ if($i ~ /._./){print $i} } }'  $file.e$e.${trt[i-1]}.out | sort -u > $file.e$e.${trt[i-1]}.list
    sed -i 's/_[ATCG]//g' $file.e$e.${trt[i-1]}.list
  }
  done
} &
done
wait
echo "list files produced!"
for i in `seq $trt_num`
do
{
  #combine all environments by traits
  cat `ls $file.*.${trt[i-1]}.list` | sort -u > $file.${trt[i-1]}.list
  #recode with GMDR significant SNP only
  plink --silent --bfile ../indep --extract $file.${trt[i-1]}.list --keep ../keepid.txt --recode A --out $file.${trt[i-1]} --chr-set 26
  #recode 0/1/2 to A/H/B
  cut --complement -d " " -f2-6 $file.${trt[i-1]}.raw | awk 'BEGIN{OFS="\t"}(NR==1){print $0}(NR>1){for(i=2;i<=NF;i++) {gsub("0","A",$i);gsub("2","B",$i);gsub("1","H",$i);gsub("NA",".",$i);} print $0}' > $file.${trt[i-1]}.gen
  sed -i 's/FID/#Ind/g' $file.${trt[i-1]}.gen
} &
done
wait
echo "QTS (.gen) files prepared!"

rm $file.*.*.out
rm $file.*.*.list
rm $file.*.raw
rm *nosex
rm *log
#---------------------------------
#plink.phe & gmdr.out -> gts.phe
for i in `seq $trt_num`
do
{
  n=`awk 'END{print NR}' $file.${trt[i-1]}.gen`
  m=`awk 'END{print NF}' $file.${trt[i-1]}.gen`
  n=$[$n-1]
  m=$[$m-1]
  echo -e "_Population\tF2" > $file.${trt[i-1]}.phe
  echo -e "_Genotypes\t${n}" >> $file.${trt[i-1]}.phe
  echo -e "_Observations\t$[$n*7]" >> $file.${trt[i-1]}.phe
  echo -e "_Environments\tyes" >> $file.${trt[i-1]}.phe
  echo -e "_Replications\tno" >> $file.${trt[i-1]}.phe
  echo -e "_TraitNumber\t1" >> $file.${trt[i-1]}.phe
  echo -e "_Chromosomes\t1\tChr1" >> $file.${trt[i-1]}.phe
  echo -e "_TotalMarker\t${m}\t${m}" >> $file.${trt[i-1]}.phe
  echo -e "_MarkerCode\tP1=A\tP2=B\tF1=H\tF1P1=C\tF1P2=D\n" >> $file.${trt[i-1]}.phe
  echo -e "*TraitBegin*" >> $file.${trt[i-1]}.phe
  echo -e "Env#\tGeno#\t${trt[i-1]}\t;" >> $file.${trt[i-1]}.phe
  
  for e in `seq 7`
  do
  {
    awk -ve=$e -va=$[$i+1] 'BEGIN{OFS="\t"}(NR>1){print e,$1,$a,";"}' $file.e$e.phe >> $file.${trt[i-1]}.phe
  }
  done
  
  echo -e "*TraitEnd*" >> $file.${trt[i-1]}.phe
} &
done
wait
echo "QTS (.phe) files prepared!"