Linkage VS Association analysis

数据

表型数据

来源于两个部分：

常规表型：来自版本号为UKB40268 (获取时间应该是2020年，比较老)的81个重要的数量性状，与齐国安计算遗传力的性状保持一致

phe_info<-read_xlsx("/disk195/zz/HE_GE/VC.RHE.202406.split2.xlsx")
head(unique(phe_info[,1:2]))

## # A tibble: 6 × 2
##   Field_ID Field                              
##   <chr>    <chr>                              
## 1 21021    Pulse wave Arterial Stiffness index
## 2 23099    Body fat percentage                
## 3 23102    Whole body water mass              
## 4 23106    Impedance of whole body            
## 5 23107    Impedance of leg (right)           
## 6 23108    Impedance of leg (left)

代谢表型：来自版本号为UKB674363 (获取时间应该是2023年，比较新)的251个代谢性状，与钱雨做代谢UKC的性状保持一致

met_info<-read.table("/disk195/zz/HE_GE/miss_number_metablism.txt",h=T,sep="\t")
head(met_info[,2,drop=FALSE])

##                  V2
## 1 3-Hydroxybutyrate
## 2           Acetate
## 3      Acetoacetate
## 4           Acetone
## 5           Alanine
## 6           Albumin

基因型数据

也分为两个部分： + UKB中的Unrelated British样本，个体数为292223，SNP数量为784256

urltd_fam<-read.table("/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/unrelated_british/urltd_British.fam",h=F)
dim(urltd_fam)

## [1] 292223      6

urltd_bim<-fread("/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/unrelated_british/urltd_British.bim",h=F)
dim(urltd_bim)

## [1] 784256      6

UKB中的Sib pair样本，来自家系18281，个体数为36562，位点数为784256

sib_fam<-read.table("/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/ukbSib/British.2fullSib.newFID.fam",h=F)
data.frame(Family=length(unique(sib_fam$V1)),
           Ind=length(unique(sib_fam$V2)))

##   Family   Ind
## 1  18281 36562

sib_bim<-read.table("/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/ukbSib/British.2fullSib.newFID.bim",h=F)
dim(urltd_bim)

## [1] 784256      6

比较奇怪的是Unrelated个体与Sib个体之间存在53个overlap

length(intersect(sib_fam$V2,urltd_fam$V2))

## [1] 53

所有关注性状在unrelated与sib个体中的分布情况

# 常规性状
trt_dir<-"/disk224/zz/HE_GE/ukb_phe_40268"
gen_dir<-"/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/ukbSib"
gen_dir2<-"/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/unrelated_british"

phe_info<-read_xlsx("/disk195/zz/HE_GE/VC.RHE.202406.split2.xlsx")
fam<-read.table(str_glue("{gen_dir}/British.2fullSib.newFID.fam"),h=F)
fam2<-read.table(str_glue("{gen_dir2}/urltd_British.fam"),h=F)
codes<-unique(phe_info$Field_ID)
phe_size<-map_dfr(codes,~{
  phe<-read.table(str_glue("{trt_dir}/{.x}.tab"),h=T,sep="\t")
  res1<-data.frame(TRT=phe[,2][match(fam$V2,phe[,1])],FAM=fam$V1)
  res1<-res1[complete.cases(res1),]
  res2<-data.frame(TRT=phe[,2][match(fam2$V2,phe[,1])])
  
  res<-data.frame(Code=.x,
                   Trait=phe_info$Field[phe_info$Field_ID%in%.x][1],
                   SibSize=nrow(res1),
                   SibFamilySize=(sum(table(res1$FAM)>1)),
                   UnrelatedSize=sum(!is.na(res2$TRT))
                   )
  return(res)
})

write.table(phe_size,file="/disk224/zz/HE_GE/phe_size.txt",row.names=F,quote=F,sep="\t")

# 代谢性状
trt_dir<-"/disk224/zz/HE_GE/ukb_metab_674363"
phe_info<-read.table("/disk195/zz/HE_GE/miss_number_metablism.txt",sep="\t",
                     h=T)
codes<-unique(phe_info$V1)
phe_size2<-map_dfr(codes,~{
  phe<-read.table(str_glue("{trt_dir}/{.x}.tab"),h=T,sep="\t")
  res1<-data.frame(TRT=phe[,2][match(fam$V2,phe[,1])],FAM=fam$V1)
  res1<-res1[complete.cases(res1),]
  res2<-data.frame(TRT=phe[,2][match(fam2$V2,phe[,1])])
  
  res<-data.frame(Code=.x,
                   Trait=phe_info$V2[phe_info$V1%in%.x][1],
                   SibSize=nrow(res1),
                   SibFamilySize=(sum(table(res1$FAM)>1)),
                   UnrelatedSize=sum(!is.na(res2$TRT))
                   )
  return(res)
})

write.table(phe_size2,file="/disk224/zz/HE_GE/phe_size2.txt",row.names=F,quote=F,sep="\t")

具体的两类性状的检测量如下

phe_size<-read.table("/disk224/zz/HE_GE/phe_size.txt",h=T,sep="\t")
phe_size2<-read.table("/disk224/zz/HE_GE/phe_size2.txt",h=T,sep="\t")
phe_size<-rbind(phe_size,phe_size2)
head(phe_size)

##    Code                               Trait SibSize SibFamilySize UnrelatedSize
## 1 21021 Pulse wave Arterial Stiffness index   10914          3660         96830
## 2 23099                 Body fat percentage   35979         17706        286924
## 3 23102               Whole body water mass   36002         17729        287102
## 4 23106             Impedance of whole body   35997         17725        287075
## 5 23107            Impedance of leg (right)   35999         17726        287091
## 6 23108             Impedance of leg (left)   35999         17726        287087

分析内容

基于Sib数据的基因型计算位点层面的IBS

程序为/disk195/zz/HE_GE/01_snp-wise_ibs.R，对于某一个位点\(k\)，同一家系的两个个体\(i\)与个体\(j\)之间的IBS计算公式为 \[ \text{IBS}_{ijk} = \frac{(x_{ik}-2p_i)(x_{jk}-2p_i)}{2p_i(1-p_i)} \tag{1} \] 根据上式，首先根据Sib群体的基因型计算每个位点的等位基因频率，然后针对每个家系分别计算Sib之间的所有位点的IBS，核心代码如下：

# 先用plink的freqx命令计算基因型频率，之后用公式p=A+1/2H计算等位基因频率
cmd<-str_glue("{plink} --bfile {gen_dir}/British.2fullSib.newFID --freqx --out {out_dir}/sib")
system(cmd)
frq<-fread(str_glue("{out_dir}/sib.frqx"))
p<-frq$`C(HOM A2)`/(frq$`C(HOM A1)`+frq$`C(HET)`+frq$`C(HOM A2)`) +
  0.5*(frq$`C(HET)`/(frq$`C(HOM A1)`+frq$`C(HET)`+frq$`C(HOM A2)`))

# 之后拆分每个家系，并用下面的命令计算计算位点层面的IBS
worker<-function(famid,out_dir,plink2,gen_dir){
  #famid<-1
  out_dir2<-str_glue("{out_dir}/ibs/fam{famid}")
  dir.create(out_dir2,showWarnings = F,recursive = T)
  write.table(famid,file=str_glue("{out_dir2}/ind"),
              row.names=F,col.names = F,quote=F)
  cmd<-str_glue("{plink2} --bfile {gen_dir}/British.2fullSib.newFID --keep-fam {out_dir2}/ind --export vcf 'vcf-dosage=DS-only' --out {out_dir2}/geno")
  system(cmd)
  geno<-fread(str_glue("{out_dir2}/geno.vcf"))
  a<-2-as.numeric(geno[[names(geno)[10]]])
  b<-2-as.numeric(geno[[names(geno)[11]]])
  pq<-2*p*(1-p)
  ibs<-data.frame(IBS=(a-2*p)*(b-2*p)/pq)
  fwrite(ibs,file=str_glue("{out_dir2}/ibs"),row.names=F,
         col.names=F,quote=F,sep="\t",na="NA")
  
}

当然，对每对sib-pair的所有位点的IBS取均值就可以得到两个个体之间的平均IBS，整个sib群体的平均IBS分布如下

knitr::opts_chunk$set(dpi=300)
ibs_mean<-read.table("/disk224/zz/HE_GE/ukb_sib_geno/ibs_mean.txt",h=T)

ggplot(ibs_mean, aes(x=IBS))+
  geom_density(color="darkblue", fill="lightblue")+
  geom_vline(aes(xintercept=mean(IBS)),
             color="blue", linetype="dashed", linewidth=1)+
  theme_classic() +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        #legend.title = element_blank(),
        plot.background = element_rect(fill ='white', color ='white'),
        # axis.text.x = element_blank(),
        axis.text.y = element_text(size=25,color="black"),
        axis.text.x = element_text(size=25,color="black"),
        text = element_text(size=25,color="black"),
        legend.position = "bottom"
  )

计算IBS的方差，并取倒数

1/var(ibs_mean$IBS)

## [1] 585.609

用RHE的方法计算unrelated British群体在81个传统性状的遗传力

分为2步： 1. 对于无关个体，我们使用RHE方法计算其在81个传统性状上的遗传力，注意首先剔除掉这些个体中与sib群体存在交集的部分（n=53）。之后，写出其协变量文件，考虑的协变量包括性别+前三个主成分。 2. 运算程序为/disk195/zz/HE_GE/11_he_rand_heritability_ulrd_run.pl，该程序首先调用plink，对于每一个性状，分别去除掉表型缺失的个体以及等位基因频率小于0.01或者缺失率大于0.05的位点。再调用gear软件，利用RHE算法计算遗传力，参数为--rand-he-max-it 60 --rand-he-b0 10 --rand-he-b1 10 --rand-he-max-it 60 --rand-he-b0 10 --rand-he-b1 10

基于sib的IBS，利用HE方法计算81个传统性状的遗传力

对于Sib的遗传力，首先利用Sib的表型信息校正掉三个主成分以及性别的固定效应，并对残差进行标准化，然后分别获取Sib-pair的性状差值的平方以及Sib-pair表型值的乘积两类表型，具体代码如下：

# 读入主成分、性别、家系等信息
gen_dir<-"/disk201/zzy/1_GenomicSelection/2-Phenotype/4.UKB/ukb/british/ukbSib"
fam<-read.table(str_glue("{gen_dir}/British.2fullSib.newFID.fam"),h=F)
pc_dir<-"/disk224/zz/HE_GE/reml_herit"
pcs<-read.table(str_glue("{pc_dir}/sib.eigenvec"),h=F)
dat<-cbind(fam[,1:5],pcs[,3:5])
colnames(dat)<-c("FAMID","ID","SIRE","DAM","SEX","PC1",
                 "PC2","PC3")
dat$SEX<-as.factor(dat$SEX)

# 利用LM校正掉固定效应的影响，然后进行标准化
y<-map_dfc(1:ncol(phe),~{
  dat$TRT<-phe[,.x]
  dat2<-dat[complete.cases(dat),]
  if(length(unique(dat2$SEX))<2){
    mod<-lm(TRT~PC1+PC2+PC3-1,data=dat)
  }else{
    mod<-lm(TRT~SEX+PC1+PC2+PC3-1,data=dat)
  }
  resid<-residuals(mod)
  resid<-resid[match(1:nrow(dat), names(resid))]
  res<-data.frame(PHE=scale(resid))
  colnames(res)<-colnames(phe)[.x]
  return(res)
})
y<-cbind(dat[,1:2],y)

# 按照家系计算表性差方与表型乘积
y2<-y %>%
  group_split(FAMID)

ysq<-map_dfr(y2,~{
  tmp<-.x[,3:ncol(.x)]
  res<-data.frame(FAMID=.x[1,1])
  ysq<-(tmp[1,]-tmp[2,])^2
  res<-cbind(res,ysq)
  return(res)
})

yprod<-map_dfr(y2,~{
  tmp<-.x[,3:ncol(.x)]
  res<-data.frame(FAMID=.x[1,1])
  ysq<-tmp[1,]*tmp[2,]
  res<-cbind(res,ysq)
  return(res)
})

# 输出到计算机
ibs_dir<-"/disk224/zz/HE_GE/ukb_sib_geno"
ibs_mean<-read.table(str_glue("{ibs_dir}/ibs_mean.txt"),h=T)

ysq<-ysq[match(ibs_mean$FAM,ysq$FAMID),]
yprod<-yprod[match(ibs_mean$FAM,yprod$FAMID),]
write.table(ysq,file=str_glue("{out_dir}/ysq.txt"),row.names = F,
            col.names = F,quote = F,sep = "\t")
write.table(yprod,file=str_glue("{out_dir}/yprod.txt"),row.names = F,
            col.names = F,quote = F,sep = "\t")

之后使用线性模型计算遗传力，代码为：

knitr::opts_chunk$set(dpi=300)

# calculate heritability
ibs_dir<-"/disk224/zz/HE_GE/ukb_sib_geno"
ibs_mean<-read.table(str_glue("{ibs_dir}/ibs_mean.txt"),h=T)

ysq<-read.table("/disk224/zz/HE_GE/he_herit/ysq.txt",h=T,sep="\t",check.names = F)
yprod<-read.table("/disk224/zz/HE_GE/he_herit/yprod.txt",h=T,sep="\t",check.names = F)
h2_ysq<-map_dfr(2:ncol(ysq),function(i){
  mod<-lm(ysq[,i]~ibs_mean$IBS)
  res<-summary(mod)$coefficients
  res<-data.frame(TRT=colnames(ysq)[i],
                  h2=-res[2,1]/2,se=res[2,2],pval=res[2,4])
})

h2_prod<-map_dfr(2:ncol(yprod),function(i){
  mod<-lm(yprod[,i]~ibs_mean$IBS)
  res<-summary(mod)$coefficients
  res<-data.frame(TRT=colnames(yprod)[i],
                  h2=res[2,1],se=res[2,2],pval=res[2,4])
})

再读取利用RHE计算的无关个体的遗传力，然后利用散点图进行对比

knitr::opts_chunk$set(dpi=300)
phe_info<-read_xlsx("/disk195/zz/HE_GE/VC.RHE.202406.split2.xlsx")
codes<-unique(phe_info$Field_ID)
# h2_rand_dir<-"/disk224/zz/HE_GE/he_herit_rand"
# h2_rand<-map_dfr(codes,~{
#   h2<-read.table(str_glue("{h2_rand_dir}/{.x}/rand.heReg"),skip = 1,h=F,sep="\t",fill=T)
#   return(data.frame(ID=.x,
#                     h2=as.numeric(str_match(h2[2,2],"\\) (.+)")[,2])
#                     ))
# })

h2_rand<-phe_info[phe_info$B==50,c("Field_ID","h2")]
colnames(h2_rand)<-c("ID","h2")
h2<-data.frame(TRT=h2_prod$TRT,
               h2_prod=h2_prod$h2,
               h2_ysq=h2_ysq$h2,
               h2_rand=as.numeric(h2_rand$h2[match(h2_prod$TRT,h2_rand$ID)]))
h2$h2_prod<-as.numeric(h2$h2_prod)
h2$h2_ysq<-as.numeric(h2$h2_ysq)
h2$h2_rand<-as.numeric(h2$h2_rand)

# 有一个性状3872有点问题

# 画图
h2<-h2[complete.cases(h2),]
h2$Name<-phe_size$Trait[match(h2$TRT,phe_size$Code)]
h2$Name[!h2$TRT%in%c(50,2976,21001)]<-"Other"
h2$Name[h2$TRT%in%c(50)]<-"Height"
h2$Name[h2$TRT%in%c(2976)]<-"AgeDiabetes"
h2$Name[h2$TRT%in%c(21001)]<-"BMI"

p1<-ggplot(h2, aes(x=h2_rand, y=h2_ysq, color=Name)) + 
  geom_point(size=2)+
  geom_abline(slope=1, intercept=0)+
  theme_minimal()+xlab("Unrelated heritability")+
  ylab("Sibs heritability (square)")+
  gghighlight(Name!="Other")

## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...

## Warning: Could not calculate the predicate for layer 2; ignored

## label_key: Name

p2<-ggplot(h2, aes(x=h2_rand, y=h2_prod, color=Name)) + 
  geom_point(size=2)+
  geom_abline(slope=1, intercept=0)+xlab("Unrelated heritability")+
  ylab("Sibs heritability (production)")+
  theme_minimal()+
  gghighlight(Name!="Other")

## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
## Could not calculate the predicate for layer 2; ignored

## label_key: Name

ggarrange(p1,p2,nrow=1)

Corss-Production VS Square

h2<-read.table("/disk224/zz/HE_GE/he_herit/h2.txt",h=F)
colnames(h2)<-c("TRT","h2_prod","h2_ysq","h2_rand")
h2$Name<-phe_size$Trait[match(h2$TRT,phe_size$Code)]
h2$Name[!h2$TRT%in%c(50,2976,21001)]<-"Other"
h2$Name[h2$TRT%in%c(50)]<-"Height"
h2$Name[h2$TRT%in%c(2976)]<-"AgeDiabetes"
h2$Name[h2$TRT%in%c(21001)]<-"BMI"

# 画图
ggplot(h2, aes(x=h2_prod, y=h2_ysq,color=Name)) + 
  geom_point(size=2)+
  geom_abline(slope=1, intercept=0)+
  theme_minimal()+xlab("Cross-Production")+
  ylab("Square")+
  gghighlight(Name!="Other")

## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...

## Warning: Could not calculate the predicate for layer 2; ignored

## label_key: Name

从每个家系中提取出一个个体，用RHE的方法计算81个传统性状的遗传力

从每个家系中随机选择一个个体，利用RHE计算遗传力，参数与前面保持一致，继续与利用IBS的Sib遗传力进行对比

knitr::opts_chunk$set(dpi=300)
h2_singleSib_rand_dir<-"/disk224/zz/HE_GE/he_herit_rand_singleSib"
# h2_rand_singleSib<-map_dfr(codes,~{
#   h2<-read.table(str_glue("{h2_singleSib_rand_dir}/{.x}/rand.heReg"),skip = 1,h=F,sep="\t",fill=T)
#   return(data.frame(ID=.x,
#                     h2=as.numeric(str_match(h2[2,2],"\\) (.+)")[,2])
#   ))
# })
h2_rand_singleSib<-map_dfr(codes,~{
  h2<-read.table(str_glue("{h2_singleSib_rand_dir}/{.x}/gcta.he.HEreg"),skip = 6,h=T,fill=T)
  return(data.frame(ID=.x,
                    h2=h2[2,2]))
})

h2<-data.frame(TRT=h2_prod$TRT,
               h2_prod=h2_prod$h2,
               h2_ysq=h2_ysq$h2,
               h2_rand=as.numeric(h2_rand_singleSib$h2[match(h2_prod$TRT,h2_rand_singleSib$ID)]))
h2$h2_prod<-as.numeric(h2$h2_prod)
h2$h2_ysq<-as.numeric(h2$h2_ysq)
h2$h2_rand<-as.numeric(h2$h2_rand)
h2$Name<-phe_size$Trait[match(h2$TRT,phe_size$Code)]
h2$Name[!h2$TRT%in%c(50,2976,21001)]<-"Other"
h2$Name[h2$TRT%in%c(50)]<-"Height"
h2$Name[h2$TRT%in%c(2976)]<-"AgeDiabetes"
h2$Name[h2$TRT%in%c(21001)]<-"BMI"

# 画图
h2<-h2[complete.cases(h2),]
p1<-ggplot(h2, aes(x=h2_rand, y=h2_ysq,color=Name)) + 
  geom_point()+
  geom_abline(slope=1, intercept=0)+
  theme_minimal()+xlab("SingleSib heritability")+
  ylab("Sibs heritability (square)")+
  gghighlight(Name!="Other")

## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...

## Warning: Could not calculate the predicate for layer 2; ignored

## label_key: Name

p2<-ggplot(h2, aes(x=h2_rand, y=h2_prod,color=Name)) + 
  geom_point()+
  geom_abline(slope=1, intercept=0)+xlab("SingleSib heritability")+
  ylab("Sibs heritability (production)")+
  theme_minimal()+
  gghighlight(Name!="Other")

## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
## Could not calculate the predicate for layer 2; ignored

## label_key: Name

ggarrange(p1,p2,nrow=1)

用RHE的方法计算unrelated British群体在251个代谢性状的遗传力

分为2步： 1. 对于无关个体，我们使用RHE方法计算其在2511个传统性状上的遗传力，注意首先剔除掉这些个体中与sib群体存在交集的部分（n=53）。之后，写出其协变量文件，考虑的协变量包括性别+前三个主成分。 2. 运算程序为/disk195/zz/HE_GE/11_he_rand_heritability_ulrd_metab_run.pl，该程序首先调用plink，对于每一个性状，分别去除掉表型缺失的个体以及等位基因频率小于0.01或者缺失率大于0.05的位点。再调用gear软件，利用RHE算法计算遗传力，参数为--rand-he-max-it 60 --rand-he-b0 10 --rand-he-b1 10 --rand-he-max-it 60 --rand-he-b0 10 --rand-he-b1 10

基于sib的IBS，利用HE方法计算251个代谢性状的遗传力

代码在/disk195/zz/HE_GE/03_heritability_metab.R，模式与上面常规性状的处理较为类似，重点看作图部分

knitr::opts_chunk$set(dpi=300)
h2<-read.table("/disk224/zz/HE_GE/he_herit_metab/h2.txt",h=F)
colnames(h2)<-c("TRT","h2_prod","h2_ysq","h2_rand")

# 画图
p1<-ggplot(h2, aes(x=h2_rand, y=h2_ysq)) + 
  geom_point()+
  geom_abline(slope=1, intercept=0)+
  theme_minimal()+xlab("Unrelated heritability")+
  ylab("Sibs heritability (square)")
  

p2<-ggplot(h2, aes(x=h2_rand, y=h2_prod)) + 
  geom_point()+
  geom_abline(slope=1, intercept=0)+xlab("Unrelated heritability")+
  ylab("Sibs heritability (production)")+
  theme_minimal()

ggarrange(p1,p2,nrow=1)