The aim of this paper is to test the hypothesis that SNPs with more trans-racially (Black-White) homogeneous effect (GWAS Beta) on Education have higher Black-White gap in frequencies. The null hypothesis of no racial differences in IQ predicts no interaction between SNP effect homogeneity and racial gap. An interaction - manifested as higher allele frequency difference for more homogeneous SNPs - supports the racial differences hypothesis. Compute frequencies for SNPs and subset by joint_p.High joint_p indicates weak evidence for trans-racial reduction in predictive validity (that is, more homogeneous effects).
Run on EA_MTAG GWAS significant SNPs
setwd("~/genetics_2018")
Piffer_validity_by_SIRE <- read.delim("C:/Users/Admin/Desktop/Piffer_validity_by_SIRE.tsv")
EA_MTAG_1kg=read.csv("MTAG_EA_snps_wide_chrpos.csv")
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
EA_MTAG_1kg$chrpos=str_c(EA_MTAG_1kg$CHR,EA_MTAG_1kg$POS, sep = ":")
#merge
merged_df=merge(Piffer_validity_by_SIRE,EA_MTAG_1kg,by.x = "snp",by.y = "chrpos")#merge GWAS and 1kg datasets
#subset by joint_p
quantile(merged_df$joint_p, probs = c(0.1,0.25,0.5,0.75,0.9), na.rm=TRUE)
## 10% 25% 50% 75% 90%
## 0.05758579 0.19248772 0.45318204 0.71862021 0.88117050
High_joint_p=merged_df[which(merged_df$joint_p > 0.71862021 ),]#use 0.25 and 0.75 quantiles
Low_joint_p=merged_df[which(merged_df$joint_p <0.19248772),]
There are 785 SNPs for each of the two quantile groups. Compute PGS for 1000 Genomes populations for low and high joint_p SNPs and check if higher joint_p SNPs have more predictive validity than lower joint_p SNPs
PGS_high=apply(High_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
PGS_low=apply(Low_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
#compute CEU-YRI differences
delta_low=PGS_low[5]-PGS_low[26]
delta_high=PGS_high[5]-PGS_high[26]
delta_high
## CEU
## 0.03620226
delta_low
## CEU
## 0.01946321
high_lowdiff=delta_high-delta_low
high_lowdiff
## CEU
## 0.01673905
Run Monte Carlo simulation. Draw 1000 subsets at random, each comprising N SNPs, where N is the number of SNPs belonging to the high joint_p and low joint_p subsets. In this case, N=785. Then PGS are computed for each of the 1000 subsets for YRI and CEU. Difference between CEU and YRI for each PGS is computed. Then I compute mean and sd of the difference.
merged_df_1=merged_df[18:43]#keep only freqs.
randomSample = function(n, merged_df_1) {
return(merged_df_1[sample(nrow(merged_df_1), 785),] )
}
n=1000
random_snps=lapply(rep(1,n), function(x)randomSample(x,merged_df_1))
#compute CEU and YRI PGS for each subset
my_names_1 = c("CEU")#take only CEU and YRI columns
result_CEU = lapply(random_snps, "[", , my_names_1)#create list with only CEU and YRI
my_names_2 = c("YRI")#take only CEU and YRI columns
result_YRI=lapply(random_snps, "[", , my_names_2)
#Compute PGS for each element of each list
PGS_CEU=sapply(result_CEU,mean)
PGS_YRI=sapply(result_YRI,mean)
#compute difference between two PGS
CEU_YRI_diff=PGS_CEU - PGS_YRI
#compute mean,sd of the difference
mean(CEU_YRI_diff)
## [1] 0.02436715
sd(CEU_YRI_diff)
## [1] 0.007235846
#CEU-YRI difference for high and low joint_p SNPs
delta_high
## CEU
## 0.03620226
delta_low
## CEU
## 0.01946321
Histogram of CEU_YRI PGS difference at 1000 random trials
Compute Z score of the difference between high and low joint_p
Z_score=(delta_high - delta_low)/sd(CEU_YRI_diff)
Z_score
## CEU
## 2.31335
The Z score of the difference between high and low joint p value PGS is a bit over 2 (2.26). This corresponds to p=0.024
Do the same for EA_MTAG_10K
setwd("~/genetics_2018")
Piffer_validity_by_SIRE <- read.delim("C:/Users/Admin/Desktop/Piffer_validity_by_SIRE.tsv")
EA_MTAG_1kg=read.csv("EA_MTAG_10K_snps_wide.csv")
library(stringr)
EA_MTAG_1kg$chrpos=str_c(EA_MTAG_1kg$CHR,EA_MTAG_1kg$POS, sep = ":")
#merge
merged_df=merge(Piffer_validity_by_SIRE,EA_MTAG_1kg,by.x = "snp",by.y = "chrpos")#merge GWAS and 1kg datasets
#subset by joint_p
quantile(merged_df$joint_p, probs = c(0.1,0.25,0.5,0.75,0.9), na.rm=TRUE)
## 10% 25% 50% 75% 90%
## 0.0662076 0.2029190 0.4532834 0.7207942 0.8809646
High_joint_p=merged_df[which(merged_df$joint_p > 0.7207942),]
Low_joint_p=merged_df[which(merged_df$joint_p <0.2029190 ),]
#compute PGS for 1000 Genomes populations for low and high joint_p SNPs and check if higher joint_p SNPs have more predictive validity than lower joint_p SNPs
PGS_high=apply(High_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
PGS_low=apply(Low_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
IQ=c(83,85,71,NA,99,105,105,83.5,71,101,100,NA,62,97,NA,105,99.4,74,64,88,85,84,83.5,79,99,71)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(PGS_low,IQ, type = "pearson")
## x y
## x 1.00 0.81
## y 0.81 1.00
##
## n
## x y
## x 26 23
## y 23 23
##
## P
## x y
## x 0
## y 0
rcorr(PGS_high,IQ, type = "pearson")
## x y
## x 1.00 0.85
## y 0.85 1.00
##
## n
## x y
## x 26 23
## y 23 23
##
## P
## x y
## x 0
## y 0
#compute CEU-YRI differences
delta_low=PGS_low[5]-PGS_low[26]
delta_high=PGS_high[5]-PGS_high[26]
delta_high
## CEU
## 0.02628667
delta_low
## CEU
## 0.01290239
high_lowdiff=delta_high-delta_low
high_lowdiff
## CEU
## 0.01338428
#run Monte Carlo simulation
merged_df_1=merged_df[18:43]#keep only freqs.
#draw 1000 random SNP subsets
#merged_df_1[sample(nrow(merged_df_1), 785), ]#draw one sample
randomSample = function(n, merged_df_1) {
#note that the function doesn't really use n inside it, if you want so, you should #replace 785 for n and use rep(n,n) inside the lapply call
return(merged_df_1[sample(nrow(merged_df_1), 2076),] )
}
n=1000
random_snps=lapply(rep(1,n), function(x)randomSample(x,merged_df_1))
#compute CEU and YRI PGS for each subset
my_names_1 = c("CEU")#take only CEU and YRI columns
result_CEU = lapply(random_snps, "[", , my_names_1)#create list with only CEU and YRI
my_names_2 = c("YRI")#take only CEU and YRI columns
result_YRI=lapply(random_snps, "[", , my_names_2)
#Compute PGS for each element of each list
PGS_CEU=sapply(result_CEU,mean)
PGS_YRI=sapply(result_YRI,mean)
#compute difference between two PGS
CEU_YRI_diff=PGS_CEU - PGS_YRI
mean(CEU_YRI_diff,na.rm = TRUE)
## [1] 0.01617886
hist(CEU_YRI_diff)
sd=sd(CEU_YRI_diff,na.rm = TRUE)
delta_high
## CEU
## 0.02628667
delta_low
## CEU
## 0.01290239
Z_score=(delta_high - delta_low)/sd
Z_score
## CEU
## 3.021231
2*pnorm(-abs(Z_score))
## CEU
## 0.002517492