The aim of this paper is to test the hypothesis that SNPs with more trans-racially (Black-White) homogeneous effect (GWAS Beta) on Education have higher Black-White gap in frequencies. The null hypothesis of no racial differences in IQ predicts no interaction between SNP effect homogeneity and racial gap. An interaction - manifested as higher allele frequency difference for more homogeneous SNPs - supports the racial differences hypothesis. Compute frequencies for SNPs and subset by joint_p.High joint_p indicates weak evidence for trans-racial reduction in predictive validity (that is, more homogeneous effects).

Run on EA_MTAG GWAS significant SNPs

setwd("~/genetics_2018")
Piffer_validity_by_SIRE <- read.delim("C:/Users/Admin/Desktop/Piffer_validity_by_SIRE.tsv")
EA_MTAG_1kg=read.csv("MTAG_EA_snps_wide_chrpos.csv")
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
EA_MTAG_1kg$chrpos=str_c(EA_MTAG_1kg$CHR,EA_MTAG_1kg$POS, sep = ":")
#merge
merged_df=merge(Piffer_validity_by_SIRE,EA_MTAG_1kg,by.x = "snp",by.y = "chrpos")#merge GWAS and 1kg datasets
#subset by joint_p
quantile(merged_df$joint_p, probs = c(0.1,0.25,0.5,0.75,0.9), na.rm=TRUE)
##        10%        25%        50%        75%        90% 
## 0.05758579 0.19248772 0.45318204 0.71862021 0.88117050
High_joint_p=merged_df[which(merged_df$joint_p > 0.71862021 ),]#use 0.25 and 0.75 quantiles
Low_joint_p=merged_df[which(merged_df$joint_p <0.19248772),]

There are 785 SNPs for each of the two quantile groups. Compute PGS for 1000 Genomes populations for low and high joint_p SNPs and check if higher joint_p SNPs have more predictive validity than lower joint_p SNPs

PGS_high=apply(High_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
PGS_low=apply(Low_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
#compute CEU-YRI differences
delta_low=PGS_low[5]-PGS_low[26]
delta_high=PGS_high[5]-PGS_high[26]
delta_high
##        CEU 
## 0.03620226
delta_low
##        CEU 
## 0.01946321
high_lowdiff=delta_high-delta_low
high_lowdiff
##        CEU 
## 0.01673905

Run Monte Carlo simulation. Draw 1000 subsets at random, each comprising N SNPs, where N is the number of SNPs belonging to the high joint_p and low joint_p subsets. In this case, N=785. Then PGS are computed for each of the 1000 subsets for YRI and CEU. Difference between CEU and YRI for each PGS is computed. Then I compute mean and sd of the difference.

merged_df_1=merged_df[18:43]#keep only freqs. 
randomSample = function(n, merged_df_1) { 

  return(merged_df_1[sample(nrow(merged_df_1), 785),] )
}

n=1000
random_snps=lapply(rep(1,n), function(x)randomSample(x,merged_df_1))
#compute CEU and YRI PGS for each subset
my_names_1 = c("CEU")#take only CEU and YRI columns
result_CEU = lapply(random_snps, "[", , my_names_1)#create list with only CEU and YRI
my_names_2 = c("YRI")#take only CEU and YRI columns
result_YRI=lapply(random_snps, "[", , my_names_2)
#Compute PGS for each element of each list
PGS_CEU=sapply(result_CEU,mean)
PGS_YRI=sapply(result_YRI,mean)
#compute difference between two PGS
CEU_YRI_diff=PGS_CEU - PGS_YRI
#compute mean,sd of the difference
mean(CEU_YRI_diff)
## [1] 0.02436715
sd(CEU_YRI_diff)
## [1] 0.007235846
#CEU-YRI difference for high and low joint_p SNPs
delta_high
##        CEU 
## 0.03620226
delta_low
##        CEU 
## 0.01946321

Histogram of CEU_YRI PGS difference at 1000 random trials

Compute Z score of the difference between high and low joint_p

Z_score=(delta_high - delta_low)/sd(CEU_YRI_diff)
Z_score
##     CEU 
## 2.31335

The Z score of the difference between high and low joint p value PGS is a bit over 2 (2.26). This corresponds to p=0.024

Do the same for EA_MTAG_10K

setwd("~/genetics_2018")
Piffer_validity_by_SIRE <- read.delim("C:/Users/Admin/Desktop/Piffer_validity_by_SIRE.tsv")
EA_MTAG_1kg=read.csv("EA_MTAG_10K_snps_wide.csv")
library(stringr)
EA_MTAG_1kg$chrpos=str_c(EA_MTAG_1kg$CHR,EA_MTAG_1kg$POS, sep = ":")
#merge
merged_df=merge(Piffer_validity_by_SIRE,EA_MTAG_1kg,by.x = "snp",by.y = "chrpos")#merge GWAS and 1kg datasets
#subset by joint_p
quantile(merged_df$joint_p, probs = c(0.1,0.25,0.5,0.75,0.9), na.rm=TRUE)
##       10%       25%       50%       75%       90% 
## 0.0662076 0.2029190 0.4532834 0.7207942 0.8809646
High_joint_p=merged_df[which(merged_df$joint_p >  0.7207942),]
Low_joint_p=merged_df[which(merged_df$joint_p <0.2029190 ),]


#compute PGS for 1000 Genomes populations for low and high joint_p SNPs and check if higher joint_p SNPs have more predictive validity than lower joint_p SNPs
PGS_high=apply(High_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
PGS_low=apply(Low_joint_p[,c(18:43)],2,mean,na.rm=TRUE)
IQ=c(83,85,71,NA,99,105,105,83.5,71,101,100,NA,62,97,NA,105,99.4,74,64,88,85,84,83.5,79,99,71)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(PGS_low,IQ, type = "pearson")
##      x    y
## x 1.00 0.81
## y 0.81 1.00
## 
## n
##    x  y
## x 26 23
## y 23 23
## 
## P
##   x  y 
## x     0
## y  0
rcorr(PGS_high,IQ, type = "pearson")
##      x    y
## x 1.00 0.85
## y 0.85 1.00
## 
## n
##    x  y
## x 26 23
## y 23 23
## 
## P
##   x  y 
## x     0
## y  0
#compute CEU-YRI differences
delta_low=PGS_low[5]-PGS_low[26]
delta_high=PGS_high[5]-PGS_high[26]
delta_high
##        CEU 
## 0.02628667
delta_low
##        CEU 
## 0.01290239
high_lowdiff=delta_high-delta_low
high_lowdiff
##        CEU 
## 0.01338428
#run Monte Carlo simulation
merged_df_1=merged_df[18:43]#keep only freqs. 
#draw 1000 random SNP subsets
#merged_df_1[sample(nrow(merged_df_1), 785), ]#draw one sample

randomSample = function(n, merged_df_1) { 
  #note that the function doesn't really use n inside it, if you want so, you should #replace 785 for n and use rep(n,n) inside the lapply call
  return(merged_df_1[sample(nrow(merged_df_1), 2076),] )
}

n=1000
random_snps=lapply(rep(1,n), function(x)randomSample(x,merged_df_1))
#compute CEU and YRI PGS for each subset
my_names_1 = c("CEU")#take only CEU and YRI columns
result_CEU = lapply(random_snps, "[", , my_names_1)#create list with only CEU and YRI
my_names_2 = c("YRI")#take only CEU and YRI columns
result_YRI=lapply(random_snps, "[", , my_names_2)
#Compute PGS for each element of each list
PGS_CEU=sapply(result_CEU,mean)
PGS_YRI=sapply(result_YRI,mean)
#compute difference between two PGS
CEU_YRI_diff=PGS_CEU - PGS_YRI
mean(CEU_YRI_diff,na.rm = TRUE)
## [1] 0.01617886
hist(CEU_YRI_diff)

sd=sd(CEU_YRI_diff,na.rm = TRUE)
delta_high
##        CEU 
## 0.02628667
delta_low
##        CEU 
## 0.01290239
Z_score=(delta_high - delta_low)/sd
Z_score
##      CEU 
## 3.021231
2*pnorm(-abs(Z_score))
##         CEU 
## 0.002517492