Download files from: https://osf.io/ewxqj/ HGDP frequencies file: “Okbay_sig_results.csv”. Frequencies of GWAS hits from 52 populations pre-computed using HGDP-CEPH browser (http://spsmart.cesga.es/ceph.php?dataSet=ceph_stanford). GWAS summary file: “Okbay_sig.csv”. Pre-selected GWAS significant hits from the full GWAS summary file by Okbay et al., 2016.
You will be requested to upload them as the code runs. Note that GWAS files were not LD clumped because this would result in too small number of SNPs being found in the HGDP dataset. However, LD clumping should be performed on the output HGDP files. For simplicity, this step is omitted but will be done in a follow-up version.
HGDP_CEPH=read.csv(file.choose(), header=TRUE, sep = ";")#open HGDP-CEPH browser output with freqs from GWAS hits
gwas=read.csv(file.choose(), header=TRUE)#open GWAS summary file
Merge two datasets
HGDP_CEPH_merged=merge(HGDP_CEPH,gwas, by.x = "SNP", by.y = "MarkerName")
Split into 8 groups according to levels of vector “var”: “AC” ,“AG”, “CA”, “CT”, “GA”, “GT” ,“TC”, “TG”
df_AC=subset(HGDP_CEPH_merged, var=="AC")
df_AG=subset(HGDP_CEPH_merged, var=="AG")
df_CA=subset(HGDP_CEPH_merged, var=="CA")
df_CT=subset(HGDP_CEPH_merged, var=="CT")
df_GA=subset(HGDP_CEPH_merged, var=="GA")
df_GT=subset(HGDP_CEPH_merged, var=="GT")
df_TC=subset(HGDP_CEPH_merged, var=="TC")
df_TG=subset(HGDP_CEPH_merged, var=="TG")
Note that some SNPs are on reverse strand so AC is TG on GWAS and viceversa, so this will cause a nonmatch. Select reverse-stranded SNPs and translate into same strand as HGDP-CEPH reference panel. Then, flip frequency of non-matching alleles (when A2 (A1) is (not) HGDP reference allele).
#AC
df_AC_rev_strand_matching=subset(df_AC,A1=="T")
df_AC_rev_strand_nonmatching=subset(df_AC,A1=="G")
#replace "T" with "A" and "C" with "G"
df_AC_rev_strand_matching$A1[df_AC_rev_strand_matching$A1=="T"] <- "A"
df_AC_rev_strand_matching$A2[df_AC_rev_strand_matching$A2=="G"] <- "C"
df_AC_rev_strand_nonmatching$A1[df_AC_rev_strand_nonmatching$A1=="G"] <- "C"
df_AC_rev_strand_nonmatching$A2[df_AC_rev_strand_nonmatching$A2=="T"] <- "A"
## Warning in `[<-.factor`(`*tmp*`, df_AC_rev_strand_nonmatching$A2 == "T", :
## invalid factor level, NA generated
df_AC_rev_strand_nonmatching$freq_A=df_AC_rev_strand_nonmatching$freq_C#flip freq
#AG
df_AG_rev_strand_matching=subset(df_AG,A1=="T")
df_AG_rev_strand_nonmatching=subset(df_AG,A1=="C")
#replace "T" with "A" and "C" with "G"
df_AG_rev_strand_matching$A1[df_AG_rev_strand_matching$A1=="T"] <- "A"
df_AG_rev_strand_matching$A2[df_AG_rev_strand_matching$A2=="C"] <- "G"
df_AG_rev_strand_nonmatching$A1[df_AG_rev_strand_nonmatching$A1=="C"] <- "G"
## Warning in `[<-.factor`(`*tmp*`, df_AG_rev_strand_nonmatching$A1 == "C", :
## invalid factor level, NA generated
df_AG_rev_strand_nonmatching$A2[df_AG_rev_strand_nonmatching$A2=="T"] <- "A"
## Warning in `[<-.factor`(`*tmp*`, df_AG_rev_strand_nonmatching$A2 == "T", :
## invalid factor level, NA generated
df_AG_rev_strand_nonmatching$freq_A=df_AG_rev_strand_nonmatching$freq_G#flip freq
#CA
df_CA_rev_strand_matching=subset(df_CA,A1=="G")
df_CA_rev_strand_nonmatching=subset(df_CA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_CA_rev_strand_matching$A1[df_CA_rev_strand_matching$A1=="G"] <- "C"
df_CA_rev_strand_matching$A2[df_CA_rev_strand_matching$A2=="T"] <- "A"
## Warning in `[<-.factor`(`*tmp*`, df_CA_rev_strand_matching$A2 == "T",
## value = structure(integer(0), .Label = c("C", : invalid factor level, NA
## generated
df_CA_rev_strand_nonmatching$A1[df_CA_rev_strand_nonmatching$A1=="T"] <- "A"
df_CA_rev_strand_nonmatching$A2[df_CA_rev_strand_nonmatching$A2=="G"] <- "C"
df_CA_rev_strand_nonmatching$freq_C=df_CA_rev_strand_nonmatching$freq_A#flip freq
#CT
df_CT_rev_strand_matching=subset(df_CT,A1=="G")
df_CT_rev_strand_nonmatching=subset(df_CT,A1=="A")
#replace "T" with "A" and "C" with "G"
df_CT_rev_strand_matching$A1[df_CT_rev_strand_matching$A1=="G"] <- "C"
df_CT_rev_strand_matching$A2[df_CT_rev_strand_matching$A2=="A"] <- "T"
df_CT_rev_strand_nonmatching$A1[df_CT_rev_strand_nonmatching$A1=="A"] <- "T"
df_CT_rev_strand_nonmatching$A2[df_CT_rev_strand_nonmatching$A2=="G"] <- "C"
df_CT_rev_strand_nonmatching$freq_C=df_CT_rev_strand_nonmatching$freq_T#flip freq
#GA
df_GA_rev_strand_matching=subset(df_GA,A1=="C")
df_GA_rev_strand_nonmatching=subset(df_GA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_GA_rev_strand_matching$A1[df_GA_rev_strand_matching$A1=="C"] <- "G"
## Warning in `[<-.factor`(`*tmp*`, df_GA_rev_strand_matching$A1 == "C",
## value = structure(integer(0), .Label = c("A", : invalid factor level, NA
## generated
df_GA_rev_strand_matching$A2[df_GA_rev_strand_matching$A2=="T"] <- "A"
## Warning in `[<-.factor`(`*tmp*`, df_GA_rev_strand_matching$A2 == "T",
## value = structure(integer(0), .Label = c("C", : invalid factor level, NA
## generated
df_GA_rev_strand_nonmatching$A1[df_GA_rev_strand_nonmatching$A1=="T"] <- "A"
df_GA_rev_strand_nonmatching$A2[df_GA_rev_strand_nonmatching$A2=="C"] <- "G"
df_GA_rev_strand_nonmatching$freq_G=df_GA_rev_strand_nonmatching$freq_A#flip freq
#GT
df_GT_rev_strand_matching=subset(df_GT,A1=="C")
df_GT_rev_strand_nonmatching=subset(df_GT,A1=="A")
#replace "T" with "A" and "C" with "G"and viceversa
df_GT_rev_strand_matching$A1[df_GT_rev_strand_matching$A1=="C"] <- "G"
## Warning in `[<-.factor`(`*tmp*`, df_GT_rev_strand_matching$A1 == "C",
## value = structure(integer(0), .Label = c("A", : invalid factor level, NA
## generated
df_GT_rev_strand_matching$A2[df_GT_rev_strand_matching$A2=="A"] <- "T"
df_GT_rev_strand_nonmatching$A1[df_GT_rev_strand_nonmatching$A1=="A"] <- "T"
df_GT_rev_strand_nonmatching$A2[df_GT_rev_strand_nonmatching$A2=="C"] <- "G"
df_GT_rev_strand_nonmatching$freq_G=df_GT_rev_strand_nonmatching$freq_T#flip freq
#TC
df_TC_rev_strand_matching=subset(df_TC,A1=="A")
df_TC_rev_strand_nonmatching=subset(df_TC,A1=="G")
#replace "T" with "A" and "C" with "G"and viceversa
df_TC_rev_strand_matching$A1[df_TC_rev_strand_matching$A1=="A"] <- "T"
df_TC_rev_strand_matching$A2[df_TC_rev_strand_matching$A2=="G"] <- "C"
df_TC_rev_strand_nonmatching$A1[df_TC_rev_strand_nonmatching$A1=="G"] <- "C"
df_TC_rev_strand_nonmatching$A2[df_TC_rev_strand_nonmatching$A2=="A"] <- "T"
df_TC_rev_strand_nonmatching$freq_T=df_TC_rev_strand_nonmatching$freq_C#flip freq
#TG
df_TG_rev_strand_matching=subset(df_TG,A1=="A")
df_TG_rev_strand_nonmatching=subset(df_TG,A1=="C")
#replace "T" with "A" and "C" with "G"and viceversa
df_TG_rev_strand_matching$A1[df_TG_rev_strand_matching$A1=="A"] <- "T"
df_TG_rev_strand_matching$A2[df_TG_rev_strand_matching$A2=="C"] <- "G"
df_TG_rev_strand_nonmatching$A1[df_TG_rev_strand_nonmatching$A1=="C"] <- "G"
## Warning in `[<-.factor`(`*tmp*`, df_TG_rev_strand_nonmatching$A1 == "C", :
## invalid factor level, NA generated
df_TG_rev_strand_nonmatching$A2[df_TG_rev_strand_nonmatching$A2=="A"] <- "T"
df_TG_rev_strand_nonmatching$freq_T=df_TG_rev_strand_nonmatching$freq_G#flip freq
Create objects based on match or nonmatch between A1 and reference allele and flip frequency of nonmatch alleles
#AC
df_AC_match=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A1)),]
df_AC_nonmatch=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A2)),]
df_AC_nonmatch$freq_A=df_AC_nonmatch$freq_C
#re-merge objects
df_AC=rbind(df_AC_nonmatch,df_AC_match, df_AC_rev_strand_matching,df_AC_rev_strand_nonmatching)
#AG
df_AG_match=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A1)),]
df_AG_nonmatch=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A2)),]
df_AG_nonmatch$freq_A=df_AG_nonmatch$freq_G
#re-merge objects
df_AG=rbind(df_AG_nonmatch,df_AG_match,df_AG_rev_strand_matching,df_AG_rev_strand_nonmatching )
#CA
df_CA_match=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A1)),]
df_CA_nonmatch=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A2)),]
df_CA_nonmatch$freq_C=df_CA_nonmatch$freq_A
#re-merge objects
df_CA=rbind(df_CA_nonmatch,df_CA_match,df_CA_rev_strand_matching,df_CA_rev_strand_nonmatching)
#CT
df_CT_match=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A1)),]
df_CT_nonmatch=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A2)),]
df_CT_nonmatch$freq_C=df_CT_nonmatch$freq_T
#re-merge objects
df_CT=rbind(df_CT_nonmatch,df_CT_match,df_CT_rev_strand_matching,df_CT_rev_strand_nonmatching)
#GA
df_GA_match=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A1)),]
df_GA_nonmatch=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A2)),]
df_GA_nonmatch$freq_G=df_GA_nonmatch$freq_A
#re-merge objects
df_GA=rbind(df_GA_nonmatch,df_GA_match,df_GA_rev_strand_matching,df_GA_rev_strand_nonmatching)
df_GT_match=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A1)),]
df_GT_nonmatch=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A2)),]
df_GT_nonmatch$freq_G=df_GT_nonmatch$freq_T
#re-merge objects
df_GT=rbind(df_GT_nonmatch,df_GT_match,df_GT_rev_strand_matching,df_GT_rev_strand_nonmatching)
#TC
df_TC_match=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A1)),]
df_TC_nonmatch=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A2)),]
df_TC_nonmatch$freq_T=df_TC_nonmatch$freq_C
#re-merge objects
df_TC=rbind(df_TC_nonmatch,df_TC_match,df_TC_rev_strand_matching,df_TC_rev_strand_nonmatching)
#TG
df_TG_match=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A1)),]
df_TG_nonmatch=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A2)),]
df_TG_nonmatch$freq_T=df_TG_nonmatch$freq_G
#re-merge objects
df_TG=rbind(df_TG_nonmatch,df_TG_match,df_TG_rev_strand_matching,df_TG_rev_strand_nonmatching)
Calculate weighted frequency by SNP by group.
df_AC$GVS=df_AC$Beta*df_AC$freq_A
df_AG$GVS=df_AG$Beta*df_AG$freq_A
df_CA$GVS=df_CA$Beta*df_CA$freq_C
df_CT$GVS=df_CT$Beta*df_CT$freq_C
df_GA$GVS=df_GA$Beta*df_GA$freq_G
df_GT$GVS=df_GT$Beta*df_GT$freq_G
df_TC$GVS=df_TC$Beta*df_TC$freq_T
df_TG$GVS=df_TG$Beta*df_TG$freq_T
Re-merge groups
df_remerged=rbind(df_AC,df_AG,df_CA,df_CT,df_GA,df_GT,df_TC,df_TG)
compute PGS by population
PGS<-with(df_remerged, tapply(GVS, population, mean))
PGS
## Algeria (Mzab) - Mozabite Bougainville - NAN Melanesian
## -0.0002316000 -0.0030247818
## Brazil - Karitiana Brazil - Surui
## 0.0001603091 -0.0013435818
## C. African Republic - Biaka Pygmy Cambodia - Cambodian
## 0.0002624364 -0.0001927273
## China - Dai China - Daur
## 0.0017254545 0.0022674727
## China - Han China - Hezhen
## 0.0019698545 0.0012267818
## China - Lahu China - Miaozu
## 0.0003238000 0.0027827273
## China - Mongola China - Naxi
## 0.0010736364 0.0011460182
## China - Oroqen China - She
## 0.0026346364 0.0018563636
## China - Tu China - Tujia
## 0.0024372727 0.0006100000
## China - Uygur China - Xibo
## 0.0007054545 0.0017222545
## China - Yizu Colombia - Piapoco and Curripaco
## 0.0016454545 -0.0011265455
## D. R. of Congo - Mbuti Pygmy France - Basque
## 0.0010947818 -0.0004387636
## France - French Israel (Carmel) - Druze
## 0.0001790909 0.0011744364
## Israel (Central) - Palestinian Israel (Negev) - Bedouin
## 0.0002558182 -0.0002814545
## Italy - from Bergamo Italy - Sardinian
## 0.0003393091 -0.0008507455
## Italy - Tuscan Japan - Japanese
## -0.0011610909 0.0013065273
## Kenya - Bantu Mexico - Maya
## 0.0001313455 -0.0015682182
## Mexico - Pima Namibia - San
## 0.0011377818 -0.0015818182
## New Guinea - Papuan Nigeria - Yoruba
## -0.0027480909 0.0003360364
## Orkney Islands - Orcadian Pakistan - Balochi
## -0.0008993455 0.0016867455
## Pakistan - Brahui Pakistan - Burusho
## 0.0016527273 0.0012149091
## Pakistan - Hazara Pakistan - Kalash
## 0.0020064000 0.0032865818
## Pakistan - Makrani Pakistan - Pathan
## 0.0016827636 0.0011325455
## Pakistan - Sindhi Population Set 1
## 0.0013174909 0.0006138727
## Russia - Russian Russia (Caucasus) - Adygei
## -0.0012556364 -0.0001059636
## Senegal - Mandenka Siberia - Yakut
## 0.0005634909 0.0018727273
## South Africa - Bantu
## -0.0011705273
Sort in PGS in descending order
sort(PGS,decreasing = TRUE)
## Pakistan - Kalash China - Miaozu
## 0.0032865818 0.0027827273
## China - Oroqen China - Tu
## 0.0026346364 0.0024372727
## China - Daur Pakistan - Hazara
## 0.0022674727 0.0020064000
## China - Han Siberia - Yakut
## 0.0019698545 0.0018727273
## China - She China - Dai
## 0.0018563636 0.0017254545
## China - Xibo Pakistan - Balochi
## 0.0017222545 0.0016867455
## Pakistan - Makrani Pakistan - Brahui
## 0.0016827636 0.0016527273
## China - Yizu Pakistan - Sindhi
## 0.0016454545 0.0013174909
## Japan - Japanese China - Hezhen
## 0.0013065273 0.0012267818
## Pakistan - Burusho Israel (Carmel) - Druze
## 0.0012149091 0.0011744364
## China - Naxi Mexico - Pima
## 0.0011460182 0.0011377818
## Pakistan - Pathan D. R. of Congo - Mbuti Pygmy
## 0.0011325455 0.0010947818
## China - Mongola China - Uygur
## 0.0010736364 0.0007054545
## Population Set 1 China - Tujia
## 0.0006138727 0.0006100000
## Senegal - Mandenka Italy - from Bergamo
## 0.0005634909 0.0003393091
## Nigeria - Yoruba China - Lahu
## 0.0003360364 0.0003238000
## C. African Republic - Biaka Pygmy Israel (Central) - Palestinian
## 0.0002624364 0.0002558182
## France - French Brazil - Karitiana
## 0.0001790909 0.0001603091
## Kenya - Bantu Russia (Caucasus) - Adygei
## 0.0001313455 -0.0001059636
## Cambodia - Cambodian Algeria (Mzab) - Mozabite
## -0.0001927273 -0.0002316000
## Israel (Negev) - Bedouin France - Basque
## -0.0002814545 -0.0004387636
## Italy - Sardinian Orkney Islands - Orcadian
## -0.0008507455 -0.0008993455
## Colombia - Piapoco and Curripaco Italy - Tuscan
## -0.0011265455 -0.0011610909
## South Africa - Bantu Russia - Russian
## -0.0011705273 -0.0012556364
## Brazil - Surui Mexico - Maya
## -0.0013435818 -0.0015682182
## Namibia - San New Guinea - Papuan
## -0.0015818182 -0.0027480909
## Bougainville - NAN Melanesian
## -0.0030247818
Calculate Z-score
scale(PGS, center = TRUE, scale = TRUE)
## [,1]
## Algeria (Mzab) - Mozabite -0.571713100
## Bougainville - NAN Melanesian -2.595074553
## Brazil - Karitiana -0.287816884
## Brazil - Surui -1.377224983
## C. African Republic - Biaka Pygmy -0.213836599
## Cambodia - Cambodian -0.543553967
## China - Dai 0.845963581
## China - Daur 1.238597776
## China - Han 1.023005239
## China - Hezhen 0.484728531
## China - Lahu -0.169385208
## China - Miaozu 1.611844580
## China - Mongola 0.373791030
## China - Naxi 0.426223915
## China - Oroqen 1.504568557
## China - She 0.940793215
## China - Tu 1.361599714
## China - Tujia 0.037936078
## China - Uygur 0.107082686
## China - Xibo 0.843645523
## China - Yizu 0.788012138
## Colombia - Piapoco and Curripaco -1.220005353
## D. R. of Congo - Mbuti Pygmy 0.389108650
## France - Basque -0.721780995
## France - French -0.274211466
## Israel (Carmel) - Druze 0.446809848
## Israel (Central) - Palestinian -0.218630764
## Israel (Negev) - Bedouin -0.607827385
## Italy - from Bergamo -0.158150531
## Italy - Sardinian -1.020217754
## Italy - Tuscan -1.245029840
## Japan - Japanese 0.542495583
## Kenya - Bantu -0.308797941
## Mexico - Maya -1.539950000
## Mexico - Pima 0.420257551
## Namibia - San -1.549801746
## New Guinea - Papuan -2.394641585
## Nigeria - Yoruba -0.160521272
## Orkney Islands - Orcadian -1.055423256
## Pakistan - Balochi 0.817922985
## Pakistan - Brahui 0.793280451
## Pakistan - Burusho 0.476128010
## Pakistan - Hazara 1.049478511
## Pakistan - Kalash 1.976833303
## Pakistan - Makrani 0.815038584
## Pakistan - Pathan 0.416464365
## Pakistan - Sindhi 0.550437564
## Population Set 1 0.040741455
## Russia - Russian -1.313517908
## Russia (Caucasus) - Adygei -0.480702993
## Senegal - Mandenka 0.004245217
## Siberia - Yakut 0.952646919
## South Africa - Bantu -1.251865476
## attr(,"scaled:center")
## [1] 0.0005576305
## attr(,"scaled:scale")
## [1] 0.001380466
The ranked scores are roughly in line with what we know from estimated phenotypic population IQs and previous genotypic estimates (e.g. Piffer, 2013; Piffer, 2015). The top scores are achieved by East Asians and the lowest scores by some African populations, Melanesians and Papuans (remarkably, populations that are both genetically and geographically very far apart). The three major limitations of this approach are: 1) Very low coverage of the HGDP-CEPH database, which comprises only about 10% of all the SNPs present in 1000 Genomes; 2) Very small sample sizes,with some populations numbering less than 10 individuals (e.g. Italy - Tuscan= 8; San=5; Bantu=8; Colombia - Piapoco and Curripaco= 7; China - Lahu=8); 3) Unequal coverage, so that some populations have less SNPs than others.