I will calculate group-level Polygenic Scores (PGS) for 52 populations and 7 super-populations for SNPs that were found to explain variation in educational attainment by the largest GWAS (N= 1.1 million) to date (Lee et al., 2018).
(Skip this part you’re not running the code but just reading this) ** Download files from: https://osf.io/ewxqj/ HGDP frequencies file: “Lee_results_10k_final.csv”. Frequencies of GWAS hits from 52 populations pre-computed using HGDP-CEPH browser (http://spsmart.cesga.es/ceph.php?dataSet=ceph_stanford). GWAS summary file: “GWAS_EA.to10K.txt”. HGDP frequencies by continent file: “Lee_final_results_continent.csv”"
You will be requested to upload them (in the same order) as the code runs. Note that GWAS files were not LD clumped because this would result in too small number of SNPs being found in the HGDP dataset. However, LD clumping should be performed on the output HGDP files. For simplicity, this step is omitted but will be done in a follow-up version. **
HGDP_CEPH=read.csv(file.choose(), header=TRUE, sep = ";")#open HGDP-CEPH browser output with freqs from GWAS hits
gwas=read.csv(file.choose(), header=TRUE,sep = "\t")#open GWAS summary file
Merge two datasets
HGDP_CEPH_merged=merge(HGDP_CEPH,gwas, by.x = "SNP", by.y = "MarkerName")
Split into 8 groups according to levels of vector “var”: “AC” ,“AG”, “CA”, “CT”, “GA”, “GT” ,“TC”, “TG”
df_AC=subset(HGDP_CEPH_merged, var=="AC")
df_AG=subset(HGDP_CEPH_merged, var=="AG")
df_CA=subset(HGDP_CEPH_merged, var=="CA")
df_CT=subset(HGDP_CEPH_merged, var=="CT")
df_GA=subset(HGDP_CEPH_merged, var=="GA")
df_GT=subset(HGDP_CEPH_merged, var=="GT")
df_TC=subset(HGDP_CEPH_merged, var=="TC")
df_TG=subset(HGDP_CEPH_merged, var=="TG")
Note that some SNPs are on reverse strand so AC is TG on GWAS and viceversa, so this will cause a nonmatch. Select reverse-stranded SNPs and translate into same strand as HGDP-CEPH reference panel. Then, flip frequency of non-matching alleles (when A2 (A1) is (not) HGDP reference allele).
#AC
df_AC_rev_strand_matching=subset(df_AC,A1=="T")
df_AC_rev_strand_nonmatching=subset(df_AC,A1=="G")
#replace "T" with "A" and "C" with "G"
df_AC_rev_strand_matching$A1[df_AC_rev_strand_matching$A1=="T"] <- "A"
df_AC_rev_strand_matching$A2[df_AC_rev_strand_matching$A2=="G"] <- "C"
df_AC_rev_strand_nonmatching$A1[df_AC_rev_strand_nonmatching$A1=="G"] <- "C"
df_AC_rev_strand_nonmatching$A2[df_AC_rev_strand_nonmatching$A2=="T"] <- "A"
df_AC_rev_strand_nonmatching$freq_A=df_AC_rev_strand_nonmatching$freq_C#flip freq
#AG
df_AG_rev_strand_matching=subset(df_AG,A1=="T")
df_AG_rev_strand_nonmatching=subset(df_AG,A1=="C")
#replace "T" with "A" and "C" with "G"
df_AG_rev_strand_matching$A1[df_AG_rev_strand_matching$A1=="T"] <- "A"
df_AG_rev_strand_matching$A2[df_AG_rev_strand_matching$A2=="C"] <- "G"
df_AG_rev_strand_nonmatching$A1[df_AG_rev_strand_nonmatching$A1=="C"] <- "G"
df_AG_rev_strand_nonmatching$A2[df_AG_rev_strand_nonmatching$A2=="T"] <- "A"
df_AG_rev_strand_nonmatching$freq_A=df_AG_rev_strand_nonmatching$freq_G#flip freq
#CA
df_CA_rev_strand_matching=subset(df_CA,A1=="G")
df_CA_rev_strand_nonmatching=subset(df_CA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_CA_rev_strand_matching$A1[df_CA_rev_strand_matching$A1=="G"] <- "C"
df_CA_rev_strand_matching$A2[df_CA_rev_strand_matching$A2=="T"] <- "A"
df_CA_rev_strand_nonmatching$A1[df_CA_rev_strand_nonmatching$A1=="T"] <- "A"
df_CA_rev_strand_nonmatching$A2[df_CA_rev_strand_nonmatching$A2=="G"] <- "C"
df_CA_rev_strand_nonmatching$freq_C=df_CA_rev_strand_nonmatching$freq_A#flip freq
#CT
df_CT_rev_strand_matching=subset(df_CT,A1=="G")
df_CT_rev_strand_nonmatching=subset(df_CT,A1=="A")
#replace "T" with "A" and "C" with "G"
df_CT_rev_strand_matching$A1[df_CT_rev_strand_matching$A1=="G"] <- "C"
df_CT_rev_strand_matching$A2[df_CT_rev_strand_matching$A2=="A"] <- "T"
df_CT_rev_strand_nonmatching$A1[df_CT_rev_strand_nonmatching$A1=="A"] <- "T"
df_CT_rev_strand_nonmatching$A2[df_CT_rev_strand_nonmatching$A2=="G"] <- "C"
df_CT_rev_strand_nonmatching$freq_C=df_CT_rev_strand_nonmatching$freq_T#flip freq
#GA
df_GA_rev_strand_matching=subset(df_GA,A1=="C")
df_GA_rev_strand_nonmatching=subset(df_GA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_GA_rev_strand_matching$A1[df_GA_rev_strand_matching$A1=="C"] <- "G"
df_GA_rev_strand_matching$A2[df_GA_rev_strand_matching$A2=="T"] <- "A"
df_GA_rev_strand_nonmatching$A1[df_GA_rev_strand_nonmatching$A1=="T"] <- "A"
df_GA_rev_strand_nonmatching$A2[df_GA_rev_strand_nonmatching$A2=="C"] <- "G"
df_GA_rev_strand_nonmatching$freq_G=df_GA_rev_strand_nonmatching$freq_A#flip freq
#GT
df_GT_rev_strand_matching=subset(df_GT,A1=="C")
df_GT_rev_strand_nonmatching=subset(df_GT,A1=="A")
#replace "T" with "A" and "C" with "G"and viceversa
df_GT_rev_strand_matching$A1[df_GT_rev_strand_matching$A1=="C"] <- "G"
df_GT_rev_strand_matching$A2[df_GT_rev_strand_matching$A2=="A"] <- "T"
df_GT_rev_strand_nonmatching$A1[df_GT_rev_strand_nonmatching$A1=="A"] <- "T"
df_GT_rev_strand_nonmatching$A2[df_GT_rev_strand_nonmatching$A2=="C"] <- "G"
df_GT_rev_strand_nonmatching$freq_G=df_GT_rev_strand_nonmatching$freq_T#flip freq
#TC
df_TC_rev_strand_matching=subset(df_TC,A1=="A")
df_TC_rev_strand_nonmatching=subset(df_TC,A1=="G")
#replace "T" with "A" and "C" with "G"and viceversa
df_TC_rev_strand_matching$A1[df_TC_rev_strand_matching$A1=="A"] <- "T"
df_TC_rev_strand_matching$A2[df_TC_rev_strand_matching$A2=="G"] <- "C"
df_TC_rev_strand_nonmatching$A1[df_TC_rev_strand_nonmatching$A1=="G"] <- "C"
df_TC_rev_strand_nonmatching$A2[df_TC_rev_strand_nonmatching$A2=="A"] <- "T"
df_TC_rev_strand_nonmatching$freq_T=df_TC_rev_strand_nonmatching$freq_C#flip freq
#TG
df_TG_rev_strand_matching=subset(df_TG,A1=="A")
df_TG_rev_strand_nonmatching=subset(df_TG,A1=="C")
#replace "T" with "A" and "C" with "G"and viceversa
df_TG_rev_strand_matching$A1[df_TG_rev_strand_matching$A1=="A"] <- "T"
df_TG_rev_strand_matching$A2[df_TG_rev_strand_matching$A2=="C"] <- "G"
df_TG_rev_strand_nonmatching$A1[df_TG_rev_strand_nonmatching$A1=="C"] <- "G"
df_TG_rev_strand_nonmatching$A2[df_TG_rev_strand_nonmatching$A2=="A"] <- "T"
df_TG_rev_strand_nonmatching$freq_T=df_TG_rev_strand_nonmatching$freq_G#flip freq
Create objects based on match or nonmatch between A1 and reference allele and flip frequency of nonmatch alleles
#AC
df_AC_match=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A1)),]
df_AC_nonmatch=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A2)),]
df_AC_nonmatch$freq_A=df_AC_nonmatch$freq_C
#re-merge objects
df_AC=rbind(df_AC_nonmatch,df_AC_match, df_AC_rev_strand_matching,df_AC_rev_strand_nonmatching)
#AG
df_AG_match=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A1)),]
df_AG_nonmatch=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A2)),]
df_AG_nonmatch$freq_A=df_AG_nonmatch$freq_G
#re-merge objects
df_AG=rbind(df_AG_nonmatch,df_AG_match,df_AG_rev_strand_matching,df_AG_rev_strand_nonmatching )
#CA
df_CA_match=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A1)),]
df_CA_nonmatch=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A2)),]
df_CA_nonmatch$freq_C=df_CA_nonmatch$freq_A
#re-merge objects
df_CA=rbind(df_CA_nonmatch,df_CA_match,df_CA_rev_strand_matching,df_CA_rev_strand_nonmatching)
#CT
df_CT_match=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A1)),]
df_CT_nonmatch=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A2)),]
df_CT_nonmatch$freq_C=df_CT_nonmatch$freq_T
#re-merge objects
df_CT=rbind(df_CT_nonmatch,df_CT_match,df_CT_rev_strand_matching,df_CT_rev_strand_nonmatching)
#GA
df_GA_match=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A1)),]
df_GA_nonmatch=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A2)),]
df_GA_nonmatch$freq_G=df_GA_nonmatch$freq_A
#re-merge objects
df_GA=rbind(df_GA_nonmatch,df_GA_match,df_GA_rev_strand_matching,df_GA_rev_strand_nonmatching)
df_GT_match=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A1)),]
df_GT_nonmatch=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A2)),]
df_GT_nonmatch$freq_G=df_GT_nonmatch$freq_T
#re-merge objects
df_GT=rbind(df_GT_nonmatch,df_GT_match,df_GT_rev_strand_matching,df_GT_rev_strand_nonmatching)
#TC
df_TC_match=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A1)),]
df_TC_nonmatch=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A2)),]
df_TC_nonmatch$freq_T=df_TC_nonmatch$freq_C
#re-merge objects
df_TC=rbind(df_TC_nonmatch,df_TC_match,df_TC_rev_strand_matching,df_TC_rev_strand_nonmatching)
#TG
df_TG_match=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A1)),]
df_TG_nonmatch=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A2)),]
df_TG_nonmatch$freq_T=df_TG_nonmatch$freq_G
#re-merge objects
df_TG=rbind(df_TG_nonmatch,df_TG_match,df_TG_rev_strand_matching,df_TG_rev_strand_nonmatching)
Calculate weighted frequency by SNP by group.
df_AC$GVS=df_AC$Beta*df_AC$freq_A
df_AG$GVS=df_AG$Beta*df_AG$freq_A
df_CA$GVS=df_CA$Beta*df_CA$freq_C
df_CT$GVS=df_CT$Beta*df_CT$freq_C
df_GA$GVS=df_GA$Beta*df_GA$freq_G
df_GT$GVS=df_GT$Beta*df_GT$freq_G
df_TC$GVS=df_TC$Beta*df_TC$freq_T
df_TG$GVS=df_TG$Beta*df_TG$freq_T
Re-merge groups
df_remerged=rbind(df_AC,df_AG,df_CA,df_CT,df_GA,df_GT,df_TC,df_TG)
compute PGS by population
PGS<-with(df_remerged, tapply(GVS, population, mean))
Sort in PGS in descending order
sort(PGS,decreasing = TRUE)
## China - Mongola China - Oroqen
## 0.0010909252 0.0010874991
## China - Dai France - Basque
## 0.0010324528 0.0010156329
## China - Tujia China - Xibo
## 0.0010141417 0.0009781704
## China - Naxi China - Han
## 0.0009734147 0.0009671330
## Orkney Islands - Orcadian China - Miaozu
## 0.0009529228 0.0009406957
## France - French D. R. of Congo - Mbuti Pygmy
## 0.0009327792 0.0009184703
## China - Yizu Siberia - Yakut
## 0.0009036732 0.0008967102
## Pakistan - Hazara Pakistan - Burusho
## 0.0008821695 0.0008580274
## China - Daur Algeria (Mzab) - Mozabite
## 0.0008561148 0.0008479185
## China - Hezhen Pakistan - Kalash
## 0.0008446146 0.0008241576
## Italy - Tuscan Pakistan - Balochi
## 0.0008167104 0.0008103325
## China - She Russia - Russian
## 0.0008078725 0.0008040898
## Japan - Japanese Italy - from Bergamo
## 0.0007915242 0.0007785030
## China - Uygur Pakistan - Sindhi
## 0.0007542795 0.0007507413
## Population Set 1 Pakistan - Brahui
## 0.0007378389 0.0006938104
## Israel (Central) - Palestinian Russia (Caucasus) - Adygei
## 0.0006934061 0.0006889604
## C. African Republic - Biaka Pygmy China - Tu
## 0.0006773677 0.0006756050
## Pakistan - Makrani Colombia - Piapoco and Curripaco
## 0.0006723307 0.0006627465
## China - Lahu Kenya - Bantu
## 0.0006432017 0.0006300149
## Israel (Carmel) - Druze Pakistan - Pathan
## 0.0006195176 0.0006167010
## South Africa - Bantu Israel (Negev) - Bedouin
## 0.0005829878 0.0005815820
## Senegal - Mandenka Italy - Sardinian
## 0.0005766954 0.0005760035
## Mexico - Pima Brazil - Karitiana
## 0.0005438127 0.0005280740
## Bougainville - NAN Melanesian Nigeria - Yoruba
## 0.0005079682 0.0004929285
## Mexico - Maya Cambodia - Cambodian
## 0.0004737000 0.0004451065
## Namibia - San Brazil - Surui
## 0.0003823465 0.0003093824
## New Guinea - Papuan
## 0.0000792115
Calculate Z-score
PGS_Z=scale(PGS,center = TRUE, scale= TRUE)
Bar chart
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
df_PGS=data.frame(PGS_Z)
df_PGS$Population<- rownames(df_PGS)
ggplot(df_PGS, aes(PGS_Z, Population)) +
geom_segment(aes(x = 0, y = Population, xend = PGS_Z, yend = Population), color = "grey50") +
geom_point()+
labs(title = "Polygenic scores for 52 HGDP populations",
caption = "Lee et al., 2018 GWAS")
Carry out the same job at the sub-continent level. This should make results more stable because populations have very low sample sizes but sub-continental clusters rely on larger samples. Upload “Lee_results_continent.csv”
compute PGS by population
PGS<-with(df_remerged, tapply(GVS, population, mean))
Calculate Z-score
PGS_Z=scale(PGS,center = TRUE, scale= TRUE)
Bar chart
library(ggplot2)
df_PGS=data.frame(PGS_Z)
df_PGS$Population<- rownames(df_PGS)
ggplot(df_PGS, aes(PGS_Z, Population)) +
geom_segment(aes(x = 0, y = Population, xend = PGS_Z, yend = Population), color = "grey50") +
geom_point()+
labs(title = "Polygenic scores for 52 HGDP populations",
caption = "Lee et al., 2018 GWAS")
It is difficult to interpret single population scores because they rely on small (sometimes very small, eg. N<10 samples). The top scores are achieved by East Asians and the lowest scores by some African populations, Melanesians and Papuans (remarkably, populations that are both genetically and geographically very far apart). However, using sub-continental clusters gives a more reliable picture of ethnic variation in genotypic scores. The ranked scores are roughly in line with what we know from estimated phenotypic population IQs (Lynn, 2006) and previous genotypic estimates (e.g. Piffer, 2013; Piffer, 2015). There are a few discrepancies, however: the extremely low score achieved by Oceania (-2.01), and Native Americans (-0.72) scoring lower than Africans (-0.19). The higher than expected score of Africans is due to Pygmies scoring relatively high compard to Bantu groups. However, Pygmies are extremely divergent from other African and non African populations (Patin et al., 2009) so it’s likely that PGS scores lose validity when applied to them. Europeans score higher than other groups but slighly lower (0.81) than East Asians (1.11). As I noted before (Piffer, 2013), the much lower Native American score suggests that selective pressure on East Asian intelligence/educational attitude post-dates the split between the two groups, which occurred between about 25-36 Kya (Moreno-Mayar et al., 2018).
The two major limitations of this approach are: 1) Very low coverage of the HGDP-CEPH database, which comprises only about 10% of all the SNPs present in 1000 Genomes; 2) Very small sample sizes,with some populations numbering less than 10 individuals (e.g. Italy - Tuscan= 8; San=5; Bantu=8; Colombia - Piapoco and Curripaco= 7; China - Lahu=8).
References:
Lynn, Richard (2006). Race Differences in Intelligence: An Evolutionary Analysis. Washington Summit Publishers. ISBN 1593680198.
Moreno-Mayar, J.V. et al. (2018).Terminal Pleistocene Alaskan genome reveals first founding population of Native Americans. Nature,553,203–207. doi:10.1038/nature25173
Patin, E.; Laval, G.; Barreiro, L. B.; Salas, A.; Semino, O.; Santachiara-Benerecetti, S.; Kidd, K. K.; Kidd, J. R.; et al. (2009). Di Rienzo, Anna, ed. “Inferring the Demographic History of African Farmers and Pygmy Hunter–Gatherers Using a Multilocus Resequencing Data Set”. PLoS Genetics. 5 (4): e1000448. doi:10.1371/journal.pgen.1000448. P
Piffer, D. (2015). A review of intelligence GWAS hits: Their relationship to country IQ and the issue of spatial autocorrelation. Intelligence, 53: 43-50.
Piffer, D. (2013). Factor Analysis of Population Allele Frequencies as a Simple, Novel Method of Detecting Signals of Recent Polygenic Selection: The Example of Educational Attainment and IQ. Mankind Quarterly, 54, 168-200.