HGDP Lee et al. PGS

I will calculate group-level Polygenic Scores (PGS) for 52 populations and 7 super-populations for SNPs that were found to explain variation in educational attainment by the largest GWAS (N= 1.1 million) to date (Lee et al., 2018).

(Skip this part you’re not running the code but just reading this) ** Download files from: https://osf.io/ewxqj/ HGDP frequencies file: “Lee_results_10k_final.csv”. Frequencies of GWAS hits from 52 populations pre-computed using HGDP-CEPH browser (http://spsmart.cesga.es/ceph.php?dataSet=ceph_stanford). GWAS summary file: “GWAS_EA.to10K.txt”. HGDP frequencies by continent file: “Lee_final_results_continent.csv”"

You will be requested to upload them (in the same order) as the code runs. Note that GWAS files were not LD clumped because this would result in too small number of SNPs being found in the HGDP dataset. However, LD clumping should be performed on the output HGDP files. For simplicity, this step is omitted but will be done in a follow-up version. **

HGDP_CEPH=read.csv(file.choose(), header=TRUE, sep = ";")#open HGDP-CEPH browser output with freqs from GWAS hits
gwas=read.csv(file.choose(), header=TRUE,sep = "\t")#open GWAS summary file

Merge two datasets

HGDP_CEPH_merged=merge(HGDP_CEPH,gwas, by.x = "SNP", by.y = "MarkerName")

Split into 8 groups according to levels of vector “var”: “AC” ,“AG”, “CA”, “CT”, “GA”, “GT” ,“TC”, “TG”

df_AC=subset(HGDP_CEPH_merged, var=="AC")
df_AG=subset(HGDP_CEPH_merged, var=="AG")
df_CA=subset(HGDP_CEPH_merged, var=="CA")
df_CT=subset(HGDP_CEPH_merged, var=="CT")
df_GA=subset(HGDP_CEPH_merged, var=="GA")
df_GT=subset(HGDP_CEPH_merged, var=="GT")
df_TC=subset(HGDP_CEPH_merged, var=="TC")
df_TG=subset(HGDP_CEPH_merged, var=="TG")

Note that some SNPs are on reverse strand so AC is TG on GWAS and viceversa, so this will cause a nonmatch. Select reverse-stranded SNPs and translate into same strand as HGDP-CEPH reference panel. Then, flip frequency of non-matching alleles (when A2 (A1) is (not) HGDP reference allele).

#AC
df_AC_rev_strand_matching=subset(df_AC,A1=="T")
df_AC_rev_strand_nonmatching=subset(df_AC,A1=="G")
#replace "T" with "A" and "C" with "G"
df_AC_rev_strand_matching$A1[df_AC_rev_strand_matching$A1=="T"] <- "A" 
df_AC_rev_strand_matching$A2[df_AC_rev_strand_matching$A2=="G"] <- "C" 
df_AC_rev_strand_nonmatching$A1[df_AC_rev_strand_nonmatching$A1=="G"] <- "C" 
df_AC_rev_strand_nonmatching$A2[df_AC_rev_strand_nonmatching$A2=="T"] <- "A"
df_AC_rev_strand_nonmatching$freq_A=df_AC_rev_strand_nonmatching$freq_C#flip freq
#AG
df_AG_rev_strand_matching=subset(df_AG,A1=="T")
df_AG_rev_strand_nonmatching=subset(df_AG,A1=="C")
#replace "T" with "A" and "C" with "G"
df_AG_rev_strand_matching$A1[df_AG_rev_strand_matching$A1=="T"] <- "A" 
df_AG_rev_strand_matching$A2[df_AG_rev_strand_matching$A2=="C"] <- "G" 
df_AG_rev_strand_nonmatching$A1[df_AG_rev_strand_nonmatching$A1=="C"] <- "G" 
df_AG_rev_strand_nonmatching$A2[df_AG_rev_strand_nonmatching$A2=="T"] <- "A"
df_AG_rev_strand_nonmatching$freq_A=df_AG_rev_strand_nonmatching$freq_G#flip freq
#CA
df_CA_rev_strand_matching=subset(df_CA,A1=="G")
df_CA_rev_strand_nonmatching=subset(df_CA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_CA_rev_strand_matching$A1[df_CA_rev_strand_matching$A1=="G"] <- "C" 
df_CA_rev_strand_matching$A2[df_CA_rev_strand_matching$A2=="T"] <- "A" 
df_CA_rev_strand_nonmatching$A1[df_CA_rev_strand_nonmatching$A1=="T"] <- "A" 
df_CA_rev_strand_nonmatching$A2[df_CA_rev_strand_nonmatching$A2=="G"] <- "C"
df_CA_rev_strand_nonmatching$freq_C=df_CA_rev_strand_nonmatching$freq_A#flip freq
#CT
df_CT_rev_strand_matching=subset(df_CT,A1=="G")
df_CT_rev_strand_nonmatching=subset(df_CT,A1=="A")
#replace "T" with "A" and "C" with "G"
df_CT_rev_strand_matching$A1[df_CT_rev_strand_matching$A1=="G"] <- "C" 
df_CT_rev_strand_matching$A2[df_CT_rev_strand_matching$A2=="A"] <- "T" 
df_CT_rev_strand_nonmatching$A1[df_CT_rev_strand_nonmatching$A1=="A"] <- "T" 
df_CT_rev_strand_nonmatching$A2[df_CT_rev_strand_nonmatching$A2=="G"] <- "C"
df_CT_rev_strand_nonmatching$freq_C=df_CT_rev_strand_nonmatching$freq_T#flip freq
#GA
df_GA_rev_strand_matching=subset(df_GA,A1=="C")
df_GA_rev_strand_nonmatching=subset(df_GA,A1=="T")
#replace "T" with "A" and "C" with "G"
df_GA_rev_strand_matching$A1[df_GA_rev_strand_matching$A1=="C"] <- "G" 
df_GA_rev_strand_matching$A2[df_GA_rev_strand_matching$A2=="T"] <- "A" 
df_GA_rev_strand_nonmatching$A1[df_GA_rev_strand_nonmatching$A1=="T"] <- "A" 
df_GA_rev_strand_nonmatching$A2[df_GA_rev_strand_nonmatching$A2=="C"] <- "G"
df_GA_rev_strand_nonmatching$freq_G=df_GA_rev_strand_nonmatching$freq_A#flip freq
#GT
df_GT_rev_strand_matching=subset(df_GT,A1=="C")
df_GT_rev_strand_nonmatching=subset(df_GT,A1=="A")
#replace "T" with "A" and "C" with "G"and viceversa
df_GT_rev_strand_matching$A1[df_GT_rev_strand_matching$A1=="C"] <- "G" 
df_GT_rev_strand_matching$A2[df_GT_rev_strand_matching$A2=="A"] <- "T" 
df_GT_rev_strand_nonmatching$A1[df_GT_rev_strand_nonmatching$A1=="A"] <- "T" 
df_GT_rev_strand_nonmatching$A2[df_GT_rev_strand_nonmatching$A2=="C"] <- "G"
df_GT_rev_strand_nonmatching$freq_G=df_GT_rev_strand_nonmatching$freq_T#flip freq
#TC
df_TC_rev_strand_matching=subset(df_TC,A1=="A")
df_TC_rev_strand_nonmatching=subset(df_TC,A1=="G")
#replace "T" with "A" and "C" with "G"and viceversa
df_TC_rev_strand_matching$A1[df_TC_rev_strand_matching$A1=="A"] <- "T" 
df_TC_rev_strand_matching$A2[df_TC_rev_strand_matching$A2=="G"] <- "C" 
df_TC_rev_strand_nonmatching$A1[df_TC_rev_strand_nonmatching$A1=="G"] <- "C" 
df_TC_rev_strand_nonmatching$A2[df_TC_rev_strand_nonmatching$A2=="A"] <- "T"
df_TC_rev_strand_nonmatching$freq_T=df_TC_rev_strand_nonmatching$freq_C#flip freq
#TG
df_TG_rev_strand_matching=subset(df_TG,A1=="A")
df_TG_rev_strand_nonmatching=subset(df_TG,A1=="C")
#replace "T" with "A" and "C" with "G"and viceversa
df_TG_rev_strand_matching$A1[df_TG_rev_strand_matching$A1=="A"] <- "T" 
df_TG_rev_strand_matching$A2[df_TG_rev_strand_matching$A2=="C"] <- "G" 
df_TG_rev_strand_nonmatching$A1[df_TG_rev_strand_nonmatching$A1=="C"] <- "G" 
df_TG_rev_strand_nonmatching$A2[df_TG_rev_strand_nonmatching$A2=="A"] <- "T"
df_TG_rev_strand_nonmatching$freq_T=df_TG_rev_strand_nonmatching$freq_G#flip freq

Create objects based on match or nonmatch between A1 and reference allele and flip frequency of nonmatch alleles

#AC
df_AC_match=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A1)),]
df_AC_nonmatch=df_AC[which(as.character(df_AC$reference)==as.character(df_AC$A2)),]
df_AC_nonmatch$freq_A=df_AC_nonmatch$freq_C
#re-merge objects
df_AC=rbind(df_AC_nonmatch,df_AC_match, df_AC_rev_strand_matching,df_AC_rev_strand_nonmatching)
#AG
df_AG_match=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A1)),]
df_AG_nonmatch=df_AG[which(as.character(df_AG$reference)==as.character(df_AG$A2)),]
df_AG_nonmatch$freq_A=df_AG_nonmatch$freq_G
#re-merge objects
df_AG=rbind(df_AG_nonmatch,df_AG_match,df_AG_rev_strand_matching,df_AG_rev_strand_nonmatching )
#CA
df_CA_match=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A1)),]
df_CA_nonmatch=df_CA[which(as.character(df_CA$reference)==as.character(df_CA$A2)),]
df_CA_nonmatch$freq_C=df_CA_nonmatch$freq_A
#re-merge objects
df_CA=rbind(df_CA_nonmatch,df_CA_match,df_CA_rev_strand_matching,df_CA_rev_strand_nonmatching)
#CT
df_CT_match=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A1)),]
df_CT_nonmatch=df_CT[which(as.character(df_CT$reference)==as.character(df_CT$A2)),]
df_CT_nonmatch$freq_C=df_CT_nonmatch$freq_T
#re-merge objects
df_CT=rbind(df_CT_nonmatch,df_CT_match,df_CT_rev_strand_matching,df_CT_rev_strand_nonmatching)
#GA
df_GA_match=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A1)),]
df_GA_nonmatch=df_GA[which(as.character(df_GA$reference)==as.character(df_GA$A2)),]
df_GA_nonmatch$freq_G=df_GA_nonmatch$freq_A
#re-merge objects
df_GA=rbind(df_GA_nonmatch,df_GA_match,df_GA_rev_strand_matching,df_GA_rev_strand_nonmatching)

df_GT_match=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A1)),]
df_GT_nonmatch=df_GT[which(as.character(df_GT$reference)==as.character(df_GT$A2)),]
df_GT_nonmatch$freq_G=df_GT_nonmatch$freq_T
#re-merge objects
df_GT=rbind(df_GT_nonmatch,df_GT_match,df_GT_rev_strand_matching,df_GT_rev_strand_nonmatching)
#TC
df_TC_match=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A1)),]
df_TC_nonmatch=df_TC[which(as.character(df_TC$reference)==as.character(df_TC$A2)),]
df_TC_nonmatch$freq_T=df_TC_nonmatch$freq_C
#re-merge objects
df_TC=rbind(df_TC_nonmatch,df_TC_match,df_TC_rev_strand_matching,df_TC_rev_strand_nonmatching)
#TG
df_TG_match=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A1)),]
df_TG_nonmatch=df_TG[which(as.character(df_TG$reference)==as.character(df_TG$A2)),]
df_TG_nonmatch$freq_T=df_TG_nonmatch$freq_G
#re-merge objects
df_TG=rbind(df_TG_nonmatch,df_TG_match,df_TG_rev_strand_matching,df_TG_rev_strand_nonmatching)

Calculate weighted frequency by SNP by group.

df_AC$GVS=df_AC$Beta*df_AC$freq_A
df_AG$GVS=df_AG$Beta*df_AG$freq_A
df_CA$GVS=df_CA$Beta*df_CA$freq_C
df_CT$GVS=df_CT$Beta*df_CT$freq_C
df_GA$GVS=df_GA$Beta*df_GA$freq_G
df_GT$GVS=df_GT$Beta*df_GT$freq_G
df_TC$GVS=df_TC$Beta*df_TC$freq_T
df_TG$GVS=df_TG$Beta*df_TG$freq_T

Re-merge groups

df_remerged=rbind(df_AC,df_AG,df_CA,df_CT,df_GA,df_GT,df_TC,df_TG)

compute PGS by population

PGS<-with(df_remerged, tapply(GVS, population, mean))

Sort in PGS in descending order

sort(PGS,decreasing = TRUE)

##                   China - Mongola                    China - Oroqen 
##                      0.0010909252                      0.0010874991 
##                       China - Dai                   France - Basque 
##                      0.0010324528                      0.0010156329 
##                     China - Tujia                      China - Xibo 
##                      0.0010141417                      0.0009781704 
##                      China - Naxi                       China - Han 
##                      0.0009734147                      0.0009671330 
##         Orkney Islands - Orcadian                    China - Miaozu 
##                      0.0009529228                      0.0009406957 
##                   France - French      D. R. of Congo - Mbuti Pygmy 
##                      0.0009327792                      0.0009184703 
##                      China - Yizu                   Siberia - Yakut 
##                      0.0009036732                      0.0008967102 
##                 Pakistan - Hazara                Pakistan - Burusho 
##                      0.0008821695                      0.0008580274 
##                      China - Daur         Algeria (Mzab) - Mozabite 
##                      0.0008561148                      0.0008479185 
##                    China - Hezhen                 Pakistan - Kalash 
##                      0.0008446146                      0.0008241576 
##                    Italy - Tuscan                Pakistan - Balochi 
##                      0.0008167104                      0.0008103325 
##                       China - She                  Russia - Russian 
##                      0.0008078725                      0.0008040898 
##                  Japan - Japanese              Italy - from Bergamo 
##                      0.0007915242                      0.0007785030 
##                     China - Uygur                 Pakistan - Sindhi 
##                      0.0007542795                      0.0007507413 
##                  Population Set 1                 Pakistan - Brahui 
##                      0.0007378389                      0.0006938104 
##    Israel (Central) - Palestinian        Russia (Caucasus) - Adygei 
##                      0.0006934061                      0.0006889604 
## C. African Republic - Biaka Pygmy                        China - Tu 
##                      0.0006773677                      0.0006756050 
##                Pakistan - Makrani  Colombia - Piapoco and Curripaco 
##                      0.0006723307                      0.0006627465 
##                      China - Lahu                     Kenya - Bantu 
##                      0.0006432017                      0.0006300149 
##           Israel (Carmel) - Druze                 Pakistan - Pathan 
##                      0.0006195176                      0.0006167010 
##              South Africa - Bantu          Israel (Negev) - Bedouin 
##                      0.0005829878                      0.0005815820 
##                Senegal - Mandenka                 Italy - Sardinian 
##                      0.0005766954                      0.0005760035 
##                     Mexico - Pima                Brazil - Karitiana 
##                      0.0005438127                      0.0005280740 
##     Bougainville - NAN Melanesian                  Nigeria - Yoruba 
##                      0.0005079682                      0.0004929285 
##                     Mexico - Maya              Cambodia - Cambodian 
##                      0.0004737000                      0.0004451065 
##                     Namibia - San                    Brazil - Surui 
##                      0.0003823465                      0.0003093824 
##               New Guinea - Papuan 
##                      0.0000792115

Calculate Z-score

PGS_Z=scale(PGS,center = TRUE, scale= TRUE)

Bar chart

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

df_PGS=data.frame(PGS_Z)
df_PGS$Population<- rownames(df_PGS)
ggplot(df_PGS, aes(PGS_Z, Population)) +
  geom_segment(aes(x = 0, y = Population, xend = PGS_Z, yend = Population), color = "grey50") +
  geom_point()+
  labs(title = "Polygenic scores for 52 HGDP populations",
       caption = "Lee et al., 2018 GWAS")

Carry out the same job at the sub-continent level. This should make results more stable because populations have very low sample sizes but sub-continental clusters rely on larger samples. Upload “Lee_results_continent.csv”

compute PGS by population

PGS<-with(df_remerged, tapply(GVS, population, mean))

Calculate Z-score

PGS_Z=scale(PGS,center = TRUE, scale= TRUE)

Bar chart

library(ggplot2)
df_PGS=data.frame(PGS_Z)
df_PGS$Population<- rownames(df_PGS)
ggplot(df_PGS, aes(PGS_Z, Population)) +
  geom_segment(aes(x = 0, y = Population, xend = PGS_Z, yend = Population), color = "grey50") +
  geom_point()+
  labs(title = "Polygenic scores for 52 HGDP populations",
       caption = "Lee et al., 2018 GWAS")

It is difficult to interpret single population scores because they rely on small (sometimes very small, eg. N<10 samples). The top scores are achieved by East Asians and the lowest scores by some African populations, Melanesians and Papuans (remarkably, populations that are both genetically and geographically very far apart). However, using sub-continental clusters gives a more reliable picture of ethnic variation in genotypic scores. The ranked scores are roughly in line with what we know from estimated phenotypic population IQs (Lynn, 2006) and previous genotypic estimates (e.g. Piffer, 2013; Piffer, 2015). There are a few discrepancies, however: the extremely low score achieved by Oceania (-2.01), and Native Americans (-0.72) scoring lower than Africans (-0.19). The higher than expected score of Africans is due to Pygmies scoring relatively high compard to Bantu groups. However, Pygmies are extremely divergent from other African and non African populations (Patin et al., 2009) so it’s likely that PGS scores lose validity when applied to them. Europeans score higher than other groups but slighly lower (0.81) than East Asians (1.11). As I noted before (Piffer, 2013), the much lower Native American score suggests that selective pressure on East Asian intelligence/educational attitude post-dates the split between the two groups, which occurred between about 25-36 Kya (Moreno-Mayar et al., 2018).

The two major limitations of this approach are: 1) Very low coverage of the HGDP-CEPH database, which comprises only about 10% of all the SNPs present in 1000 Genomes; 2) Very small sample sizes,with some populations numbering less than 10 individuals (e.g. Italy - Tuscan= 8; San=5; Bantu=8; Colombia - Piapoco and Curripaco= 7; China - Lahu=8).

References:

Lynn, Richard (2006). Race Differences in Intelligence: An Evolutionary Analysis. Washington Summit Publishers. ISBN 1593680198.

Moreno-Mayar, J.V. et al. (2018).Terminal Pleistocene Alaskan genome reveals first founding population of Native Americans. Nature,553,203–207. doi:10.1038/nature25173

Patin, E.; Laval, G.; Barreiro, L. B.; Salas, A.; Semino, O.; Santachiara-Benerecetti, S.; Kidd, K. K.; Kidd, J. R.; et al. (2009). Di Rienzo, Anna, ed. “Inferring the Demographic History of African Farmers and Pygmy Hunter–Gatherers Using a Multilocus Resequencing Data Set”. PLoS Genetics. 5 (4): e1000448. doi:10.1371/journal.pgen.1000448. P

Piffer, D. (2015). A review of intelligence GWAS hits: Their relationship to country IQ and the issue of spatial autocorrelation. Intelligence, 53: 43-50.

Piffer, D. (2013). Factor Analysis of Population Allele Frequencies as a Simple, Novel Method of Detecting Signals of Recent Polygenic Selection: The Example of Educational Attainment and IQ. Mankind Quarterly, 54, 168-200.

HGDP Lee et al. PGS

Davide Piffer

29/03/2018