##EPID 677 - Spring 2020 #Exercise 6

Remember to comment your code below as much as possible.

For each question below, please include your code as well as the answer.

We will consider 5284 SNPs between 100 Mb and 120 Mb on Chromosome 10.

Open the “genotype_and_phenotype_data_QCd_Ex5.txt” file. This is the QC’d genotype and phenotype that we generated in last week’s class. However we have a different phenotype: triglyceride levels

#open "genotype_and_phenotype_data_QCd_Ex5.txt" dataset
data<-read.table("genotype_and_phenotype_data_QCd_Ex5.txt", header = T)
dim(data)
## [1] 1182 5297

Open the “map_QCd_Ex5.map” map file. This is also the QC’d map file that we generated last week.

#open "map_QCd_Ex5.map" file
map<-read.table("map_QCd_Ex5.map", header = T)
dim(map)
## [1] 5284    5
  1. First, convert the “population” variable into a character vector (as.character()). Then, subset the dataset into a training set comprised of all African ancestry individuals (ASW, YRI, MKK, LWK; n=510), and two testing sets: Indo-Europeans (GIH, CEU, TSI, GIH; n=341), and East Asians (CHB, JPT, CHD; n=254).
#convert "population" variable to a character
data$population <- as.character(data$population)
#subset training set 1 "African" - populations = ASW, YRI, MKK, LWK (n=510)
dataAfrica<- subset(data, population=="ASW"|population=="YRI"|population=="MKK"|population=="LWK")
dim(dataAfrica)
## [1]  510 5297
#subset testing set 1 "Indo-Europeans" - population = GIH, CEU, TSI, GIH (n=341)
dataIndEur<- subset(data, population=="GIH"|population=="CEU"|population=="TSI")
dim(dataIndEur)
## [1]  341 5297
#subset "testing set 2 Asians" - population = CHB, JPT, CHD (n=254)
dataAsia<- subset(data, population=="CHB"|population=="JPT"|population=="CHD")
dim(dataAsia)
## [1]  254 5297
  1. Remove SNPs with frequency <1% in the training set (since PCA will not run with SNPs that are monomorphic). Remember to also remove these SNPs from the Map file.
#create function to calculate allele frequency
for(i in 1:5284){
  map$MAF[i]<-((mean(dataAfrica[,i+12], na.rm=T))/2)*100
}
sum(map$MAF<1) #262 SNPs with MAF <1.0%
## [1] 262
#remove SNP with MAF <1% from map file - make map2 file
map2<-map[-which(map$MAF<1),]
dim(map2)
## [1] 5022    6
#remove SNPS with MAF <1% from Training dataAfrica - make training dataAfrica2
dataAfrica2<-dataAfrica[,-(which(map$MAF<1)+12)]
dim(dataAfrica2)
## [1]  510 5035
#confirm SNPs are the same in map2 and dataAfrica2
map2$SNP_A<-as.character(map2$SNP_A) #convert "map2$SNP_A" to character
all.equal(map2$SNP_A, colnames(dataAfrica2)[13:5034])
## [1] TRUE

Then, perform PCA in the training set. Missing values have already been imputed based on the mean. Also, remember that the last variable in the genotype dataset is a missing percent variable. So when you do your PCA, do not include the last column.

# Perform PCA on training dataAfrica2, columns 13-5034 contain SNP data
pca<-prcomp(dataAfrica2[,13:5034], scale.=TRUE, center=TRUE)
# plot PCA to show variation explained by PCs
plot(pca, main = "Principal Components Analysis", xlab = "Principal Components 1 - 10")

  1. Perform linear association of each SNPs with triglycerides in the training set, controlling for age, sex and first two PCs.
# Create summary table
OUTtrain<-as.data.frame(matrix(nrow=5022, ncol=5, NA))
#Name columns in OUTtrain<-data.frame(SNP=character(N), position=numeric(N) estimate=numeric(N), std.err=numeric(N), pvalue=numeric(N), stringsAsFactors=F)
colnames(OUTtrain)<-c("SNP", "position", "estimate", "std.err", "pvalue")
#1st column populated with SNPs names from training dataAfrica2
OUTtrain[,1]<-colnames(dataAfrica2)[13:5034]
#2nd column populated with map coordinates of SNPs from map2 dataset
OUTtrain[,2]<-map2$kb
#perfom linear regression on SNPs compared to triglyceride level, populate 3rd column with model beta/estimate, 4th column with model standard error
for (i in 1:5022){
  model<-lm(dataAfrica2$Triglyc ~ dataAfrica2[,12+i] + dataAfrica2$AGE + dataAfrica2$SEX + pca$x[,1] + pca$x[,2])
  OUTtrain[i,3]<-summary(model)$coeff[2,1]
  OUTtrain[i,4]<-summary(model)$coeff[2,2]
  OUTtrain[i,5]<-summary(model)$coeff[2,4]
}

Show Q-Q plot.

#calculate expected
expP<-runif(5022, 0, 1)
#order expected values
ordexpP<-expP[order(expP)]
#identify observed values
obsP<-as.numeric(OUTtrain[,5])
#order observed values
ordobsP<-obsP[order(obsP)]
#Plot a Q-Q plot
plot(-log10(ordexpP), -log10(ordobsP))
#add expected line
abline(0,1, col="red")

Show Manhattan plot. In Manhattan plot Include a horizontal line indicating Bonferroni p-value threshold.

#calculate bonferroni correction
b<-0.05/5022
#create manhatten plot
plot(map2[,3], -log10(as.numeric(OUTtrain[,5])), main="Manhatten Plot", ylab = "-log10 p-value", xlab = "chromosome coordinates", col="blue")
#add bonferroni line
abline(h=-log10(b), col="red")

  1. What are the “top SNPs”, defined as being more than 1Mb apart? and their respective risk (triglyceride increasing) alleles?
#SNPs significant above bonferroni
sum(OUTtrain[,5]<b) #16
## [1] 16
#Show top SNPs
head(OUTtrain[order(OUTtrain[,5]),], 16)
##                                 SNP  position   estimate    std.err
## 2530                    rs2482596_A 110346925  0.8400530 0.06571400
## 2                       rs1983865_T 100006329 -0.5204350 0.08073011
## 3106                    rs4917597_A 112935471 -0.5306120 0.08837324
## 3107 AFFX.SNP_10201128__rs4917597_A 112935471 -0.5306120 0.08837324
## 3101                    rs7088684_T 112921039  0.4591595 0.08217378
## 3104                    rs6585041_C 112932828  0.4366574 0.08232342
## 4565                    rs3010471_G 118445593  0.3909407 0.07797220
## 4569                    rs1014246_T 118456032 -0.3696429 0.07642374
## 4562                    rs2921971_G 118442815  0.3734670 0.07785024
## 4557                    rs1900500_C 118438068  0.3598746 0.07791440
## 4558                    rs2921967_A 118438253  0.3545337 0.07775556
## 4559                    rs2921969_G 118440565  0.3545337 0.07775556
## 2524                    rs7100815_G 110338366  0.3507093 0.07709796
## 3840                   rs11196455_C 115486873  0.3480999 0.07659380
## 4560                    rs3010464_G 118441015  0.3534782 0.07778543
## 4568                    rs3010491_G 118455190  0.3454399 0.07718010
##            pvalue
## 2530 1.311633e-32
## 2    2.680340e-10
## 3106 3.681406e-09
## 3107 3.681406e-09
## 3101 3.772162e-08
## 3104 1.697651e-07
## 4565 7.401754e-07
## 4569 1.756152e-06
## 4562 2.121872e-06
## 4557 4.905273e-06
## 4558 6.441198e-06
## 4559 6.441198e-06
## 2524 6.764304e-06
## 3840 6.892780e-06
## 4560 6.907896e-06
## 4568 9.421956e-06
#Top SNPs at least 1 Mb apart are: rs2482596_A (risk), rs1983865_T (protective), rs4917597_A (protective), rs3010471_G (risk), rs11196455_C (risk)
  1. Calculate a weighted genetic risk score, where weights correspond to the regression coefficients. Make sure that order of SNPs is in same order as the corresponding weights.
#create top SNPs vector
tophits<-c("rs1983865_T", "rs2482596_A", "rs4917597_A", "rs11196455_C", "rs3010471_G")
#make a dataset with the phenotypes of just the 5 top SNPs
topgeno<-dataAfrica2[,which(colnames(dataAfrica2)%in%tophits)]
#identify estimate/beta/risk for the top 5 SNPs
weights2<-OUTtrain[which(OUTtrain$SNP%in%colnames(topgeno)),3]
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)
#Calculate weighted risk score for topgenoflipped
WGRS<-as.matrix(topgeno)%*%weights
#vizualize data comparison
plot(dataAfrica2$Triglyc, WGRS, main = "African Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")

  1. Obtain the residuals from the association of triglycerides with age, sex, PC1, and PC2. You can use: resid<-resid(lm(trn\(Triglyc ~ trn\)AGE + trn\(SEX + pca\)x[,1] + pca$x[,2])). Then, examine the association between resid and the weighted genetic risk score. What proportion of the variation (r-squared) in the adjusted triglycerides explained by the weighted GRS? (Use: summary(lm(resid~GRS)))
#Obtain the residuals from the association of triglycerides with age, sex, PC1, and PC2
resid<-resid(lm(dataAfrica2$Triglyc ~ dataAfrica2$AGE + dataAfrica2$SEX + pca$x[,1] + pca$x[,2]))
#examine the association between resid and the weighted genetic risk score
summary(lm(resid~WGRS))
## 
## Call:
## lm(formula = resid ~ WGRS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9236 -0.6489 -0.0381  0.5807  3.3162 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.72901    0.05823  -12.52   <2e-16 ***
## WGRS         0.81591    0.04554   17.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9409 on 508 degrees of freedom
## Multiple R-squared:  0.3873, Adjusted R-squared:  0.386 
## F-statistic: 321.1 on 1 and 508 DF,  p-value: < 2.2e-16
#the r^2 = 0.39
  1. In the EUR, and in the EAS, what proportion of the variation in the residualized triglycerides (age and sex only) is explained by the genetic risk score that was developed in the AFR individuals? You will first need to make a GRS in the EUR and EAS datasets, using the same SNPs and weights as you used in the training dataset (i.e. AFR).
#make a dataset with the phenotypes of just the 5 top SNPs in first test dataset ("dataIndEur")
topgeno2<-dataIndEur[,which(colnames(dataIndEur)%in%tophits)]
#previously identified estimate/beta/risk for the top 5 SNPs in the training dataset
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)
#Calculate weighted risk score for topgeno2
WGRS2<-as.matrix(topgeno2)%*%weights
#vizualize data comparison
plot(dataIndEur$Triglyc, WGRS2, main = "Indo European Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")

resid2<-resid(lm(dataIndEur$Triglyc ~ dataIndEur$AGE + dataIndEur$SEX))
summary(lm(resid2~WGRS2))
## 
## Call:
## lm(formula = resid2 ~ WGRS2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87646 -0.60954 -0.00271  0.59643  2.54933 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.15543    0.05269    2.95   0.0034 ** 
## WGRS2        0.74752    0.06567   11.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9398 on 339 degrees of freedom
## Multiple R-squared:  0.2765, Adjusted R-squared:  0.2744 
## F-statistic: 129.6 on 1 and 339 DF,  p-value: < 2.2e-16
#the r^2 = 0.28
#make a dataset with the phenotypes of just the 5 top SNPs in second test dataset ("dataAsian")
topgeno3<-dataAsia[,which(colnames(dataAsia)%in%tophits)]
head(topgeno3)
##     rs1983865_T rs2482596_A rs4917597_A rs11196455_C rs3010471_G
## 166           2           1           1            0           0
## 167           1           0           1            0           2
## 168           2           0           0            0           2
## 169           1           1           1            0           2
## 170           1           0           1            1           1
## 171           0           0           1            0           1
#previously identified estimate/beta/risk for the top 5 SNPs in the training dataset
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)

#Calculate weighted risk score for topgeno3
WGRS3<-as.matrix(topgeno3)%*%weights
#vizualize data comparison
plot(dataAsia$Triglyc, WGRS3, main = "Asian Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")

resid3<-resid(lm(dataAsia$Triglyc ~ dataAsia$AGE + dataAsia$SEX))
summary(lm(resid3~WGRS3))
## 
## Call:
## lm(formula = resid3 ~ WGRS3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87177 -0.56651 -0.02237  0.58175  2.62359 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.19559    0.06167   3.172   0.0017 ** 
## WGRS3        0.82182    0.08098  10.148   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9335 on 252 degrees of freedom
## Multiple R-squared:  0.2901, Adjusted R-squared:  0.2873 
## F-statistic:   103 on 1 and 252 DF,  p-value: < 2.2e-16
#the r^2 = 0.29