##EPID 677 - Spring 2020 #Exercise 6
Remember to comment your code below as much as possible.
For each question below, please include your code as well as the answer.
We will consider 5284 SNPs between 100 Mb and 120 Mb on Chromosome 10.
Open the “genotype_and_phenotype_data_QCd_Ex5.txt” file. This is the QC’d genotype and phenotype that we generated in last week’s class. However we have a different phenotype: triglyceride levels
#open "genotype_and_phenotype_data_QCd_Ex5.txt" dataset
data<-read.table("genotype_and_phenotype_data_QCd_Ex5.txt", header = T)
dim(data)
## [1] 1182 5297
Open the “map_QCd_Ex5.map” map file. This is also the QC’d map file that we generated last week.
#open "map_QCd_Ex5.map" file
map<-read.table("map_QCd_Ex5.map", header = T)
dim(map)
## [1] 5284 5
#convert "population" variable to a character
data$population <- as.character(data$population)
#subset training set 1 "African" - populations = ASW, YRI, MKK, LWK (n=510)
dataAfrica<- subset(data, population=="ASW"|population=="YRI"|population=="MKK"|population=="LWK")
dim(dataAfrica)
## [1] 510 5297
#subset testing set 1 "Indo-Europeans" - population = GIH, CEU, TSI, GIH (n=341)
dataIndEur<- subset(data, population=="GIH"|population=="CEU"|population=="TSI")
dim(dataIndEur)
## [1] 341 5297
#subset "testing set 2 Asians" - population = CHB, JPT, CHD (n=254)
dataAsia<- subset(data, population=="CHB"|population=="JPT"|population=="CHD")
dim(dataAsia)
## [1] 254 5297
#create function to calculate allele frequency
for(i in 1:5284){
map$MAF[i]<-((mean(dataAfrica[,i+12], na.rm=T))/2)*100
}
sum(map$MAF<1) #262 SNPs with MAF <1.0%
## [1] 262
#remove SNP with MAF <1% from map file - make map2 file
map2<-map[-which(map$MAF<1),]
dim(map2)
## [1] 5022 6
#remove SNPS with MAF <1% from Training dataAfrica - make training dataAfrica2
dataAfrica2<-dataAfrica[,-(which(map$MAF<1)+12)]
dim(dataAfrica2)
## [1] 510 5035
#confirm SNPs are the same in map2 and dataAfrica2
map2$SNP_A<-as.character(map2$SNP_A) #convert "map2$SNP_A" to character
all.equal(map2$SNP_A, colnames(dataAfrica2)[13:5034])
## [1] TRUE
Then, perform PCA in the training set. Missing values have already been imputed based on the mean. Also, remember that the last variable in the genotype dataset is a missing percent variable. So when you do your PCA, do not include the last column.
# Perform PCA on training dataAfrica2, columns 13-5034 contain SNP data
pca<-prcomp(dataAfrica2[,13:5034], scale.=TRUE, center=TRUE)
# plot PCA to show variation explained by PCs
plot(pca, main = "Principal Components Analysis", xlab = "Principal Components 1 - 10")
# Create summary table
OUTtrain<-as.data.frame(matrix(nrow=5022, ncol=5, NA))
#Name columns in OUTtrain<-data.frame(SNP=character(N), position=numeric(N) estimate=numeric(N), std.err=numeric(N), pvalue=numeric(N), stringsAsFactors=F)
colnames(OUTtrain)<-c("SNP", "position", "estimate", "std.err", "pvalue")
#1st column populated with SNPs names from training dataAfrica2
OUTtrain[,1]<-colnames(dataAfrica2)[13:5034]
#2nd column populated with map coordinates of SNPs from map2 dataset
OUTtrain[,2]<-map2$kb
#perfom linear regression on SNPs compared to triglyceride level, populate 3rd column with model beta/estimate, 4th column with model standard error
for (i in 1:5022){
model<-lm(dataAfrica2$Triglyc ~ dataAfrica2[,12+i] + dataAfrica2$AGE + dataAfrica2$SEX + pca$x[,1] + pca$x[,2])
OUTtrain[i,3]<-summary(model)$coeff[2,1]
OUTtrain[i,4]<-summary(model)$coeff[2,2]
OUTtrain[i,5]<-summary(model)$coeff[2,4]
}
Show Q-Q plot.
#calculate expected
expP<-runif(5022, 0, 1)
#order expected values
ordexpP<-expP[order(expP)]
#identify observed values
obsP<-as.numeric(OUTtrain[,5])
#order observed values
ordobsP<-obsP[order(obsP)]
#Plot a Q-Q plot
plot(-log10(ordexpP), -log10(ordobsP))
#add expected line
abline(0,1, col="red")
Show Manhattan plot. In Manhattan plot Include a horizontal line indicating Bonferroni p-value threshold.
#calculate bonferroni correction
b<-0.05/5022
#create manhatten plot
plot(map2[,3], -log10(as.numeric(OUTtrain[,5])), main="Manhatten Plot", ylab = "-log10 p-value", xlab = "chromosome coordinates", col="blue")
#add bonferroni line
abline(h=-log10(b), col="red")
#SNPs significant above bonferroni
sum(OUTtrain[,5]<b) #16
## [1] 16
#Show top SNPs
head(OUTtrain[order(OUTtrain[,5]),], 16)
## SNP position estimate std.err
## 2530 rs2482596_A 110346925 0.8400530 0.06571400
## 2 rs1983865_T 100006329 -0.5204350 0.08073011
## 3106 rs4917597_A 112935471 -0.5306120 0.08837324
## 3107 AFFX.SNP_10201128__rs4917597_A 112935471 -0.5306120 0.08837324
## 3101 rs7088684_T 112921039 0.4591595 0.08217378
## 3104 rs6585041_C 112932828 0.4366574 0.08232342
## 4565 rs3010471_G 118445593 0.3909407 0.07797220
## 4569 rs1014246_T 118456032 -0.3696429 0.07642374
## 4562 rs2921971_G 118442815 0.3734670 0.07785024
## 4557 rs1900500_C 118438068 0.3598746 0.07791440
## 4558 rs2921967_A 118438253 0.3545337 0.07775556
## 4559 rs2921969_G 118440565 0.3545337 0.07775556
## 2524 rs7100815_G 110338366 0.3507093 0.07709796
## 3840 rs11196455_C 115486873 0.3480999 0.07659380
## 4560 rs3010464_G 118441015 0.3534782 0.07778543
## 4568 rs3010491_G 118455190 0.3454399 0.07718010
## pvalue
## 2530 1.311633e-32
## 2 2.680340e-10
## 3106 3.681406e-09
## 3107 3.681406e-09
## 3101 3.772162e-08
## 3104 1.697651e-07
## 4565 7.401754e-07
## 4569 1.756152e-06
## 4562 2.121872e-06
## 4557 4.905273e-06
## 4558 6.441198e-06
## 4559 6.441198e-06
## 2524 6.764304e-06
## 3840 6.892780e-06
## 4560 6.907896e-06
## 4568 9.421956e-06
#Top SNPs at least 1 Mb apart are: rs2482596_A (risk), rs1983865_T (protective), rs4917597_A (protective), rs3010471_G (risk), rs11196455_C (risk)
#create top SNPs vector
tophits<-c("rs1983865_T", "rs2482596_A", "rs4917597_A", "rs11196455_C", "rs3010471_G")
#make a dataset with the phenotypes of just the 5 top SNPs
topgeno<-dataAfrica2[,which(colnames(dataAfrica2)%in%tophits)]
#identify estimate/beta/risk for the top 5 SNPs
weights2<-OUTtrain[which(OUTtrain$SNP%in%colnames(topgeno)),3]
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)
#Calculate weighted risk score for topgenoflipped
WGRS<-as.matrix(topgeno)%*%weights
#vizualize data comparison
plot(dataAfrica2$Triglyc, WGRS, main = "African Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")
#Obtain the residuals from the association of triglycerides with age, sex, PC1, and PC2
resid<-resid(lm(dataAfrica2$Triglyc ~ dataAfrica2$AGE + dataAfrica2$SEX + pca$x[,1] + pca$x[,2]))
#examine the association between resid and the weighted genetic risk score
summary(lm(resid~WGRS))
##
## Call:
## lm(formula = resid ~ WGRS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9236 -0.6489 -0.0381 0.5807 3.3162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.72901 0.05823 -12.52 <2e-16 ***
## WGRS 0.81591 0.04554 17.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9409 on 508 degrees of freedom
## Multiple R-squared: 0.3873, Adjusted R-squared: 0.386
## F-statistic: 321.1 on 1 and 508 DF, p-value: < 2.2e-16
#the r^2 = 0.39
#make a dataset with the phenotypes of just the 5 top SNPs in first test dataset ("dataIndEur")
topgeno2<-dataIndEur[,which(colnames(dataIndEur)%in%tophits)]
#previously identified estimate/beta/risk for the top 5 SNPs in the training dataset
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)
#Calculate weighted risk score for topgeno2
WGRS2<-as.matrix(topgeno2)%*%weights
#vizualize data comparison
plot(dataIndEur$Triglyc, WGRS2, main = "Indo European Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")
resid2<-resid(lm(dataIndEur$Triglyc ~ dataIndEur$AGE + dataIndEur$SEX))
summary(lm(resid2~WGRS2))
##
## Call:
## lm(formula = resid2 ~ WGRS2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87646 -0.60954 -0.00271 0.59643 2.54933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15543 0.05269 2.95 0.0034 **
## WGRS2 0.74752 0.06567 11.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9398 on 339 degrees of freedom
## Multiple R-squared: 0.2765, Adjusted R-squared: 0.2744
## F-statistic: 129.6 on 1 and 339 DF, p-value: < 2.2e-16
#the r^2 = 0.28
#make a dataset with the phenotypes of just the 5 top SNPs in second test dataset ("dataAsian")
topgeno3<-dataAsia[,which(colnames(dataAsia)%in%tophits)]
head(topgeno3)
## rs1983865_T rs2482596_A rs4917597_A rs11196455_C rs3010471_G
## 166 2 1 1 0 0
## 167 1 0 1 0 2
## 168 2 0 0 0 2
## 169 1 1 1 0 2
## 170 1 0 1 1 1
## 171 0 0 1 0 1
#previously identified estimate/beta/risk for the top 5 SNPs in the training dataset
weights<-c(-0.52, 0.84, -0.531, 0.348, 0.391)
#Calculate weighted risk score for topgeno3
WGRS3<-as.matrix(topgeno3)%*%weights
#vizualize data comparison
plot(dataAsia$Triglyc, WGRS3, main = "Asian Population subset", xlab = "Triglyceride level", ylab = "Weighted Genetic Risk Score")
resid3<-resid(lm(dataAsia$Triglyc ~ dataAsia$AGE + dataAsia$SEX))
summary(lm(resid3~WGRS3))
##
## Call:
## lm(formula = resid3 ~ WGRS3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87177 -0.56651 -0.02237 0.58175 2.62359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.19559 0.06167 3.172 0.0017 **
## WGRS3 0.82182 0.08098 10.148 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9335 on 252 degrees of freedom
## Multiple R-squared: 0.2901, Adjusted R-squared: 0.2873
## F-statistic: 103 on 1 and 252 DF, p-value: < 2.2e-16
#the r^2 = 0.29