##Exercise 5 #Remember to comment your code below as much as possible. #For each question below, please include your code as well as the answer. #We will consider 5284 SNPs between 100 Mb and 120 Mb on Chromosome 10.
#Open the “genotype_and_phenotype_data_QCd_Ex5.txt” file. Use no sep= option, or use sep=’ ‘.This is the QC’d genotype and phenotype that we generated in last week’s class. However we have a different phenotype: triglyceride levels.
# Open “genotype_and_phenotype_data_QCd_Ex5.txt” dataset
GenPhen<-read.table("genotype_and_phenotype_data_QCd_Ex5.txt", sep = " ", header = T)
#Open the “map_QCd_Ex5.map” map file. This is also the QC’d map file that we generated last week.
# open "map_QCd_Ex5.map" dataset
map <- read.table("map_QCd_Ex5.map", sep = " ", header = T)
#1. Using the all.equal() command, ensure that the list of SNPs in the genotype/phenotype file is the exact same as the list of SNPs in the map file. Convert ‘SNP_A’ variable in map file to a “character” variable. Confirm the number of SNPs in the map file using nrow() or dim(). Remember that there is an extra column at the end of the genos dataset which is “% missing”
#convert SNP_A to character
map$SNP_A<-as.character(map$SNP_A)
# ensure that the list of SNPs in the genotype/phenotype file is the exact same as the list of SNPs in the map file
all.equal(colnames(GenPhen)[13:5296], map$SNP_A)
## [1] TRUE
## Yes, the SNPs in the map file match the SNPs in the genotype/phenotype file
#Confirm the number of SNPs in the map file using
nrow(map)
## [1] 5284
## There are 5284 SNPs in the map file
#2. Perform PCA in the entire sample. Missing values have already been imputed based on the mean. #Show PC1 vs.PC2 plot.
# Perform PCA on dataset
pca<-prcomp(GenPhen[,13:5296], scale.=TRUE, center=TRUE)
# plot PCA to show variation explained by PCs
plot(pca, main = "PCA")
# Plot PC1 vsPC2 and add legend
COLOR<-c(1:11)
plot(pca$x, col=COLOR[GenPhen$population], pch=1)
legend("bottomright",legend=levels(GenPhen$population), col=c(1:4), pch=1, horiz=T, cex=0.55, pt.cex=0.7)
#3. Plot the Triglyceride variable in a histogram. FYI, the values of triglycerides are standardized.
hist(GenPhen$Triglyc, main = "Histogram - Triglyceride", xlab = "Triglyceride Level (Standardized)", col = "blue")
#4. Using linear regression (lm()), perform the association between each SNP and triglycerides in the entire sample, including sex, age and the first two PCs as covariates. Plot and show the Q-Q plot of the p-values. Does the observed p-value distribution appear to be inflated?
#test linear model on first SNP
model <- lm(GenPhen$Triglyc ~ GenPhen[,13] + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
summary(model)$coeff
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.542738123 0.170079549 -9.0706856 4.839162e-19
## GenPhen[, 13] 0.089734096 0.082774652 1.0840770 2.785529e-01
## GenPhen$sex -0.026489430 0.066694326 -0.3971767 6.913093e-01
## GenPhen$AGE 0.032670061 0.002800986 11.6637738 7.972032e-30
## pca$x[, 1] 0.017139924 0.001810075 9.4691769 1.488637e-20
## pca$x[, 2] -0.004634388 0.002217962 -2.0894807 3.687894e-02
# Create summary table
OUT<-as.data.frame(matrix(nrow=5284, ncol=5, NA))
colnames(OUT)<-c("SNP", "position", "estimate", "std.err", "pvalue")
# OUT<-data.frame(SNP=character(N), estimate=numeric(N), std.err=numeric(N), pvalue=numeric(N), stringsAsFactors=F)
OUT[,1]<-colnames(GenPhen)[13:5296]
OUT[,2]<-map$kb
# make a linear regression model to find association between SNP and Tryglyceride level, covariates: sex, age, PC1, PC2
for (i in 1:5284){
model <- lm(GenPhen$Triglyc ~ GenPhen[,12+i] + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
OUT[i,3]<-summary(model)$coeff[2,1]
OUT[i,4]<-summary(model)$coeff[2,2]
OUT[i,5]<-summary(model)$coeff[2,4]
}
#Plot a Q-Q plot
expP<-runif(5284, 0, 1)
ordexpP<-expP[order(expP)]
obsP<-as.numeric(OUT[,5])
ordobsP<-obsP[order(obsP)]
plot(-log10(ordexpP), -log10(ordobsP), main = "QQ Plot", ylab = "-log P Values", xlab = "-log Expected P")
abline(0,1, col="red")
## We would interpret this QQ plot to show this association is not inflated.
#5. How many SNPs are statistically significant using a Bonferroni correction (divide 0.05 by number of statistical tests performed to get new p-value threshold)?
#Bonferroni Correction
x<-0.05/5284
# Number of SNP P-values above Bonferroni Corected P-Value
sum(OUT[ ,5]<x)
## [1] 27
## There are 27 SNPs that remain significant after the Bonferroni correction
#6. Plot the Manhattan plot, and add a red horizontal line indicating the Bonferroni p-value threshold (hint: abline(h=pvaluethreshold, …)). You can also ‘zoom in’ on the plot by adding this to the plot() command: ylim=c(0,20)
#Make a Manahttan plot
plot(map[,3], -log10(as.numeric(OUT[,5])), main="Manhattan Plot - Triglycerides", xlab = "Chromosome 10 coordinates", ylab = "-log10 Pvalues", ylim=c(0,20))
#Add Bonferroni line
abline(h=(-log10(x)), col = "red")
#7. What are the “top SNPs”, defined as being more than 1Mb apart and being statistically significant according to ? and their respective risk (triglyceride increasing) alleles? Hint, use: head(OUT[order(OUT[,5]),], 30) to sort OUT file by p-value column. Go down SNPs and don’t include SNPs that are within 1Mb (1 million bases) of any SNP already included in the set of significant SNPs.
#subset OUT file to include only SNPs above Bonferroni correction
OUT2 <- OUT[-(which(OUT$pvalue > x)),]
#Order Top OUT2 by Pvalues
OUT3 <- OUT2[order(OUT2[,5]),]
#Manually identify SNPS above Bonferroni significance more than 1Mb apart
OUT4 <- OUT3[c(1,2,3,5,6,23,24),]
print(OUT4)
## SNP position estimate std.err pvalue
## 2677 rs2482596_A 110346925 0.7624480 0.04677621 5.195558e-54
## 2 rs1983865_T 100006329 -0.4161721 0.04978483 1.760894e-16
## 3274 rs4917597_A 112935471 -0.4207157 0.05147495 7.699630e-16
## 4810 rs1014246_T 118456032 -0.3636481 0.05090876 1.595488e-12
## 4036 rs11196455_C 115486873 0.3454538 0.05242513 6.645503e-11
## 802 rs7092614_A 102896619 0.2392274 0.04923544 1.340220e-06
## 4414 rs1270442_G 116938164 0.2302144 0.04909859 3.069205e-06
#The Top SNPs more than 1Mb apart are: rs2482596_A (risk), rs1983865_T (protective), rs4917597_A (protective), rs1014246_T (protective), rs11196455_C (risk), rs7092614_A (risk), rs1270442_G (risk)
#8. Calculate a weighted genetic risk score, where weights correspond to the regression coefficients.
#Make a top SNP vector
tophits<-c("rs2482596_A", "rs1983865_T", "rs4917597_A", "rs1014246_T", "rs11196455_C", "rs7092614_A", "rs1270442_G" )
#Subset Genotype/phenotype file for Top SNPs
topgeno<-GenPhen[,which(colnames(GenPhen)%in%tophits)]
OUT5 <- OUT[which(OUT$SNP%in%colnames(topgeno)),]
# Convert to a matrix
topgeno<-as.matrix(topgeno)
# Calculate weighted genetic risk scores
WGRS<-topgeno%*%OUT5$estimate
#9. Plot a histogram of the weighted GRS. Plot a scatter plot of the weighted GRS and triglyceride levels.
#Histogram of WGRS
hist(WGRS, col = "purple")
#Scatterplot of WGRS vs triglyceride levels
plot(WGRS, GenPhen$Triglyc, main = "Weighted Genetic Risk Scores (WGRS) x Triglyceride Levels", xlab = "WGRS", ylab = "Triclyceride Levels", col="green")
#10. Perform and show results of association between the weighted genetic risk score and triglycerides, controlling for age, sex, PC1, and PC2?
#perform linear regression
model2 <- lm(GenPhen$Triglyc ~ WGRS + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
summary(model2)
##
## Call:
## lm(formula = GenPhen$Triglyc ~ WGRS + GenPhen$sex + GenPhen$AGE +
## pca$x[, 1] + pca$x[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.71706 -0.59351 0.00467 0.55106 2.63889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.091422 0.128741 -8.478 <2e-16 ***
## WGRS 1.029188 0.035174 29.260 <2e-16 ***
## GenPhen$sex 0.028940 0.050769 0.570 0.5688
## GenPhen$AGE 0.019192 0.002180 8.802 <2e-16 ***
## pca$x[, 1] -0.002648 0.001340 -1.975 0.0484 *
## pca$x[, 2] -0.003993 0.001687 -2.367 0.0181 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8709 on 1176 degrees of freedom
## Multiple R-squared: 0.5472, Adjusted R-squared: 0.5453
## F-statistic: 284.2 on 5 and 1176 DF, p-value: < 2.2e-16
## The WGRS is a significant predictor of Triglyceride level. Age also is a significant predictor.