##Exercise 5 #Remember to comment your code below as much as possible. #For each question below, please include your code as well as the answer. #We will consider 5284 SNPs between 100 Mb and 120 Mb on Chromosome 10.

#Open the “genotype_and_phenotype_data_QCd_Ex5.txt” file. Use no sep= option, or use sep=’ ‘.This is the QC’d genotype and phenotype that we generated in last week’s class. However we have a different phenotype: triglyceride levels.

# Open “genotype_and_phenotype_data_QCd_Ex5.txt” dataset
GenPhen<-read.table("genotype_and_phenotype_data_QCd_Ex5.txt", sep = " ", header = T)

#Open the “map_QCd_Ex5.map” map file. This is also the QC’d map file that we generated last week.

# open "map_QCd_Ex5.map" dataset
map <- read.table("map_QCd_Ex5.map", sep = " ", header = T)

#1. Using the all.equal() command, ensure that the list of SNPs in the genotype/phenotype file is the exact same as the list of SNPs in the map file. Convert ‘SNP_A’ variable in map file to a “character” variable. Confirm the number of SNPs in the map file using nrow() or dim(). Remember that there is an extra column at the end of the genos dataset which is “% missing”

#convert SNP_A to character
map$SNP_A<-as.character(map$SNP_A)
# ensure that the list of SNPs in the genotype/phenotype file is the exact same as the list of SNPs in the map file
all.equal(colnames(GenPhen)[13:5296], map$SNP_A)
## [1] TRUE
## Yes, the SNPs in the map file match the SNPs in the genotype/phenotype file 
#Confirm the number of SNPs in the map file using
nrow(map)
## [1] 5284
## There are 5284 SNPs in the map file

#2. Perform PCA in the entire sample. Missing values have already been imputed based on the mean. #Show PC1 vs.PC2 plot.

# Perform PCA on dataset
pca<-prcomp(GenPhen[,13:5296], scale.=TRUE, center=TRUE)
# plot PCA to show variation explained by PCs
plot(pca, main = "PCA")

# Plot PC1 vsPC2 and add legend
COLOR<-c(1:11)
plot(pca$x, col=COLOR[GenPhen$population],  pch=1)
legend("bottomright",legend=levels(GenPhen$population), col=c(1:4), pch=1, horiz=T, cex=0.55, pt.cex=0.7)

#3. Plot the Triglyceride variable in a histogram. FYI, the values of triglycerides are standardized.

hist(GenPhen$Triglyc, main = "Histogram - Triglyceride", xlab = "Triglyceride Level (Standardized)", col = "blue")

#4. Using linear regression (lm()), perform the association between each SNP and triglycerides in the entire sample, including sex, age and the first two PCs as covariates. Plot and show the Q-Q plot of the p-values. Does the observed p-value distribution appear to be inflated?

#test linear model on first SNP
model <- lm(GenPhen$Triglyc ~ GenPhen[,13] + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
summary(model)$coeff
##                   Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)   -1.542738123 0.170079549 -9.0706856 4.839162e-19
## GenPhen[, 13]  0.089734096 0.082774652  1.0840770 2.785529e-01
## GenPhen$sex   -0.026489430 0.066694326 -0.3971767 6.913093e-01
## GenPhen$AGE    0.032670061 0.002800986 11.6637738 7.972032e-30
## pca$x[, 1]     0.017139924 0.001810075  9.4691769 1.488637e-20
## pca$x[, 2]    -0.004634388 0.002217962 -2.0894807 3.687894e-02
# Create summary table 
OUT<-as.data.frame(matrix(nrow=5284, ncol=5, NA))
colnames(OUT)<-c("SNP", "position", "estimate", "std.err", "pvalue")
# OUT<-data.frame(SNP=character(N), estimate=numeric(N), std.err=numeric(N), pvalue=numeric(N), stringsAsFactors=F)
OUT[,1]<-colnames(GenPhen)[13:5296]
OUT[,2]<-map$kb
# make a linear regression model to find association between SNP and Tryglyceride level, covariates: sex, age, PC1, PC2 
for (i in 1:5284){
  model <- lm(GenPhen$Triglyc ~ GenPhen[,12+i] + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
OUT[i,3]<-summary(model)$coeff[2,1]
  OUT[i,4]<-summary(model)$coeff[2,2]
  OUT[i,5]<-summary(model)$coeff[2,4]
}
#Plot a Q-Q plot 
expP<-runif(5284, 0, 1)
ordexpP<-expP[order(expP)]
obsP<-as.numeric(OUT[,5])
ordobsP<-obsP[order(obsP)]

plot(-log10(ordexpP), -log10(ordobsP), main = "QQ Plot", ylab = "-log P Values", xlab = "-log Expected P")
abline(0,1, col="red")

## We would interpret this QQ plot to show this association is not inflated.

#5. How many SNPs are statistically significant using a Bonferroni correction (divide 0.05 by number of statistical tests performed to get new p-value threshold)?

#Bonferroni Correction
x<-0.05/5284
# Number of SNP P-values above Bonferroni Corected P-Value
sum(OUT[ ,5]<x)
## [1] 27
## There are 27 SNPs that remain significant after the Bonferroni correction

#6. Plot the Manhattan plot, and add a red horizontal line indicating the Bonferroni p-value threshold (hint: abline(h=pvaluethreshold, …)). You can also ‘zoom in’ on the plot by adding this to the plot() command: ylim=c(0,20)

#Make a Manahttan plot 
plot(map[,3], -log10(as.numeric(OUT[,5])), main="Manhattan Plot - Triglycerides", xlab = "Chromosome 10 coordinates", ylab = "-log10 Pvalues", ylim=c(0,20))
#Add Bonferroni line
abline(h=(-log10(x)), col = "red")

#7. What are the “top SNPs”, defined as being more than 1Mb apart and being statistically significant according to ? and their respective risk (triglyceride increasing) alleles? Hint, use: head(OUT[order(OUT[,5]),], 30) to sort OUT file by p-value column. Go down SNPs and don’t include SNPs that are within 1Mb (1 million bases) of any SNP already included in the set of significant SNPs.

#subset OUT file to include only SNPs above Bonferroni correction
OUT2 <- OUT[-(which(OUT$pvalue > x)),]
#Order Top OUT2 by Pvalues
OUT3 <- OUT2[order(OUT2[,5]),]
#Manually identify SNPS above Bonferroni significance more than 1Mb apart
OUT4 <- OUT3[c(1,2,3,5,6,23,24),]
print(OUT4)
##               SNP  position   estimate    std.err       pvalue
## 2677  rs2482596_A 110346925  0.7624480 0.04677621 5.195558e-54
## 2     rs1983865_T 100006329 -0.4161721 0.04978483 1.760894e-16
## 3274  rs4917597_A 112935471 -0.4207157 0.05147495 7.699630e-16
## 4810  rs1014246_T 118456032 -0.3636481 0.05090876 1.595488e-12
## 4036 rs11196455_C 115486873  0.3454538 0.05242513 6.645503e-11
## 802   rs7092614_A 102896619  0.2392274 0.04923544 1.340220e-06
## 4414  rs1270442_G 116938164  0.2302144 0.04909859 3.069205e-06
#The Top SNPs more than 1Mb apart are: rs2482596_A (risk), rs1983865_T (protective), rs4917597_A (protective), rs1014246_T (protective), rs11196455_C (risk), rs7092614_A (risk), rs1270442_G (risk)

#8. Calculate a weighted genetic risk score, where weights correspond to the regression coefficients.

#Make a top SNP vector
tophits<-c("rs2482596_A", "rs1983865_T", "rs4917597_A", "rs1014246_T", "rs11196455_C", "rs7092614_A", "rs1270442_G" )
#Subset Genotype/phenotype file for Top SNPs
topgeno<-GenPhen[,which(colnames(GenPhen)%in%tophits)]
OUT5 <- OUT[which(OUT$SNP%in%colnames(topgeno)),]
# Convert to a matrix
topgeno<-as.matrix(topgeno)
# Calculate weighted genetic risk scores
WGRS<-topgeno%*%OUT5$estimate

#9. Plot a histogram of the weighted GRS. Plot a scatter plot of the weighted GRS and triglyceride levels.

#Histogram of WGRS
hist(WGRS, col = "purple")

#Scatterplot of WGRS vs triglyceride levels
plot(WGRS, GenPhen$Triglyc, main = "Weighted Genetic Risk Scores (WGRS) x Triglyceride Levels", xlab = "WGRS", ylab = "Triclyceride Levels", col="green")

#10. Perform and show results of association between the weighted genetic risk score and triglycerides, controlling for age, sex, PC1, and PC2?

#perform linear regression
model2 <- lm(GenPhen$Triglyc ~ WGRS + GenPhen$sex + GenPhen$AGE + pca$x[,1] + pca$x[,2])
summary(model2)
## 
## Call:
## lm(formula = GenPhen$Triglyc ~ WGRS + GenPhen$sex + GenPhen$AGE + 
##     pca$x[, 1] + pca$x[, 2])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.71706 -0.59351  0.00467  0.55106  2.63889 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.091422   0.128741  -8.478   <2e-16 ***
## WGRS         1.029188   0.035174  29.260   <2e-16 ***
## GenPhen$sex  0.028940   0.050769   0.570   0.5688    
## GenPhen$AGE  0.019192   0.002180   8.802   <2e-16 ***
## pca$x[, 1]  -0.002648   0.001340  -1.975   0.0484 *  
## pca$x[, 2]  -0.003993   0.001687  -2.367   0.0181 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8709 on 1176 degrees of freedom
## Multiple R-squared:  0.5472, Adjusted R-squared:  0.5453 
## F-statistic: 284.2 on 5 and 1176 DF,  p-value: < 2.2e-16
## The WGRS is a significant predictor of Triglyceride level.  Age also is a significant predictor.