#Set-up
library(nlme)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggfortify)
library(gridExtra)
library(grDevices)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ purrr 0.3.4
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(leaps)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(tibble)
library(readr)
library(boot)
library(leaps)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-4
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(Matrix)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
## The following object is masked from 'package:purrr':
##
## lift
library(readxl)
library(readr)
library(fable)
## Loading required package: fabletools
##
## Attaching package: 'fabletools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
Constipation = read_excel("~/Desktop/BKN599FinalProject/BKN599FinalProject.xlsx")
## New names:
## * `` -> ...1
People with Parkinson’s disease (PD) suffer from constipation. Constipation has been linked to disease severity, mostly measured as motor function via the Unified Parkinson’s disease Rating Scale (UPDRS-III). Constipation is classified by assessing the prevalence on a scale of 1-10 of 9 variables. If someone has more than 4, they are classified as having constipation.
However, each of these symptoms may indicate different pathological considerations for neurologic disorders that affect gut health. For instance, having less than 3 bowel movements per week may indicate that a person’s autonomic nervous system is dysregulated, whereas having hard stool may indicate dehydration or dietary issues.
Thus, the purpose of my project was to determine, in 71 people with PD: 1) which (if any) of the 9 predictors of constipation symptoms more significantly interracted with a UPDRS 2) whether the dimensions of the model need to be reduced OR 3) whether the sample size needs to be increased to adhere to the large number of predictors.
This is an exploratory analysis to determine if the model formulated needs to be reduced OR if all variables must be included but the sample size increased to better-power the analysis.
First, I split the dataset to work with a training subset.
set.seed(11)
x= data.matrix(Constipation [, c('Straining', 'Hard', 'Emptying', 'Blockage' , 'BMs' , 'Pain', 'Watery', 'Incontinence','Bloat')])
y = Constipation$UPDRS
train.index = sample(1:nrow(Constipation), round(nrow(Constipation) * 0.85)) #I split the data 85% train and removed 15% for test set to be used later.
train = Constipation[train.index, ]
test = Constipation[-train.index, ]
train.matrix = dummyVars(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, Constipation, data = train, fullRank = F) %>%
predict(newdata = train) %>%
as.matrix()
test.matrix = dummyVars(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, Constipation, data = test, fullRank = F) %>%
predict(newdata = test) %>%
as.matrix()
###A. Linear Model I calculated a multiple linear regression model to assess which constipation symptoms most significantly interacted with motor function (UPDRS-III score) in people with PD.
lm1 = lm(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, data = Constipation)
summary(lm1)
##
## Call:
## lm(formula = UPDRS ~ Straining + Hard + Emptying + Blockage +
## BMs + Pain + Watery + Incontinence + Bloat, data = Constipation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3697 -1.2467 0.4155 1.8397 6.4120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.76575 2.85079 12.195 < 2e-16 ***
## Straining -0.48043 0.24108 -1.993 0.050760 .
## Hard -0.94896 0.23232 -4.085 0.000131 ***
## Emptying 0.65954 0.26470 2.492 0.015452 *
## Blockage -0.22294 0.19521 -1.142 0.257880
## BMs 1.68051 0.30034 5.595 5.53e-07 ***
## Pain 0.03959 0.18377 0.215 0.830148
## Watery 0.46109 0.16842 2.738 0.008095 **
## Incontinence 0.14690 0.26241 0.560 0.577665
## Bloat 0.71312 0.29105 2.450 0.017164 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.953 on 61 degrees of freedom
## Multiple R-squared: 0.9182, Adjusted R-squared: 0.9062
## F-statistic: 76.11 on 9 and 61 DF, p-value: < 2.2e-16
lm.pred = predict(lm1, Constipation)
lm.mse = mean((lm.pred - train$UPDRS)^2)
## Warning in lm.pred - train$UPDRS: longer object length is not a multiple of
## shorter object length
#Comments: It appears 5 of the 9 predictors, hard stool, incomplete emptying, <3 BMs per week, watery stool, and bloating were all significant predictors of UPDRS-III scores. However, to ensure the robustness of this model, I assessed the assumptions of a linear regression.
##A1- Diagnostic Plots to check linear regression assumptions In particular, given that many of the constipation symptoms are related to one another, it is important to investigate whether the predictors are correlated. If so, another regression model must be considered.
#PLOT 1
plot1 = ggplot(lm1, aes(lm1$fitted.values, lm1$residuals))+
geom_point() +
geom_smooth() +
labs(x = "Fitted Values", y ="Residuals", title= "Residuals vs Fitted")
#PLOT2
lm1$stdresid = lm1$residual/sd(lm1$residual)
plot2 = ggplot(lm1, aes(qqnorm(lm1$stdresid)[[1]], lm1$stdresid)) +
geom_point(na.r = TRUE) +
geom_smooth(method='lm', size = .5) +
labs( x="Theoretical Quartiles", y = "Standardized Residuals", title= "Normal Q-Q")
## Warning: Ignoring unknown parameters: na.r
#PLOT 3
plot3 = ggplot(lm1, aes(lm1$fitted.values, (sqrt(abs(lm1$stdresid))))) +
geom_point() +
geom_smooth() +
labs( x="Fitted Values", y = "Sqrt Residuals", title= "Scale-Location")
#PLOT 4
hats = hatvalues(lm1)
lm1$hats = hats
plot4 = ggplot(lm1, aes(lm1$hats, lm1$stdresid)) +geom_point() +geom_smooth() + labs(x="Leverage", y = "Standarized Residuals", title= "Residuals vs. Leverage")
grid.arrange(plot1, plot2, plot3, plot4, nrow= 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#1) Variance (Standardized Residuals vs. Fitted Values) Variance is nonconstant and homoscedastic
ggplot(lm1, aes(lm1$fitted.values, lm1$residuals))+
geom_point() +
geom_smooth()+
geom_hline(yintercept=0, linetype='dotted')+
ggtitle("Residuals vs Fitted") + xlab("Fitted Values") + ylab("Standardized Residuals")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#2) Linearity 5 predictors are significantly correlated with the response variable, and 1 is almost statistically significant.
plot5= ggplot(Constipation, aes(UPDRS, Straining))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
plot6= ggplot(Constipation, aes(UPDRS, Hard))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
plot7= ggplot(Constipation, aes(UPDRS,BMs))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
plot8= ggplot(Constipation, aes(UPDRS, Watery))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
plot9= ggplot(Constipation, aes(UPDRS,Bloat))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
plot10= ggplot(Constipation, aes(UPDRS,Hard))+
stat_smooth(method="lm", color="red", fill="red", alpha=.25) +
geom_point()
grid.arrange(plot5, plot6, plot7, plot8, plot9, plot10, nrow= 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#All predictors appear to be linear except for straining
#3) Multicolinearity VIOLATED- there is statistcal correlations between virtually every predictor, meaning there may be inconsistencies when interpreting a simple linear model.
#Plots
ggpairs(Constipation[,2:11]) +
labs(title="Scatterplot Matrix")
# Covariance matrix
vcov(lm1)
## (Intercept) Straining Hard Emptying Blockage
## (Intercept) 8.12698644 -0.032460454 -0.594166965 0.062218427 0.0536147403
## Straining -0.03246045 0.058119447 0.005270456 -0.044702868 -0.0054665733
## Hard -0.59416697 0.005270456 0.053971982 -0.009280206 -0.0025611012
## Emptying 0.06221843 -0.044702868 -0.009280206 0.070068334 -0.0180479612
## Blockage 0.05361474 -0.005466573 -0.002561101 -0.018047961 0.0381056330
## BMs -0.41295299 0.009959156 0.036164383 -0.016006262 -0.0075790305
## Pain -0.14856940 -0.003584264 0.006127098 0.001555531 -0.0097675257
## Watery 0.01094975 -0.003990761 0.003565260 0.004424057 -0.0021279900
## Incontinence -0.06863953 -0.008461992 -0.005514685 0.015831479 -0.0009897367
## Bloat -0.28256192 -0.005425023 0.012490332 -0.005218413 0.0048712819
## BMs Pain Watery Incontinence
## (Intercept) -0.412952988 -0.1485694030 0.0109497514 -0.0686395346
## Straining 0.009959156 -0.0035842644 -0.0039907611 -0.0084619916
## Hard 0.036164383 0.0061270985 0.0035652597 -0.0055146854
## Emptying -0.016006262 0.0015555308 0.0044240573 0.0158314786
## Blockage -0.007579031 -0.0097675257 -0.0021279900 -0.0009897367
## BMs 0.090202151 0.0009523070 -0.0140314486 -0.0133622127
## Pain 0.000952307 0.0337719958 0.0007817636 -0.0026478475
## Watery -0.014031449 0.0007817636 0.0283654283 -0.0084031555
## Incontinence -0.013362213 -0.0026478475 -0.0084031555 0.0688581966
## Bloat -0.030169313 -0.0062226334 -0.0102818313 0.0068269620
## Bloat
## (Intercept) -0.282561916
## Straining -0.005425023
## Hard 0.012490332
## Emptying -0.005218413
## Blockage 0.004871282
## BMs -0.030169313
## Pain -0.006222633
## Watery -0.010281831
## Incontinence 0.006826962
## Bloat 0.084709487
#All predictors are correlated to one another except for incontinence.
# VIF
vif(lm1)
## Straining Hard Emptying Blockage BMs Pain
## 4.343827 3.271201 5.107376 2.689287 5.041539 1.606349
## Watery Incontinence Bloat
## 2.410666 1.193938 3.195688
#All VIFS are over 2 except for incontinence and pain symptoms.
#4) Independence of the variables via autocorrelation For the most part, the predictors are independent of one another.
acf(Constipation[,4])
#5. Normal distribution of the residuals Check using a Shaprio-Wilks Test to determine this; if p<.05, than they are NOT normally distributed
shapiro.test(lm1$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm1$residuals
## W = 0.96261, p-value = 0.03293
#Can also check doing a histogram
hist(lm1$residuals)
#Visually looks like normally distributed and Shapiro Wilks is <.05, so it is normally distributed.
#6) high leverage points and 7) outliers using aesthetic size in your leverage plot to show the Cook’s distance. Plot Cook’s distance for all observations on a bar plot. VIOLATED- There are a few points that may be outliers or cause the model to be leverages in a particular direction.
cook= cooks.distance(lm1)
cook_check= (lm1[x<2])
leverage = as.data.frame(hatvalues(lm1))
std_residual = as.data.frame(rstandard(lm1))
cook
## 1 2 3 4 5 6
## 1.669510e-02 7.645708e-02 1.602869e-03 4.921657e-02 2.271319e-02 4.464479e-04
## 7 8 9 10 11 12
## 7.416572e-04 5.156768e-04 3.071428e-03 1.603170e-02 2.093222e-02 8.804986e-02
## 13 14 15 16 17 18
## 6.236494e-03 7.828656e-03 1.839859e-02 2.580509e-02 3.046004e-02 1.568015e-01
## 19 20 21 22 23 24
## 1.275450e-02 5.269449e-03 8.568981e-04 1.802147e-03 1.176527e-03 1.510463e-02
## 25 26 27 28 29 30
## 4.185421e-02 7.322044e-02 3.742365e-02 1.027774e-02 7.430090e-04 3.511845e-04
## 31 32 33 34 35 36
## 5.731817e-03 5.879410e-02 2.998212e-04 2.194883e-02 6.597386e-05 2.694111e-03
## 37 38 39 40 41 42
## 3.876028e-03 6.501589e-05 9.288706e-04 1.502768e-04 1.607998e-03 5.121831e-06
## 43 44 45 46 47 48
## 2.788475e-05 5.078748e-03 1.663975e-03 1.710342e-03 2.123853e-04 6.505432e-04
## 49 50 51 52 53 54
## 5.609538e-03 1.307245e-04 2.058559e-04 1.391161e-02 2.936182e-03 1.468915e-03
## 55 56 57 58 59 60
## 4.840046e-02 8.730503e-05 2.879664e-03 2.532992e-02 2.587734e-04 3.321994e-02
## 61 62 63 64 65 66
## 2.107139e-03 4.796478e-06 3.335906e-03 1.258915e-01 1.756158e-02 1.111156e-02
## 67 68 69 70 71
## 7.866371e-03 6.321926e-04 1.057800e-01 2.204245e-02 3.614691e-04
Since 2 of the regression assumptions are violated, the model needs to be modified. Thus, the next step is to determine whether every single symptoms needs to be included in the model, or whether the model can be reduced.
Particularly given that there are a large number of predictors versus the sample size (9 predictors vs an n=71), it’s important to reduce the dimensions of the model to avoid overfitting. In addition, one assumption NOT met in the linear model was multicolinearity.
###B. CROSS VALIDATION TO PPROXIMATE THE MOST APPROPRIATE NUMBER OF PREDICTORS IN THE MODEL: Find the best subset using regubsets to estimate via the BIC or ADJR2:
regfit = regsubsets(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, Constipation, nvmax = 9)
summary(regfit)
## Subset selection object
## Call: regsubsets.formula(UPDRS ~ Straining + Hard + Emptying + Blockage +
## BMs + Pain + Watery + Incontinence + Bloat, Constipation,
## nvmax = 9)
## 9 Variables (and intercept)
## Forced in Forced out
## Straining FALSE FALSE
## Hard FALSE FALSE
## Emptying FALSE FALSE
## Blockage FALSE FALSE
## BMs FALSE FALSE
## Pain FALSE FALSE
## Watery FALSE FALSE
## Incontinence FALSE FALSE
## Bloat FALSE FALSE
## 1 subsets of each size up to 9
## Selection Algorithm: exhaustive
## Straining Hard Emptying Blockage BMs Pain Watery Incontinence Bloat
## 1 ( 1 ) " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " "*" " " " " "*" " " " " " " " "
## 3 ( 1 ) " " "*" " " " " "*" " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " "*" " " "*" " " "*"
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" "*" " " "*" " " "*" " " "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
regfit.full = summary(regfit)
Visualize the R2 based on number of predictors
plot(regfit.full$cp, xlab =" Number of Variables ", ylab="Cp", type="l")
points(4, regfit.full$cp [4], col ="red",cex =2, pch =20)
plot(regfit.full$bic ,xlab=" Number of Variables ",ylab=" BIC", type="l")
points(4, regfit.full$bic [4], col =" red",cex =2, pch =20)
which.min (regfit.full$cp) #4= Hard stool, Watery stool, Bloating, and <3 BMs per week
## [1] 6
which.min (regfit.full$bic) #4=
## [1] 4
which.min(regfit.full$adjr2) #1= <3 BMs per week
## [1] 1
plot(regfit.full$adjr2 ,xlab=" Number of Variables ",ylab=" Adjusted R2", type="l")
points (7, regfit.full$adjr2[7], col ="red",cex =2, pch =20)
#ggplot
BIC.sub= regfit.full$bic
id.min.bic = which.min(BIC.sub)
min.bic = min(BIC.sub)
Perform the best subset selection on the full dataset:
regfit.best=regsubsets(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, Constipation ,nvmax =9)
coef(lm1, 9)
## (Intercept) Straining Hard Emptying Blockage BMs
## 34.76574555 -0.48042887 -0.94895564 0.65953576 -0.22294361 1.68051463
## Pain Watery Incontinence Bloat
## 0.03959046 0.46109469 0.14689670 0.71311531
coef(regfit.best,4)
## (Intercept) Hard BMs Watery Bloat
## 34.4080565 -0.8792355 1.7860002 0.4130160 0.7317247
#The intercepts are different than the test set coefficients, thus cross-validation is necessary.
It appears that using 4 predictors, hard stool, <3BMs per week, watery stool, and bloating would reduce the liklihood that colinearity would be violated. We can refit the model to fit this ‘best subset’, but…
This is just an approximation, Thus, to directly assess which predictors need to be included, I can directly reduce the dimensions of the model using 3 different methods to verify which is the best model:
Thus, I explored 3 different shrinkage methods to determine the most appropriate regression model.
###C. RIDGE REGRESSION #C1. Next, I performed a ridge regression on the training set and calculated the CV MSE and coefficient estimates via the best lambda. Ridge regression adds bias to reduce errors, which will reduce the variance. It uses a shrink penalty, called lambda, to bring the slope and intercept closer to 0 to control the impact of these two terms on the regression coefficient estimates.
#Created a model:
grid = 10^seq(4,-2,length=100)
ridge.mod= glmnet (train.matrix, train$UPDRS, alpha =0, lambda = grid)
plot(ridge.mod, xvar='lambda', label= T)
#9 variables in the model. At each stage of lambda, it shows all the models included. All the features are included, but begin to shrink as the lambda increases.
#Values 5 and 2 are more significant to the model than, say, 8 or 6, because the don't shrink as quickly.
#Mean Square Error calculation
ridge.mse= mean((train.matrix - train$UPDRS)^2)
ridge.mse
## [1] 1845.246
#C2- To find optimal lambda, I used the training set to create a modified ridge model, then used the predict function and CV to determine the optimal lambda for the full model. Lambda determines how severe the penalty placed upon the slope will be. When lambda is 0, the penalty has no effect, and the ridge model will produce the least squares estimates. As lambda increases, the coefficients will approach 0 and the model will shrink. This shrinkage method is only applied to the slopes, not the intercept.
#CV on the training set to determine the best lambda and CV MSE.
set.seed(1)
cv.ridge = cv.glmnet(train.matrix,train$UPDRS,alpha=0,lambda=grid)
cv.ridge
##
## Call: cv.glmnet(x = train.matrix, y = train$UPDRS, lambda = grid, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.010 100 9.951 2.212 9
## 1se 3.511 58 11.857 3.878 9
#plot CV error using ggplot:
df = data.frame(lambda = cv.ridge$lambda, log_lambda = log(cv.ridge$lambda),
cvm=cv.ridge$cvm, cvup = cv.ridge$cvup, cvlo = cv.ridge$cvlo)
ggplot(df, aes(x=lambda, y=cvm,
ymin=cvlo, ymax=cvup))+
geom_ribbon(alpha=0.25)+
geom_line(color = "black")+
geom_point(color = "red")+
theme_bw()+
labs(x="Lambda", y = "MSE")+
geom_vline(aes(xintercept = lambda[which.min(cvm)]), linetype="dotted")+
geom_vline(aes(xintercept = cv.ridge$lambda.1se), linetype = "dotted")
#Next, I choose the optimal lambda (tuning parameter) via cross-validation to make the fit smaller by making the RSS small plus adding the shrinking penalty.
bestlam.ridge = cv.ridge$lambda.1se
bestlam.ridge
## [1] 3.511192
#The lambda, 3.511, will be the tuning parameter added to the model to shrink it and reduce bias. In doing so, it will also reduce variance, so the model may not be as significant as the basic linear model.
#Plot it
plot(cv.ridge)
#C3. Repeat test error using Ridge with new best lambda Repeat the model using the new lambda and use it to determine MSE:
ridge.pred = predict(ridge.mod,s=bestlam.ridge, newx = train.matrix)
#Mean Square Error calculation
ridge.mse.train= mean((train$UPDRS-ridge.pred)^2)
ridge.mse
## [1] 1845.246
ridge.mse.train
## [1] 8.97631
#C4- L1 Norm Plot: Each curve corresponds to the ridge regression coefficient estimate for one of the 9 variables, plotted as a function of λ. L1 norm is normalized values for the ridge model against the estimated values for each predictors based on the lambda. Left of the plot shows what happens when lambda is large, all variables are shrunk completely. If lambda is small (right of the plot), we can include all the variables, or it goes towards the linear model.
We want to see something in the middle, where you can fit the model to show variance but also shrink the model via ridge’s lambda to reduce errors.
autoplot(ridge.mod)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#C5- Re-fit the model on the test data using best lambda to verify it a Ridge regression is appropriate:
ridge.test= glmnet(y = test$UPDRS,
x = test.matrix,
alpha = 0,
lambda = 10^seq(2,-2, length = 100))
ridge.pred = predict(ridge.test, s = ridge.mod$lambda.min, newx = test.matrix)
ridge.mse
## [1] 1845.246
ridge.mse.train
## [1] 8.97631
(ridge.mse.test = mean((ridge.pred - test$UPDRS)^2))
## [1] 9.581452
#COMMENTS: By using the ridge regression model, I was able to reduce the MSE significantly. However, I cannot determine how many predictors need to be included in the model. In order to do that, I performed a lasso regression.
###D. NEXT, I PERFORMED A LASSO REGRESSION Lasso shrinks the coefficient estimates towards zero and it has the effect of setting variables exactly equal to zero when lambda is large enough while ridge does not.
Much like the best subset selection method, lasso performs variable selection, but more directly.
#D1- Create the lasso model
#Create a Model
set.seed(1)
lasso.mod =glmnet(y = train$UPDRS, x = train.matrix, alpha = 1, lambda = grid)
summary(lasso.mod)
## Length Class Mode
## a0 100 -none- numeric
## beta 900 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
plot(lasso.mod, xvar='lambda', label= T)
#9 features in the model. At each stage of lambda, it shows features being omitted, which in those case appears that 4 features is the most fitted model.
#Lasso regularization does both shrinkage and variable selection.
#Mean Square Error calculation
lasso.mse= mean((train.matrix - train$UPDRS)^2)
lasso.mse
## [1] 1845.246
#D2- Find the optimal lambda (via CV on the training set) to determine the best lambda and CV MSE. The tuning parameter lambda is chosen by cross validation.
When lambda is small, the result is essentially the least squares estimates. As lambda increases, shrinkage occurs so that variables that are at zero can be thrown away.
This is a major advantage of lasso-it is a combination of both shrinkage and selection of variables. In cases with very large number of features, the lasso efficiently finds the sparse model that involves a small subset of the features.
#CV on the training set to determine the best lambda and CV MSE.
set.seed(1)
cv.lasso = cv.glmnet(train.matrix,train$UPDRS,alpha=1)
cv.lasso
##
## Call: cv.glmnet(x = train.matrix, y = train$UPDRS, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.0123 72 9.953 2.238 9
## 1se 1.4112 21 12.045 3.124 4
#plot CV error using ggplot:
df = data.frame(lambda = cv.lasso$lambda, log_lambda = log(cv.lasso$lambda),
cvm=cv.lasso$cvm, cvup = cv.lasso$cvup, cvlo = cv.lasso$cvlo)
ggplot(df, aes(x=lambda, y=cvm,
ymin=cvlo, ymax=cvup))+
geom_ribbon(alpha=0.25)+
geom_line(color = "black")+
geom_point(color = "red")+
theme_bw()+
labs(x="Lambda", y = "MSE")+
geom_vline(aes(xintercept = lambda[which.min(cvm)]), linetype="dotted")+
geom_vline(aes(xintercept = cv.lasso$lambda.1se), linetype = "dotted")
#Next, I choose the optimal lambda (tuning parameter) via cross-validation to make the fit smaller by making the RSS small plus adding the shrinking penalty.
bestlam.lasso = cv.lasso$lambda.1se
bestlam.lasso
## [1] 1.411235
lasso.coef= predict(cv.lasso, type = "coefficients", s=bestlam.lasso)
lasso.coef
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 35.9375714
## Straining .
## Hard -0.6755357
## Emptying .
## Blockage .
## BMs 1.7849046
## Pain .
## Watery 0.3399384
## Incontinence .
## Bloat 0.3581297
#The lambda, 1.32, will be the tuning parameter added to the model to shrink it and reduce bias. In doing so, we can see that the optimal model that reduces errors and addresses bias includes 4 features.
#Plot it
plot(cv.lasso)
#D3. Repeat test error using lasso with new best lambda Repeat the model using the new lambda and use it to determine MSE:
lasso.pred = predict(lasso.mod,s=bestlam.lasso, newx = train.matrix)
#Mean Square Error calculation
lasso.mse.train= mean((train$UPDRS-lasso.pred)^2)
lasso.mse.train
## [1] 10.37384
lasso.mse
## [1] 1845.246
#D4- Plot coefficients plots against coefficients relative L1-norms Each curve corresponds to the lasso regression coefficient estimate for one of the 9 variables, plotted as a function of λ. L1 norm is normalized values for the lasso model against the estimated values for each predictors based on the lambda. Left of the plot shows what happens when lambda is large, all variables are shrunk completely. If lambda is small (right of the plot), we can include all the variables, or it goes towards the linear model.
We want to see something in the middle, where you can fit the model to show variance but also shrink the model via lasso lambda’s errors. Based on the shrinkage, it appears 4 variables is the optimal ‘middle ground’.
autoplot(lasso.mod)
#D5- Re-fit the model on the test data using best lambda to verify it a Ridge regression is appropriate:
lasso.best= glmnet(y = train$UPDRS,
x = train.matrix,
alpha = 1,
lambda = 10^seq(2,-2, length = 100))
lasso.pred = predict(lasso.best, s = lasso.mod$lambda.min, newx = test.matrix)
lasso.mse
## [1] 1845.246
lasso.mse.train
## [1] 10.37384
(lasso.mse.test = mean((lasso.pred - test$UPDRS)^2))
## [1] 30.21493
##E. PERFORM A PCR ON THE DATASET #E1- fit the model on a training set A PCR will allow us to see the relative relationships and explain the response AND see the predictors. Considering the variables that are eliminated in the best subset, if those same variables aren’t showing up as strong, then I can come back to the best subset or lasso and I can say it was sufficient.
pcr.mod= pcr(UPDRS ~ Straining + Hard + Emptying + Blockage + BMs + Pain + Watery + Incontinence + Bloat, data=Constipation, subset= train.matrix, scale=TRUE, validation ="CV")
summary(pcr.mod)
## Data: X dimension: 539 9
## Y dimension: 539 1
## Fit method: svdpc
## Number of components considered: 9
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 7.713 7.598 2.107 2.108 2.096 2.094 2.088
## adjCV 7.713 7.596 2.105 2.107 2.095 2.092 2.086
## 7 comps 8 comps 9 comps
## CV 1.698 1.569 8.527e-14
## adjCV 1.696 1.567 1.796e-13
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 43.196 72.94 85.70 94.11 97.39 98.71 99.51 99.97
## UPDRS 4.106 92.58 92.63 92.75 92.80 92.88 95.26 95.97
## 9 comps
## X 100
## UPDRS 100
#MSE of the PCR
pcr.mse = MSEP(pcr.mod, estimate = "CV")$val %>%
reshape2::melt() %>%
mutate(M = 0:(nrow(.)-1)) %>%
select(M, value) %>%
rename(CV_MSE = value)
pcr.mse
## M CV_MSE
## 1 0 5.949699e+01
## 2 1 5.772755e+01
## 3 2 4.439929e+00
## 4 3 4.444106e+00
## 5 4 4.394731e+00
## 6 5 4.383536e+00
## 7 6 4.358605e+00
## 8 7 2.884071e+00
## 9 8 2.460740e+00
## 10 9 7.270610e-27
#Plots: Based on inital calculations, it looks like all 9 features should be included in the model.
pcr.mse %>%
mutate(min_CV_MSE = as.numeric(min(CV_MSE) == CV_MSE)) %>%
ggplot(aes(x = M, y = CV_MSE)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_MSE))) +
scale_y_continuous(labels = scales::comma_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(x = "M",
y = "Cross-Validation MSE",
col = "Non-Zero Coefficients:",
title = "PCR - M Selection (Using 10-Fold Cross-Validation)")
selectNcomp(pcr.mod, method = "onesigma", plot = TRUE)
## Warning in arrows(xvals, uls, xvals, rmseps + residsds, code = 3, col =
## "gray", : zero-length arrow is of indeterminate angle and so skipped
## [1] 9
#E3- Find the lowest CV error:
#Lowest cross-validation error occurs when M = 7 component are used. We compute the test MSE as follows.
pcr.pred.train = predict(pcr.mod, x[train.matrix,], ncomp = 7)
#Mean Square Error calculation
pcr.mse.test= mean((test$UPDRS-pcr.pred.train)^2)
pcr.mse
## M CV_MSE
## 1 0 5.949699e+01
## 2 1 5.772755e+01
## 3 2 4.439929e+00
## 4 3 4.444106e+00
## 5 4 4.394731e+00
## 6 5 4.383536e+00
## 7 6 4.358605e+00
## 8 7 2.884071e+00
## 9 8 2.460740e+00
## 10 9 7.270610e-27
##Test set MSE is less than the results obtained using ridge regression, which gave us 7, and the lasso, which gave us 9.
#However, as a result of the way PCR is implemented, the final model is more difficult to interpret because it does not perform any kind of variable selection or directly produce coefficient estimates.
#PCR on the full dataset with the new components:
pcr.fit=pcr(y ~ x ,scale=TRUE, ncomp =9)
summary(pcr.fit)
## Data: X dimension: 71 9
## Y dimension: 71 1
## Fit method: svdpc
## Number of components considered: 9
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 54.55 70.27 79.89 86.79 90.66 94.14 96.96 98.77
## y 74.58 85.21 89.21 89.53 89.86 90.10 90.10 91.67
## 9 comps
## X 100.00
## y 91.82
#Compare the lm, lasso, ridge, and PCR models:
SS_tot = sum((test$UPDRS - mean(test$UPDRS))^2)
bar = data.frame(method = c("LM", "Ridge", "Lasso", "PCR"), test_MSE = c(lm.mse, ridge.mse.test, lasso.mse.test, pcr.mse.test))
bar
## method test_MSE
## 1 LM 155.932088
## 2 Ridge 9.581452
## 3 Lasso 30.214935
## 4 PCR 131.379579
ggplot(bar, aes(x = method, y = test_MSE, fill = method))+
geom_bar(position="dodge", stat = "identity")
#Conclusion: Based on the approximation of the best subset, and the comparison of each of these direct models, it appears that the best model would be the lasso regression and to include 4 predictors from the original 9 constipation symptoms. This will reduce the amount of error seen in the linear model and will avoid overfitting given the predictor:to:sample-size proportion. However, the lasso had a small MSE but requires ~8 predictors. Thus, it may be adventageous to increase the sample size to over 100 (~10 participants per 9 predictors).