In this lesson students will learn how to implement…
Use the following data for these examples:
library(tidyverse)
#install.packages("faraway")
library(faraway)
set.seed(1212)
data("fat")
help(fat)
Read about the fat
dataset
help(fat)
Our goal is to predict percent of body fat from Brozek’s equation.
#install.packages("GGally")
library(GGally)
ggpairs(fat, columns=c("brozek", "age", "weight", "height",
"adipos", "neck", "chest", "abdom",
"hip", "thigh", "knee", "ankle", "biceps",
"forearm", "wrist"))
In stepwise selection you take steps to add variables into the model or take them out of the model one at a time.
#install.packages("leaps")
library(leaps)
### forward
model.fwrd <- regsubsets(brozek ~ age + weight +
height + adipos +
neck + chest +
abdom + hip + thigh +
knee + ankle +
biceps + forearm +
wrist,
data = fat,
nvmax = 14,
method = "forward")
summary(model.fwrd)
## Subset selection object
## Call: regsubsets.formula(brozek ~ age + weight + height + adipos +
## neck + chest + abdom + hip + thigh + knee + ankle + biceps +
## forearm + wrist, data = fat, nvmax = 14, method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## weight FALSE FALSE
## height FALSE FALSE
## adipos FALSE FALSE
## neck FALSE FALSE
## chest FALSE FALSE
## abdom FALSE FALSE
## hip FALSE FALSE
## thigh FALSE FALSE
## knee FALSE FALSE
## ankle FALSE FALSE
## biceps FALSE FALSE
## forearm FALSE FALSE
## wrist FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: forward
## age weight height adipos neck chest abdom hip thigh knee ankle biceps
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " "*" " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*" " " " " " "
## 8 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " "*" "*"
## 11 ( 1 ) "*" "*" "*" " " "*" " " "*" "*" "*" " " "*" "*"
## 12 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## forearm wrist
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## 9 ( 1 ) "*" "*"
## 10 ( 1 ) "*" "*"
## 11 ( 1 ) "*" "*"
## 12 ( 1 ) "*" "*"
## 13 ( 1 ) "*" "*"
## 14 ( 1 ) "*" "*"
which.min(summary(model.fwrd)$cp)
## [1] 8
coef(model.fwrd, id=8)
## (Intercept) age weight neck abdom hip
## -20.06213373 0.05921577 -0.08413521 -0.43189267 0.87720667 -0.18641032
## thigh forearm wrist
## 0.28644340 0.48254563 -1.40486912
model.bwrd <- regsubsets(brozek ~ age + weight +
height + adipos +
neck + chest +
abdom + hip + thigh +
knee + ankle +
biceps + forearm +
wrist,
data = fat,
nvmax = 14,
method = "backward")
summary(model.bwrd)
## Subset selection object
## Call: regsubsets.formula(brozek ~ age + weight + height + adipos +
## neck + chest + abdom + hip + thigh + knee + ankle + biceps +
## forearm + wrist, data = fat, nvmax = 14, method = "backward")
## 14 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## weight FALSE FALSE
## height FALSE FALSE
## adipos FALSE FALSE
## neck FALSE FALSE
## chest FALSE FALSE
## abdom FALSE FALSE
## hip FALSE FALSE
## thigh FALSE FALSE
## knee FALSE FALSE
## ankle FALSE FALSE
## biceps FALSE FALSE
## forearm FALSE FALSE
## wrist FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: backward
## age weight height adipos neck chest abdom hip thigh knee ankle biceps
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " "*" " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " " " "*" " " "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*" " " " " " "
## 8 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " "*" "*"
## 11 ( 1 ) "*" "*" "*" " " "*" " " "*" "*" "*" " " "*" "*"
## 12 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## forearm wrist
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## 9 ( 1 ) "*" "*"
## 10 ( 1 ) "*" "*"
## 11 ( 1 ) "*" "*"
## 12 ( 1 ) "*" "*"
## 13 ( 1 ) "*" "*"
## 14 ( 1 ) "*" "*"
which.min(summary(model.bwrd)$cp)
## [1] 8
coef(model.bwrd, id=8)
## (Intercept) age weight neck abdom hip
## -20.06213373 0.05921577 -0.08413521 -0.43189267 0.87720667 -0.18641032
## thigh forearm wrist
## 0.28644340 0.48254563 -1.40486912
There are \(2^p\) possible models to explore. Best Subset Selection finds the best set of variables for a model of a given size.
### BEST SUBSET
bestsub.model <- regsubsets(brozek ~ age +
weight + height +
adipos +
neck + chest +
abdom + hip +
thigh + knee +
ankle +biceps +
forearm + wrist,
data = fat, nvmax = 14)
# nvmax: max size of subsets
summary(bestsub.model)
## Subset selection object
## Call: regsubsets.formula(brozek ~ age + weight + height + adipos +
## neck + chest + abdom + hip + thigh + knee + ankle + biceps +
## forearm + wrist, data = fat, nvmax = 14)
## 14 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## weight FALSE FALSE
## height FALSE FALSE
## adipos FALSE FALSE
## neck FALSE FALSE
## chest FALSE FALSE
## abdom FALSE FALSE
## hip FALSE FALSE
## thigh FALSE FALSE
## knee FALSE FALSE
## ankle FALSE FALSE
## biceps FALSE FALSE
## forearm FALSE FALSE
## wrist FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: exhaustive
## age weight height adipos neck chest abdom hip thigh knee ankle biceps
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " " " "*" " " "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*" " " " " " "
## 8 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*" " " "*" "*"
## 11 ( 1 ) "*" "*" "*" " " "*" " " "*" "*" "*" " " "*" "*"
## 12 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## forearm wrist
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## 9 ( 1 ) "*" "*"
## 10 ( 1 ) "*" "*"
## 11 ( 1 ) "*" "*"
## 12 ( 1 ) "*" "*"
## 13 ( 1 ) "*" "*"
## 14 ( 1 ) "*" "*"
#Performance measures
best14<-data.frame(nvars=1:14,
Cp = summary(bestsub.model)$cp,
r2 = summary(bestsub.model)$rsq,
Adj_r2 = summary(bestsub.model)$adjr2,
BIC =summary(bestsub.model)$bic)%>%
gather(metric, value, -c(nvars))
ggplot(best14, aes(x=nvars, y=value, color=metric))+
geom_line()+
facet_grid(metric~., scales = "free")
which.max(summary(bestsub.model)$adjr2)
## [1] 8
which.min(summary(bestsub.model)$bic)
## [1] 4
which.min(summary(bestsub.model)$cp)
## [1] 8
which.max(summary(bestsub.model)$rsq)
## [1] 14
##jack-knife validation (leave-one-out)
##Function to get predictions from the regsubset function
predict.regsubsets <- function(object, newdata, id,...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id=id)
mat[, names(coefi)]%*%coefi
}
#store the prediction error n=252
jk.errors <- matrix(NA, 252, 14)
for (k in 1:252){
#uses regsubsets in the data with 1 observation removed
best.model.cv <- regsubsets(brozek ~ age +
weight + height +
adipos + neck + chest +
abdom + hip + thigh +
knee + ankle + biceps +
forearm + wrist,
data = fat[-k,], #removes the kth obsv
nvmax = 14)
#Models with 1, 2 ,...14 predictors
for (i in 1:14){
#that was left out
pred <- predict.regsubsets(best.model.cv, #prediction in the obsv
fat[k,],
id=i)
jk.errors[k,i] <- (fat$brozek[k]-pred)^2 #error in the obsv
}
}
mse.models <- apply(jk.errors, 2, mean) #MSE estimation
plot(mse.models , #Plot with MSEs
pch=19, type="b",
xlab="nr predictors",
ylab="MSE")
The best final model has 7 variables.
coef(bestsub.model, id=7)
## (Intercept) age weight neck abdom thigh
## -30.17420140 0.06149174 -0.11235658 -0.37203479 0.85152008 0.20972568
## forearm wrist
## 0.51823595 -1.40081475
Collinearity/Multicollinaearity: The situation in which two or more predictor variables are related to each other.
Why is this a problem?
A way to quantify the impact of multicollinearity is to look at the VIF (variance inflation factor). A VIF close to 1 would imply no correlation. The larger the VIF the larger the amount of multicollinearity.
\[VIF(\hat{\beta}_j)=\frac{1}{1-R^2_{X_j|X_{-j}}}\]
The general rule of thumb is that VIFs exceeding 4 warrant further investigation, while VIFs exceeding 10 are signs of serious multicollinearity requiring correction.
library(car)
vif(lm(brozek ~ age + weight + #OLS
height + adipos +
neck + chest +
abdom + hip + thigh +
knee + ankle +
biceps + forearm +
wrist, data=fat))
## age weight height adipos neck chest abdom hip
## 2.250902 33.786851 2.256593 16.163444 4.430734 10.684562 13.346689 15.158277
## thigh knee ankle biceps forearm wrist
## 7.961508 4.828828 1.945514 3.674508 2.193390 3.379612
Regularization methods modify the loss function (see below). One of the benefits of using regularization methods is that they protect against multicollinearity.
Ridge regression minimizes the residual sum of squares with an \(L2\) penalty.
\[\sum_{i=1}^n(y_i-\beta_0-\sum_{j=1}^p\beta_jx_{ij})^2+\lambda\sum_{j=1}^p\beta_j^2\]
Big Ideas:
### ridge
#install.packages(c("glmnet", "faraway"))
library(glmnet) #function for ridge regression
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-6
library(faraway) #has the dataset fat
set.seed(1233)
#RIDGE REGRESSION
#we need to define the model equation
X <- model.matrix(brozek ~ age + weight +
height + adipos +
neck + chest +
abdom + hip + thigh +
knee + ankle +
biceps + forearm +
wrist, data=fat)[,-1]
#and the outcome
Y <- fat[,"brozek"]
#Penalty type (alpha=0 is ridge)
cv.lambda <- cv.glmnet(x=X, y=Y,
alpha = 0,
lambda=exp(seq(-5,8,.1)))
plot(cv.lambda) #MSE for several lambdas
cv.lambda$lambda.min
## [1] 0.0407622
#ridge path
plot(cv.lambda$glmnet.fit,
"lambda", label=FALSE)
When lambda is high, the coefficients shrink toward zero.
lmin <- cv.lambda$lambda.min
ridge.model <- glmnet(x=X, y=Y,
alpha = 0,
lambda = lmin)
ridge.model$beta
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## age 0.06507401
## weight -0.06789165
## height -0.06028540
## adipos 0.10207562
## neck -0.45063815
## chest -0.01601489
## abdom 0.81945358
## hip -0.18402114
## thigh 0.22019252
## knee -0.01128917
## ankle 0.12891779
## biceps 0.13023719
## forearm 0.41741050
## wrist -1.51553984
ols.regression <- lm(brozek ~ age + weight + #OLS
height + adipos +
neck + chest +
abdom + hip + thigh +
knee + ankle +
biceps + forearm +
wrist, data=fat)
#OLS vs RIDGE
round(cbind(OLS = coef(ols.regression),
ridge = c(ridge.model$a0, #intercept for
as.vector(ridge.model$beta))),4)
## OLS ridge
## (Intercept) -15.1901 -13.0910
## age 0.0569 0.0651
## weight -0.0813 -0.0679
## height -0.0531 -0.0603
## adipos 0.0610 0.1021
## neck -0.4450 -0.4506
## chest -0.0309 -0.0160
## abdom 0.8790 0.8195
## hip -0.2031 -0.1840
## thigh 0.2274 0.2202
## knee -0.0010 -0.0113
## ankle 0.1572 0.1289
## biceps 0.1485 0.1302
## forearm 0.4297 0.4174
## wrist -1.4793 -1.5155
Ridge regression minimizes the residual sum of squares with an \(L1\) penalty.
\[\sum_{i=1}^n(y_i-\beta_0-\sum_{j=1}^p\beta_jx_{ij})^2+\lambda\sum_{j=1}^p|\beta_j|\]
Big Ideas:
#Penalty type (alpha=1 is lasso
#and alpha=0 is the ridge)
cv.lambda.lasso <- cv.glmnet(x=X, y=Y,
alpha = 1)
plot(cv.lambda.lasso) #MSE for several lambdas
cv.lambda.lasso
##
## Call: cv.glmnet(x = X, y = Y, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.0313 58 17.28 1.527 11
## 1se 0.4239 30 18.71 1.378 4
#Lasso path
plot(cv.lambda.lasso$glmnet.fit,
"lambda", label=FALSE)
l.lasso.min <- cv.lambda.lasso$lambda.min
l.lasso.min
## [1] 0.03132734
lasso.model <- glmnet(x=X, y=Y,
alpha = 1,
lambda = l.lasso.min)
lasso.model$beta
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## age 0.05461237
## weight -0.06606504
## height -0.08055424
## adipos .
## neck -0.40276110
## chest .
## abdom 0.83148580
## hip -0.14672738
## thigh 0.17130686
## knee .
## ankle 0.08596538
## biceps 0.11316099
## forearm 0.38807121
## wrist -1.43075188
In 2015, 20.5 million newborns, an estimated 14.6 per cent of all babies born globally that year, suffered from low birthweight.
Source: UNICEF
https://data.unicef.org/topic/nutrition/low-birthweight/
lowbw<-read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/lowbwt.csv")
str(lowbw)
## 'data.frame': 189 obs. of 11 variables:
## $ id : int 85 86 87 88 89 91 92 93 94 95 ...
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
Variables:
id
= Identification Codelow
= Low Birth Weight Baby (1=Yes under 2500g,
0=No)age
= Mother’s age in yearslwt
= Weight at Last Periodrace
= Race (1=White, 2=Black, 3=Other)smoke
= Smoke during Pregnancy (1=Yes, 0=No)ptl
= History of Premature Labour (# of times)ht
= History of Hypertension (1=Yes, 0=No)ui
= Presence of Uterine Irritability (1=Yes,
0=No)ftv
= Visits to Doctor During 1st trimesterbwt
= Baby’s birth Weight in GramsFit a linear model to predict birth weight (variable
bwt
) using all other predictors except for low
and id
.
Source Examples from:
https://bookdown.org/tpinto_home/Regularisation/lasso-regression.html