Load data
Read in data provided by Kaggle for this competition. They are organized in the data/
folder of this RStudio project:
training <- read_csv("data/train.csv") %>%
  rename(
    FirstFlrSF = `1stFlrSF`,
    SecondFlrSF = `2ndFlrSF`,
    ThirdSsnPorch = `3SsnPorch`
  ) %>%
  # Fit your models to this outcome variable:
  mutate(log_SalePrice = log(SalePrice + 1))
test <- read_csv("data/test.csv") %>%
  rename(
    FirstFlrSF = `1stFlrSF`,
    SecondFlrSF = `2ndFlrSF`,
    ThirdSsnPorch = `3SsnPorch`
  )
sample_submission <- read_csv("data/sample_submission.csv")
# Function that takes in a LASSO fit object and returns a "tidy" data frame of
# the beta-hat coefficients for each lambda value used in LASSO fit.
get_LASSO_coefficients <- function(LASSO_fit){
  beta_hats <- LASSO_fit %>%
    broom::tidy(return_zeros = TRUE) %>%
    select(term, estimate, lambda) %>%
    arrange(desc(lambda))
  return(beta_hats)
}
Look at your data!
Always, ALWAYS, ALWAYS start by looking at your raw data. This gives you a visual sense of what information you have to help build your predictive models. To get a full description of each variable, read the data dictionary in the data_description.txt
file in the data/
folder.
Note that the following code chunk has eval = FALSE set, meaning "don't evaluate this chunk when knitting," because .Rmd files won't knit if they include a View() call:
#View(training)
#glimpse(training)
#View(test)
#glimpse(test)
# Pay close attention to the variables and variable types in sample_submission.
# Your submission must match this exactly.
#glimpse(sample_submission)
# Hint:
#skim(training)
#skim(test)
### Clean up the data
### From MP2
# Combine all data for homogeneous cleaning
test$SalePrice <- NA      # so that the number of columns matches training
test$log_SalePrice <- NA  # so that the number of columns matches training
combined <- rbind(training, test)
# Fix an obvious data-entry error: a garage can't have been built in year 2207
combined$GarageYrBlt[combined$GarageYrBlt == 2207] <- 2007
# Look for fields with lots of NAs
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
## PoolQC SalePrice log_SalePrice MiscFeature Alley
## 1506 1459 1459 1456 1400
## Fence FireplaceQu LotFrontage GarageYrBlt GarageFinish
## 1211 753 233 80 80
## GarageQual GarageCond GarageType BsmtCond BsmtQual
## 80 80 78 46 45
## BsmtExposure BsmtFinType1 BsmtFinType2 MasVnrType MasVnrArea
## 45 43 43 16 15
## MSZoning Utilities BsmtFullBath BsmtHalfBath Functional
## 4 2 2 2 2
## Exterior1st Exterior2nd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 1 1 1 1 1
## TotalBsmtSF KitchenQual GarageCars GarageArea SaleType
## 1 1 1 1 1
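The same per-column missing-value counts can also be computed with tidyverse verbs; a small sketch, assuming dplyr (>= 1.0) and tidyr are loaded:
# Equivalent tidyverse-style count of missing values per column (sketch):
combined %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  tidyr::pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  filter(n_missing > 0) %>%
  arrange(desc(n_missing))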
# For the categorical fields where NA = meaningful, change NA to NO
combined$Alley = factor(combined$Alley, levels=c(levels(combined$Alley), "NO"))
combined$Alley[is.na(combined$Alley)] = "NO"
combined$BsmtCond = factor(combined$BsmtCond, levels=c(levels(combined$BsmtCond), "NO"))
combined$BsmtCond[is.na(combined$BsmtCond)] = "NO"
combined$BsmtExposure[is.na(combined$BsmtExposure)] = "NO"
combined$BsmtFinType1 = factor(combined$BsmtFinType1, levels=c(levels(combined$BsmtFinType1), "NO"))
combined$BsmtFinType1[is.na(combined$BsmtFinType1)] = "NO"
combined$BsmtFinType2 = factor(combined$BsmtFinType2, levels=c(levels(combined$BsmtFinType2), "NO"))
combined$BsmtFinType2[is.na(combined$BsmtFinType2)] = "NO"
combined$BsmtQual = factor(combined$BsmtQual, levels=c(levels(combined$BsmtQual), "NO"))
combined$BsmtQual[is.na(combined$BsmtQual)] = "NO"
combined$Electrical = factor(combined$Electrical, levels=c(levels(combined$Electrical), "NO"))
combined$Electrical[is.na(combined$Electrical)] = "NO" # ASSUMED
combined$FireplaceQu = factor(combined$FireplaceQu, levels=c(levels(combined$FireplaceQu), "NO"))
combined$FireplaceQu[is.na(combined$FireplaceQu)] = "NO"
combined$Fence = factor(combined$Fence, levels=c(levels(combined$Fence), "NO"))
combined$Fence[is.na(combined$Fence)] = "NO"
combined$GarageCond = factor(combined$GarageCond, levels=c(levels(combined$GarageCond), "NO"))
combined$GarageCond[is.na(combined$GarageCond)] = "NO"
combined$GarageFinish = factor(combined$GarageFinish, levels=c(levels(combined$GarageFinish), "NO"))
combined$GarageFinish[is.na(combined$GarageFinish)] = "NO"
combined$GarageQual = factor(combined$GarageQual, levels=c(levels(combined$GarageQual), "NO"))
combined$GarageQual[is.na(combined$GarageQual)] = "NO"
combined$GarageType = factor(combined$GarageType, levels=c(levels(combined$GarageType), "NO"))
combined$GarageType[is.na(combined$GarageType)] = "NO"
combined$MasVnrType = factor(combined$MasVnrType, levels=c(levels(combined$MasVnrType), "NO"))
combined$MasVnrType[is.na(combined$MasVnrType)] = "NO"
combined$MiscFeature = factor(combined$MiscFeature, levels=c(levels(combined$MiscFeature), "NO"))
combined$MiscFeature[is.na(combined$MiscFeature)] = "NO"
combined$PoolQC = factor(combined$PoolQC, levels=c(levels(combined$PoolQC), "NO"))
combined$PoolQC[is.na(combined$PoolQC)] = "NO"
combined$Utilities = factor(combined$Utilities, levels=c(levels(combined$Utilities), "NO"))
combined$Utilities[is.na(combined$Utilities)] = "NO" # ASSUMED
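The block above can also be written much more compactly; a sketch, assuming these columns come out of read_csv() as character vectors (so no factor levels need to be added) and that dplyr and tidyr are available:
# Compact alternative (sketch): recode NA to "NO" for all NA-means-"none" columns at once
na_means_no <- c(
  "Alley", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "BsmtQual",
  "Electrical", "FireplaceQu", "Fence", "GarageCond", "GarageFinish", "GarageQual",
  "GarageType", "MasVnrType", "MiscFeature", "PoolQC", "Utilities"
)
combined <- combined %>%
  mutate(across(all_of(na_means_no), ~ tidyr::replace_na(.x, "NO")))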
# For the categorical fields where NA = missing data, assume most common category
combined$Exterior1st[is.na(combined$Exterior1st)] <- names(sort(-table(combined$Exterior1st)))[1]
combined$Exterior2nd[is.na(combined$Exterior2nd)] <- names(sort(-table(combined$Exterior2nd)))[1]
combined$Functional[is.na(combined$Functional)] <- names(sort(-table(combined$Functional)))[1]
combined$KitchenQual[is.na(combined$KitchenQual)] <- names(sort(-table(combined$KitchenQual)))[1]
combined$MSZoning[is.na(combined$MSZoning)] <- names(sort(-table(combined$MSZoning)))[1]
combined$SaleType[is.na(combined$SaleType)] <- names(sort(-table(combined$SaleType)))[1]
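The names(sort(-table(x)))[1] idiom above pulls out the most frequent category; a small helper makes the intent explicit and avoids repeating it (a sketch; impute_mode is a new name introduced here):
# Helper (sketch): replace NAs in a vector with its most frequent non-NA value
impute_mode <- function(x) {
  x[is.na(x)] <- names(which.max(table(x)))
  x
}
combined <- combined %>%
  mutate(across(c(Exterior1st, Exterior2nd, Functional, KitchenQual, MSZoning, SaleType), impute_mode))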
# For the numerical fields where NA = meaningful, make NA 0
combined$BsmtFinSF1[is.na(combined$BsmtFinSF1)] <- 0
combined$BsmtFinSF2[is.na(combined$BsmtFinSF2)] <- 0
combined$BsmtFullBath[is.na(combined$BsmtFullBath)] <- 0
combined$BsmtHalfBath[is.na(combined$BsmtHalfBath)] <- 0
combined$BsmtUnfSF[is.na(combined$BsmtUnfSF)] <- 0
combined$GarageArea[is.na(combined$GarageArea)] <- 0
combined$GarageCars[is.na(combined$GarageCars)] <- 0
combined$GarageYrBlt[is.na(combined$GarageYrBlt)] <- 0
combined$LotFrontage[is.na(combined$LotFrontage)] <- 0
combined$MasVnrArea[is.na(combined$MasVnrArea)] <- 0
combined$TotalBsmtSF[is.na(combined$TotalBsmtSF)] <- 0
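The same across() pattern condenses the numeric NA-means-zero block (a sketch, assuming tidyr is loaded):
combined <- combined %>%
  mutate(across(
    c(BsmtFinSF1, BsmtFinSF2, BsmtFullBath, BsmtHalfBath, BsmtUnfSF, GarageArea,
      GarageCars, GarageYrBlt, LotFrontage, MasVnrArea, TotalBsmtSF),
    ~ tidyr::replace_na(.x, 0)
  ))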
# Did we get rid of NAs?
na_col <- which(colSums(is.na(combined)) > 0)
sort(colSums(sapply(combined[na_col], is.na)), decreasing = TRUE)
## SalePrice log_SalePrice
## 1459 1459
# Separate the training and test sets again
training <- combined[1:50,]
test <- combined[51:1509,]
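Hard-coded row indices are brittle if the data change; a sketch of a more robust split (n_train is a new helper variable that would have to be recorded before the rbind() above, and this assumes the original training rows come first in combined):
# Sketch: record the training-set size before combining, then split by it
# n_train <- nrow(training)   # run this before rbind(training, test)
training <- combined[seq_len(n_train), ]
test     <- combined[(n_train + 1):nrow(combined), ]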
Reaching for the stars
- Find the \(\lambda^*\) tuning parameter that yields the LASSO model with the lowest estimated RMLSE, and report that lowest RMLSE as well. You may use functions included in a package for this.
- Convince yourself with a visualization that the \(\lambda^*\) you found is indeed the one that returns the lowest estimated RMLSE.
- What is the model \(\widehat{f}_2\) resulting from this \(\lambda^*\)? Output a data frame of the \(\widehat{\beta}\).
- Visualize the progression of \(\widehat{\beta}\) for different \(\lambda\) values and mark \(\lambda^*\) with a vertical line:
# Find lambda star:
# Recall the other "extreme" is a model that is completely regularized, meaning
# you use none of the predictors, so that y_hat is simply the mean SalePrice.
# REMEMBER THIS VALUE AS WELL!!!
mean(training$SalePrice)
## [1] 177928.5
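The code below refers to a model_formula object whose definition is not shown above. A hedged reconstruction (an assumption: SalePrice regressed on the numeric predictors that appear in the coefficient output further down) might look like:
# Assumed reconstruction of model_formula (its original definition is not shown here):
predictor_names <- c(
  "MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt",
  "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
  "FirstFlrSF", "SecondFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
  "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
  "ThirdSsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold"
)
model_formula <- as.formula(paste("SalePrice ~", paste(predictor_names, collapse = " + ")))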
# 3. Based on the above model formula, create a "model matrix" representation of
# the predictor variables. Note:
# -the model_matrix() function conveniently converts all categorical predictors
# to numerical ones using one-hot encoding as seen in MP4
# -we remove the first column corresponding to the intercept because it is
# simply a column of ones.
x_matrix <- training %>%
  modelr::model_matrix(model_formula, data = .) %>%
  select(-`(Intercept)`) %>%
  as.matrix()
# Compare the original data to the model matrix. What is different?
#training
#x_matrix
# 4.a) Fit a LASSO model. Note the inputs:
# -Instead of inputting a model formula, you input the corresponding x_matrix and
#  outcome variable
# -Setting alpha = 1 sets the regularization method to be LASSO. Setting it to 0
#  sets the regularization method to be "ridge regression", another regularization
#  method that we don't have time to cover in this class
# -lambda is the complexity/tuning parameter whose value we specify. Here let's
#  specify 10, an arbitrarily chosen value
LASSO_fit_a <- glmnet(x = x_matrix, y = training$SalePrice, alpha = 1, lambda = 10)
#LASSO_fit_a
# Unfortunately the output isn't that informative. Let's use a wrapper function
# that yields a more informative output:
get_LASSO_coefficients <- function(LASSO_fit){
  beta_hats <- LASSO_fit %>%
    broom::tidy(return_zeros = TRUE) %>%
    select(term, estimate, lambda) %>%
    arrange(desc(lambda))
  return(beta_hats)
}
#get_LASSO_coefficients(LASSO_fit_a)
# For that value of lambda = 10, we have the beta-hat coefficients that minimize
# the equation seen in Lec19 via numerical optimization. Observe how all the
# beta-hats have been shrunk, and how some have been "shrunk" all the way to 0 and
# hence those predictors are dropped from the model. Compare the above output with
# the previously seen "unregularized" regression results:
lm(model_formula, data = training) %>%
tidy(conf.int = TRUE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 41.9064247 | 25.5524020 | 1.6400190 | 0.1183598 | -11.7771799 | 95.5900293 |
| MSSubClass | 0.0011911 | 0.0004686 | 2.5417580 | 0.0204490 | 0.0002066 | 0.0021755 |
| LotFrontage | 0.0000981 | 0.0005957 | 0.1646788 | 0.8710320 | -0.0011534 | 0.0013496 |
| LotArea | 0.0000283 | 0.0000066 | 4.3072501 | 0.0004242 | 0.0000145 | 0.0000421 |
| OverallQual | 0.1186191 | 0.0311636 | 3.8063387 | 0.0012930 | 0.0531469 | 0.1840913 |
| OverallCond | 0.1197853 | 0.0311446 | 3.8461075 | 0.0011833 | 0.0543530 | 0.1852176 |
| YearBuilt | 0.0050119 | 0.0011172 | 4.4861594 | 0.0002856 | 0.0026648 | 0.0073590 |
| YearRemodAdd | -0.0028219 | 0.0014703 | -1.9191817 | 0.0709579 | -0.0059109 | 0.0002672 |
| MasVnrArea | 0.0000129 | 0.0001507 | 0.0856053 | 0.9327252 | -0.0003037 | 0.0003294 |
| BsmtFinSF1 | 0.0003838 | 0.0001212 | 3.1656436 | 0.0053515 | 0.0001291 | 0.0006385 |
| BsmtFinSF2 | 0.0002171 | 0.0001962 | 1.1062669 | 0.2831776 | -0.0001952 | 0.0006293 |
| BsmtUnfSF | 0.0002949 | 0.0001014 | 2.9091171 | 0.0093592 | 0.0000819 | 0.0005079 |
| FirstFlrSF | 0.0001118 | 0.0001508 | 0.7415039 | 0.4679530 | -0.0002050 | 0.0004286 |
| SecondFlrSF | 0.0004008 | 0.0001436 | 2.7917279 | 0.0120483 | 0.0000992 | 0.0007024 |
| BsmtFullBath | 0.0267287 | 0.0842291 | 0.3173330 | 0.7546423 | -0.1502301 | 0.2036874 |
| BsmtHalfBath | -0.1044070 | 0.0994523 | -1.0498193 | 0.3076928 | -0.3133485 | 0.1045346 |
| FullBath | -0.0328149 | 0.0592904 | -0.5534607 | 0.5867576 | -0.1573795 | 0.0917497 |
| HalfBath | -0.1016615 | 0.0658233 | -1.5444609 | 0.1398760 | -0.2399511 | 0.0366281 |
| BedroomAbvGr | -0.0483873 | 0.0415738 | -1.1638883 | 0.2596654 | -0.1357305 | 0.0389560 |
| KitchenAbvGr | -0.2688876 | 0.2106137 | -1.2766859 | 0.2179387 | -0.7113706 | 0.1735954 |
| TotRmsAbvGrd | -0.0304452 | 0.0295860 | -1.0290393 | 0.3170914 | -0.0926031 | 0.0317127 |
| Fireplaces | 0.0222626 | 0.0284464 | 0.7826157 | 0.4440277 | -0.0375011 | 0.0820263 |
| GarageYrBlt | 0.0000471 | 0.0000508 | 0.9282223 | 0.3655730 | -0.0000596 | 0.0001538 |
| GarageCars | 0.1634391 | 0.0807655 | 2.0236251 | 0.0581144 | -0.0062429 | 0.3331212 |
| GarageArea | -0.0005312 | 0.0002490 | -2.1332285 | 0.0469246 | -0.0010543 | -0.0000080 |
| WoodDeckSF | -0.0001279 | 0.0001763 | -0.7253752 | 0.4775474 | -0.0004983 | 0.0002425 |
| OpenPorchSF | 0.0000478 | 0.0004051 | 0.1180911 | 0.9073033 | -0.0008032 | 0.0008989 |
| EnclosedPorch | -0.0001996 | 0.0004181 | -0.4774432 | 0.6387926 | -0.0010780 | 0.0006787 |
| ScreenPorch | -0.0005011 | 0.0003049 | -1.6436871 | 0.1175932 | -0.0011416 | 0.0001394 |
| MiscVal | -0.0000836 | 0.0000897 | -0.9322048 | 0.3635670 | -0.0002721 | 0.0001048 |
| MoSold | -0.0034921 | 0.0066055 | -0.5286668 | 0.6034949 | -0.0173698 | 0.0103856 |
| YrSold | -0.0178841 | 0.0124741 | -1.4336956 | 0.1688024 | -0.0440913 | 0.0083231 |
# 4.b) Fit a LASSO model considering TWO lambda tuning/complexity parameters at
# once and look at beta-hats
lambda_inputs <- c(10, 1000)
LASSO_fit_b <- glmnet(x = x_matrix, y = training$SalePrice, alpha = 1, lambda = lambda_inputs)
get_LASSO_coefficients(LASSO_fit_b)
| term | estimate | lambda |
|---|---|---|
| (Intercept) | 1.890048e+06 | 1000 |
| MSSubClass | 1.694510e+01 | 1000 |
| LotFrontage | 3.310509e+01 | 1000 |
| LotArea | 3.836266e+00 | 1000 |
| OverallQual | 2.942784e+04 | 1000 |
| OverallCond | 3.007936e+02 | 1000 |
| YearBuilt | 3.143870e+01 | 1000 |
| YearRemodAdd | 0.000000e+00 | 1000 |
| MasVnrArea | 3.635564e+00 | 1000 |
| BsmtFinSF1 | 1.386592e+01 | 1000 |
| BsmtFinSF2 | -2.524315e+01 | 1000 |
| BsmtUnfSF | 0.000000e+00 | 1000 |
| TotalBsmtSF | 1.874543e+01 | 1000 |
| FirstFlrSF | 0.000000e+00 | 1000 |
| SecondFlrSF | 0.000000e+00 | 1000 |
| LowQualFinSF | 0.000000e+00 | 1000 |
| GrLivArea | 2.367634e+01 | 1000 |
| BsmtFullBath | 0.000000e+00 | 1000 |
| BsmtHalfBath | 0.000000e+00 | 1000 |
| FullBath | 0.000000e+00 | 1000 |
| HalfBath | -8.514848e+02 | 1000 |
| BedroomAbvGr | -1.042105e+04 | 1000 |
| KitchenAbvGr | -2.729295e+04 | 1000 |
| TotRmsAbvGrd | 0.000000e+00 | 1000 |
| Fireplaces | 0.000000e+00 | 1000 |
| GarageYrBlt | 3.160257e+00 | 1000 |
| GarageCars | 0.000000e+00 | 1000 |
| GarageArea | 0.000000e+00 | 1000 |
| WoodDeckSF | 3.659517e+01 | 1000 |
| OpenPorchSF | 8.314993e+01 | 1000 |
| EnclosedPorch | -7.676558e+00 | 1000 |
| ThirdSsnPorch | 0.000000e+00 | 1000 |
| ScreenPorch | -2.783537e+01 | 1000 |
| PoolArea | 0.000000e+00 | 1000 |
| MiscVal | -1.188951e+01 | 1000 |
| MoSold | 0.000000e+00 | 1000 |
| YrSold | -1.003344e+03 | 1000 |
| (Intercept) | 4.702230e+06 | 10 |
| MSSubClass | 1.700394e+02 | 10 |
| LotFrontage | 3.180632e+01 | 10 |
| LotArea | 4.030335e+00 | 10 |
| OverallQual | 2.452059e+04 | 10 |
| OverallCond | 1.354994e+04 | 10 |
| YearBuilt | 4.743587e+02 | 10 |
| YearRemodAdd | -2.275040e+02 | 10 |
| MasVnrArea | 4.132174e+01 | 10 |
| BsmtFinSF1 | 2.692454e+01 | 10 |
| BsmtFinSF2 | -8.607825e-01 | 10 |
| BsmtUnfSF | 0.000000e+00 | 10 |
| TotalBsmtSF | 2.115604e+01 | 10 |
| FirstFlrSF | 2.164121e+01 | 10 |
| SecondFlrSF | 5.121103e+01 | 10 |
| LowQualFinSF | 0.000000e+00 | 10 |
| GrLivArea | 2.960674e+01 | 10 |
| BsmtFullBath | -8.619281e+03 | 10 |
| BsmtHalfBath | -1.575791e+04 | 10 |
| FullBath | -1.478083e+04 | 10 |
| HalfBath | -2.667411e+04 | 10 |
| BedroomAbvGr | -9.773136e+03 | 10 |
| KitchenAbvGr | -2.229373e+04 | 10 |
| TotRmsAbvGrd | -5.495253e+03 | 10 |
| Fireplaces | -1.507280e+03 | 10 |
| GarageYrBlt | -8.907350e-01 | 10 |
| GarageCars | 4.166576e+04 | 10 |
| GarageArea | -1.267516e+02 | 10 |
| WoodDeckSF | 3.496014e+00 | 10 |
| OpenPorchSF | 9.140406e+01 | 10 |
| EnclosedPorch | -3.462296e+01 | 10 |
| ThirdSsnPorch | 0.000000e+00 | 10 |
| ScreenPorch | -7.004326e+01 | 10 |
| PoolArea | 0.000000e+00 | 10 |
| MiscVal | -9.772408e+00 | 10 |
| MoSold | -2.607119e+02 | 10 |
| YrSold | -2.637326e+03 | 10 |
# The above output is in tidy/long format, which makes it hard to compare beta-hats
# for both lambda values. Let's convert it to wide format and compare the beta-hats
get_LASSO_coefficients(LASSO_fit_b) %>%
tidyr::spread(lambda, estimate)
| term | lambda = 10 | lambda = 1000 |
|---|---|---|
| (Intercept) | 4.702230e+06 | 1.890048e+06 |
| BedroomAbvGr | -9.773136e+03 | -1.042105e+04 |
| BsmtFinSF1 | 2.692454e+01 | 1.386592e+01 |
| BsmtFinSF2 | -8.607825e-01 | -2.524315e+01 |
| BsmtFullBath | -8.619281e+03 | 0.000000e+00 |
| BsmtHalfBath | -1.575791e+04 | 0.000000e+00 |
| BsmtUnfSF | 0.000000e+00 | 0.000000e+00 |
| EnclosedPorch | -3.462296e+01 | -7.676558e+00 |
| Fireplaces | -1.507280e+03 | 0.000000e+00 |
| FirstFlrSF | 2.164121e+01 | 0.000000e+00 |
| FullBath | -1.478083e+04 | 0.000000e+00 |
| GarageArea | -1.267516e+02 | 0.000000e+00 |
| GarageCars | 4.166576e+04 | 0.000000e+00 |
| GarageYrBlt | -8.907350e-01 | 3.160257e+00 |
| GrLivArea | 2.960674e+01 | 2.367634e+01 |
| HalfBath | -2.667411e+04 | -8.514848e+02 |
| KitchenAbvGr | -2.229373e+04 | -2.729295e+04 |
| LotArea | 4.030335e+00 | 3.836266e+00 |
| LotFrontage | 3.180632e+01 | 3.310509e+01 |
| LowQualFinSF | 0.000000e+00 | 0.000000e+00 |
| MasVnrArea | 4.132174e+01 | 3.635564e+00 |
| MiscVal | -9.772408e+00 | -1.188951e+01 |
| MoSold | -2.607119e+02 | 0.000000e+00 |
| MSSubClass | 1.700394e+02 | 1.694510e+01 |
| OpenPorchSF | 9.140406e+01 | 8.314993e+01 |
| OverallCond | 1.354994e+04 | 3.007936e+02 |
| OverallQual | 2.452059e+04 | 2.942784e+04 |
| PoolArea | 0.000000e+00 | 0.000000e+00 |
| ScreenPorch | -7.004326e+01 | -2.783537e+01 |
| SecondFlrSF | 5.121103e+01 | 0.000000e+00 |
| ThirdSsnPorch | 0.000000e+00 | 0.000000e+00 |
| TotalBsmtSF | 2.115604e+01 | 1.874543e+01 |
| TotRmsAbvGrd | -5.495253e+03 | 0.000000e+00 |
| WoodDeckSF | 3.496014e+00 | 3.659517e+01 |
| YearBuilt | 4.743587e+02 | 3.143870e+01 |
| YearRemodAdd | -2.275040e+02 | 0.000000e+00 |
| YrSold | -2.637326e+03 | -1.003344e+03 |
# Notice how for the larger lambda, many more beta-hats have been shrunk to 0.
# With a large enough lambda, every non-intercept beta-hat would be shrunk to 0,
# leaving only the intercept, whose value would simply be the mean of y.
# This is because lambda = 1000 penalizes complexity more harshly than lambda = 10.
# 4.c) Fit a LASSO model with several lambda tuning/complexity parameters at once
# and look at beta-hats
lambda_inputs <- seq(from = 0, to = 1000)
#lambda_inputs
LASSO_fit_c <- glmnet(x = x_matrix, y = training$SalePrice, alpha = 1, lambda = lambda_inputs)
# Create visualization here:
# Since we are now considering several possible values of lambda tuning parameter
# let's visualize instead:
#get_LASSO_coefficients(LASSO_fit_c) %>%
# Plot:
#ggplot(aes(x = lambda, y = estimate, col = term)) +
#geom_line() +
#labs(x = "lambda", y = "beta-hat")
# However a typical LASSO plot doesn't show the intercept since it is a beta-hat
# value that is not a candidate to be shrunk to zero, so let's remove it from
# our plot:
#get_LASSO_coefficients(LASSO_fit_c) %>%
#filter(term != "(Intercept)") %>%
# Plot:
#ggplot(aes(x = lambda, y = estimate, col = term)) +
#geom_line() +
#labs(x = "lambda", y = "beta-hat")
# It's hard to see in what order the beta-hats get shrunk to 0, so let's zoom-in
# the plot a bit
get_LASSO_coefficients(LASSO_fit_c) %>%
  filter(term != "(Intercept)") %>%
  # Plot:
  ggplot(aes(x = lambda, y = estimate, col = term)) +
  geom_line() +
  labs(x = "lambda", y = "beta-hat") +
  coord_cartesian(xlim = c(0, 500), ylim = c(-10, 10))

# Output data frame of beta-hats for the LASSO model that uses lambda_star:
# 4.d) Fit a LASSO model with a narrower search grid of lambda tuning/complexity
# parameter values AND such that the lambdas are spaced by multiplicative powers
# of 10, instead of additive differences, and look at beta-hats
lambda_inputs <- 10^seq(from = -5, to = 3, length = 100)
#summary(lambda_inputs)
LASSO_fit_d <- glmnet(x = x_matrix, y = training$SalePrice, alpha = 1, lambda = lambda_inputs)
# Plot all beta-hats with lambda on log10-scale
LASSO_coefficients_plot <- get_LASSO_coefficients(LASSO_fit_d) %>%
  filter(term != "(Intercept)") %>%
  # Plot:
  ggplot(aes(x = lambda, y = estimate, col = term)) +
  geom_line() +
  labs(x = "lambda (log10-scale)", y = "beta-hat") +
  scale_x_log10()
#LASSO_coefficients_plot
# Zoom-in. In what order do the beta-hat slopes get shrunk to 0?
#LASSO_coefficients_plot +
# coord_cartesian(xlim = c(10^0, 10^3), ylim = c(-2, 2))
# 5. However, how do we know which lambda value to use? Should we set it to
# yield a less complex or more complex model? Let's use the glmnet package's
# built-in crossvalidation functionality, using the same search grid of
# lambda_input values:
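Crossvalidation assigns rows to folds at random, so the numbers below will vary from run to run; a one-line sketch to make them reproducible (the seed value 76 is an arbitrary choice):
# Make the crossvalidation folds reproducible (sketch; any seed value works):
set.seed(76)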
lambda_inputs <- 10^seq(from = -5, to = 3, length = 100)
LASSO_CV <- cv.glmnet(
  x = x_matrix,
  y = training$SalePrice,
  alpha = 1,
  lambda = lambda_inputs,
  nfolds = 10,
  type.measure = "mse"
)
#LASSO_CV
# Alas that output is not useful, so let's broom::tidy() it
LASSO_CV %>%
  broom::tidy() %>%
  rename(mse = estimate)
| lambda | mse | std.error | conf.low | conf.high | nzero |
|---|---|---|---|---|---|
| 1000.0000000 | 804136465 | 163458899 | 640677566 | 967595364 | 21 |
| 830.2175681 | 854948598 | 176853619 | 678094980 | 1031802217 | 22 |
| 689.2612104 | 902233625 | 189894462 | 712339163 | 1092128088 | 23 |
| 572.2367659 | 963323233 | 207789606 | 755533626 | 1171112839 | 25 |
| 475.0810162 | 1033811119 | 222825836 | 810985283 | 1256636955 | 26 |
| 394.4206059 | 1084910073 | 234685605 | 850224467 | 1319595678 | 28 |
| 327.4549163 | 1118873044 | 250484078 | 868388966 | 1369357123 | 28 |
| 271.8588243 | 1176260430 | 268296794 | 907963636 | 1444557224 | 29 |
| 225.7019720 | 1241154305 | 284092569 | 957061735 | 1525246874 | 30 |
| 187.3817423 | 1293989659 | 298142390 | 995847268 | 1592132049 | 30 |
| 155.5676144 | 1337682028 | 312042471 | 1025639557 | 1649724499 | 29 |
| 129.1549665 | 1374024255 | 326084560 | 1047939695 | 1700108814 | 29 |
| 107.2267222 | 1404897363 | 337704900 | 1067192463 | 1742602264 | 29 |
| 89.0215085 | 1429454495 | 347070078 | 1082384417 | 1776524573 | 30 |
| 73.9072203 | 1456423740 | 354865634 | 1101558106 | 1811289374 | 30 |
| 61.3590727 | 1486220813 | 362121381 | 1124099431 | 1848342194 | 30 |
| 50.9413801 | 1517898304 | 368903730 | 1148994575 | 1886802034 | 31 |
| 42.2924287 | 1545933069 | 375058409 | 1170874660 | 1920991478 | 31 |
| 35.1119173 | 1570718526 | 380729960 | 1189988566 | 1951448485 | 31 |
| 29.1505306 | 1592094172 | 385548409 | 1206545763 | 1977642581 | 31 |
| 24.2012826 | 1610491318 | 389737967 | 1220753351 | 2000229285 | 31 |
| 20.0923300 | 1625538452 | 393318245 | 1232220207 | 2018856697 | 31 |
| 16.6810054 | 1638283873 | 396454348 | 1241829525 | 2034738221 | 31 |
| 13.8488637 | 1650015956 | 399511575 | 1250504381 | 2049527531 | 31 |
| 11.4975700 | 1659972362 | 402002616 | 1257969746 | 2061974978 | 31 |
| 9.5454846 | 1668275696 | 404174912 | 1264100784 | 2072450608 | 31 |
| 7.9248290 | 1675143243 | 406034809 | 1269108434 | 2081178053 | 31 |
| 6.5793322 | 1680775404 | 407390493 | 1273384911 | 2088165897 | 31 |
| 5.4622772 | 1686141910 | 409090928 | 1277050982 | 2095232838 | 32 |
| 4.5348785 | 1689866405 | 410019241 | 1279847164 | 2099885647 | 32 |
| 3.7649358 | 1692735969 | 410965518 | 1281770451 | 2103701487 | 32 |
| 3.1257158 | 1695604282 | 411752249 | 1283852032 | 2107356531 | 32 |
| 2.5950242 | 1698190121 | 412456009 | 1285734112 | 2110646130 | 32 |
| 2.1544347 | 1699766862 | 413033167 | 1286733695 | 2112800028 | 32 |
| 1.7886495 | 1701751478 | 413694309 | 1288057169 | 2115445786 | 32 |
| 1.4849683 | 1703266845 | 414230938 | 1289035907 | 2117497783 | 32 |
| 1.2328467 | 1705168375 | 414837054 | 1290331321 | 2120005429 | 32 |
| 1.0235310 | 1706325358 | 415299987 | 1291025372 | 2121625345 | 32 |
| 0.8497534 | 1707423562 | 415739253 | 1291684309 | 2123162816 | 32 |
| 0.7054802 | 1708465972 | 416157282 | 1292308690 | 2124623255 | 32 |
| 0.5857021 | 1709442469 | 416550699 | 1292891770 | 2125993167 | 32 |
| 0.4862602 | 1710350701 | 416918840 | 1293431861 | 2127269541 | 32 |
| 0.4037017 | 1711196225 | 417263902 | 1293932324 | 2128460127 | 32 |
| 0.3351603 | 1711983297 | 417587429 | 1294395868 | 2129570726 | 32 |
| 0.2782559 | 1712715569 | 417890621 | 1294824947 | 2130606190 | 32 |
| 0.2310130 | 1713396746 | 418174410 | 1295222335 | 2131571156 | 32 |
| 0.1917910 | 1714030022 | 418439950 | 1295590072 | 2132469972 | 32 |
| 0.1592283 | 1714618472 | 418688290 | 1295930182 | 2133306761 | 32 |
| 0.1321941 | 1715165074 | 418920462 | 1296244612 | 2134085537 | 32 |
| 0.1097499 | 1715674496 | 419137409 | 1296537087 | 2134811906 | 32 |
| 0.0911163 | 1716147478 | 419340246 | 1296807232 | 2135487724 | 32 |
| 0.0756463 | 1716587295 | 419529853 | 1297057442 | 2136117147 | 32 |
| 0.0628029 | 1716996645 | 419707109 | 1297289536 | 2136703755 | 33 |
| 0.0521401 | 1717377801 | 419872960 | 1297504841 | 2137250760 | 33 |
| 0.0432876 | 1717733116 | 420028160 | 1297704956 | 2137761276 | 33 |
| 0.0359381 | 1718064848 | 420173554 | 1297891294 | 2138238402 | 33 |
| 0.0298365 | 1718374906 | 420309805 | 1298065102 | 2138684711 | 33 |
| 0.0247708 | 1718665061 | 420437586 | 1298227475 | 2139102647 | 33 |
| 0.0205651 | 1718937189 | 420557689 | 1298379500 | 2139494877 | 33 |
| 0.0170735 | 1719192927 | 420670783 | 1298522145 | 2139863710 | 33 |
| 0.0141747 | 1719433584 | 420777350 | 1298656233 | 2140210934 | 33 |
| 0.0117681 | 1719660370 | 420877852 | 1298782517 | 2140538222 | 33 |
| 0.0097701 | 1719874420 | 420972736 | 1298901684 | 2140847156 | 33 |
| 0.0081113 | 1720076780 | 421062425 | 1299014355 | 2141139205 | 33 |
| 0.0067342 | 1720268415 | 421147315 | 1299121100 | 2141415729 | 33 |
| 0.0055908 | 1720450203 | 421227771 | 1299222432 | 2141677974 | 33 |
| 0.0046416 | 1720622939 | 421304131 | 1299318807 | 2141927070 | 33 |
| 0.0038535 | 1720787339 | 421376706 | 1299410633 | 2142164045 | 33 |
| 0.0031993 | 1720944054 | 421445779 | 1299498275 | 2142389833 | 33 |
| 0.0026561 | 1721093673 | 421511609 | 1299582063 | 2142605282 | 33 |
| 0.0022051 | 1721236729 | 421574433 | 1299662296 | 2142811162 | 33 |
| 0.0018307 | 1721373707 | 421634466 | 1299739241 | 2143008172 | 33 |
| 0.0015199 | 1721505043 | 421691904 | 1299813139 | 2143196947 | 33 |
| 0.0012619 | 1721631134 | 421746926 | 1299884208 | 2143378060 | 33 |
| 0.0010476 | 1721752338 | 421799696 | 1299952642 | 2143552034 | 33 |
| 0.0008697 | 1721868980 | 421850364 | 1300018616 | 2143719343 | 33 |
| 0.0007221 | 1721981354 | 421899065 | 1300082289 | 2143880418 | 33 |
| 0.0005995 | 1722089723 | 421945923 | 1300143801 | 2144035646 | 33 |
| 0.0004977 | 1722194341 | 421991053 | 1300203288 | 2144185395 | 33 |
| 0.0004132 | 1722295423 | 422034560 | 1300260863 | 2144329984 | 33 |
| 0.0003430 | 1722393170 | 422076538 | 1300316632 | 2144469709 | 33 |
| 0.0002848 | 1722487765 | 422117075 | 1300370690 | 2144604840 | 33 |
| 0.0002364 | 1722579375 | 422156252 | 1300423123 | 2144735626 | 33 |
| 0.0001963 | 1722668152 | 422194141 | 1300474010 | 2144862293 | 33 |
| 0.0001630 | 1722754236 | 422230812 | 1300523424 | 2144985047 | 33 |
| 0.0001353 | 1722837754 | 422266325 | 1300571429 | 2145104080 | 33 |
| 0.0001123 | 1722918826 | 422300739 | 1300618086 | 2145219565 | 33 |
| 0.0000933 | 1722997558 | 422334107 | 1300663450 | 2145331665 | 33 |
| 0.0000774 | 1723074049 | 422366478 | 1300707571 | 2145440527 | 33 |
| 0.0000643 | 1723148393 | 422397897 | 1300750497 | 2145546290 | 33 |
| 0.0000534 | 1723220674 | 422428405 | 1300792269 | 2145649079 | 33 |
| 0.0000443 | 1723290972 | 422458043 | 1300832929 | 2145749014 | 33 |
| 0.0000368 | 1723359359 | 422486845 | 1300872514 | 2145846204 | 33 |
| 0.0000305 | 1723425904 | 422514846 | 1300911057 | 2145940750 | 33 |
| 0.0000254 | 1723490671 | 422542078 | 1300948593 | 2146032749 | 33 |
| 0.0000210 | 1723553720 | 422568569 | 1300985152 | 2146122289 | 33 |
| 0.0000175 | 1723615108 | 422594346 | 1301020761 | 2146209454 | 33 |
| 0.0000145 | 1723674886 | 422619437 | 1301055449 | 2146294323 | 33 |
| 0.0000120 | 1723733105 | 422643864 | 1301089242 | 2146376969 | 33 |
| 0.0000100 | 1723789813 | 422667650 | 1301122162 | 2146457463 | 33 |
# What is the smallest estimated mse?
LASSO_CV %>%
  broom::tidy() %>%
  rename(mse = estimate) %>%
  arrange(mse)
(The sorted output is identical to the table above: the estimated MSE is already smallest at lambda = 1000 and increases monotonically as lambda decreases, so sorting by mse leaves the row order unchanged and the lambda = 1000 row sits at the top.)
# The lambda_star is in the top row. We can extract this lambda_star value from
# the LASSO_CV object:
lambda_star <- LASSO_CV$lambda.min
lambda_star
## [1] 1000
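Note that \(\lambda^* = 1000\) sits at the upper edge of the search grid, so widening the grid may be worth trying. Also, the assignment asks for the estimated RMLSE at \(\lambda^*\), while the crossvalidation above measures squared error on SalePrice in dollars; one hedged sketch is to crossvalidate on the log outcome (log_SalePrice, defined at the top of this document) and take the square root of the smallest crossvalidated MSE:
# Sketch (assumption): estimate the RMLSE by crossvalidating on log_SalePrice instead
LASSO_CV_log <- cv.glmnet(
  x = x_matrix,
  y = training$log_SalePrice,
  alpha = 1,
  lambda = lambda_inputs,
  nfolds = 10,
  type.measure = "mse"
)
sqrt(min(LASSO_CV_log$cvm))  # estimated RMLSE at the best lambda on the log scale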
# Visualize the progression of beta-hats for different lambda values and mark lambda_star with a vertical line:
# What do all these values mean? For each value of the lambda
# tuning/complexity parameter, let's plot the estimated MSE generated by
# crossvalidation:
CV_plot <- LASSO_CV %>%
  broom::tidy() %>%
  rename(mse = estimate) %>%
  arrange(mse) %>%
  # plot:
  ggplot(aes(x = lambda)) +
  geom_point(aes(y = mse)) +
  scale_x_log10() +
  labs(x = "lambda (log10-scale)", y = "Estimated MSE")
#CV_plot
# Zoom in on the lambda range where the estimated MSE changes:
CV_plot +
  coord_cartesian(xlim = c(10^0, 10^3), ylim = c(0.75e9, 1.75e9))

# Mark the lambda_star with dashed blue line
CV_plot +
  coord_cartesian(xlim = c(10^0, 10^3), ylim = c(0.75e9, 1.75e9)) +
  geom_vline(xintercept = lambda_star, linetype = "dashed", col = "blue")

# 6. Now mark lambda_star in beta-hat vs lambda plot:
LASSO_coefficients_plot +
geom_vline(xintercept = lambda_star, linetype = "dashed", col = "blue")

# zoom-in:
LASSO_coefficients_plot +
geom_vline(xintercept = lambda_star, linetype = "dashed", col = "blue") +
coord_cartesian(ylim = c(-3, 3))

# What are the beta_hat values resulting from lambda_star? Which are shrunk to 0?
get_LASSO_coefficients(LASSO_fit_d) %>%
filter(lambda == lambda_star)
| term | estimate | lambda |
|---|---|---|
| (Intercept) | 1.890048e+06 | 1000 |
| MSSubClass | 1.694510e+01 | 1000 |
| LotFrontage | 3.310509e+01 | 1000 |
| LotArea | 3.836266e+00 | 1000 |
| OverallQual | 2.942784e+04 | 1000 |
| OverallCond | 3.007936e+02 | 1000 |
| YearBuilt | 3.143870e+01 | 1000 |
| YearRemodAdd | 0.000000e+00 | 1000 |
| MasVnrArea | 3.635564e+00 | 1000 |
| BsmtFinSF1 | 1.386592e+01 | 1000 |
| BsmtFinSF2 | -2.524315e+01 | 1000 |
| BsmtUnfSF | 0.000000e+00 | 1000 |
| TotalBsmtSF | 1.874543e+01 | 1000 |
| FirstFlrSF | 0.000000e+00 | 1000 |
| SecondFlrSF | 0.000000e+00 | 1000 |
| LowQualFinSF | 0.000000e+00 | 1000 |
| GrLivArea | 2.367634e+01 | 1000 |
| BsmtFullBath | 0.000000e+00 | 1000 |
| BsmtHalfBath | 0.000000e+00 | 1000 |
| FullBath | 0.000000e+00 | 1000 |
| HalfBath | -8.514848e+02 | 1000 |
| BedroomAbvGr | -1.042105e+04 | 1000 |
| KitchenAbvGr | -2.729295e+04 | 1000 |
| TotRmsAbvGrd | 0.000000e+00 | 1000 |
| Fireplaces | 0.000000e+00 | 1000 |
| GarageYrBlt | 3.160257e+00 | 1000 |
| GarageCars | 0.000000e+00 | 1000 |
| GarageArea | 0.000000e+00 | 1000 |
| WoodDeckSF | 3.659517e+01 | 1000 |
| OpenPorchSF | 8.314993e+01 | 1000 |
| EnclosedPorch | -7.676558e+00 | 1000 |
| ThirdSsnPorch | 0.000000e+00 | 1000 |
| ScreenPorch | -2.783537e+01 | 1000 |
| PoolArea | 0.000000e+00 | 1000 |
| MiscVal | -1.188951e+01 | 1000 |
| MoSold | 0.000000e+00 | 1000 |
| YrSold | -1.003344e+03 | 1000 |
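To answer "which are shrunk to 0?" directly, the same output can simply be filtered (a small sketch):
get_LASSO_coefficients(LASSO_fit_d) %>%
  filter(lambda == lambda_star, estimate == 0)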
# Fit & predict
# 7. Get predictions from f_hat LASSO model using lambda_star
training <- training %>%
mutate(y_hat_LASSO = predict(LASSO_fit_d, newx = x_matrix, s = lambda_star)[,1])
# model matrix representation of predictor variables for training set:
x_matrix_train <- training %>%
  modelr::model_matrix(model_formula, data = .) %>%
  select(-`(Intercept)`) %>%
  as.matrix()
# model matrix representation of predictor variables for test set:
x_matrix_test <- test %>%
  modelr::model_matrix(model_formula, data = .) %>%
  select(-`(Intercept)`) %>%
  as.matrix()
# The previous didn't work b/c there is no outcome variable SalePrice in test as
# specified in model_formula. The solution is to create a temporary dummy
# variable of 1's (or any value); it makes no difference since ultimately we
# only care about the x values.
x_matrix_test <- test %>%
  # Create temporary outcome variable just to get model_matrix() to work:
  mutate(SalePrice = 1) %>%
  modelr::model_matrix(model_formula, data = .) %>%
  select(-`(Intercept)`) %>%
  as.matrix()
# Fit/train model to training set using lambda star
LASSO_fit_train <- glmnet(x = x_matrix_train, y = training$SalePrice, alpha = 1, lambda = lambda_star)
# Predict y_hat's for test data using model and same lambda = lambda_star.
test_res <- test %>%
mutate(y_hat_LASSO = predict(LASSO_fit_train, newx = x_matrix_test, s = lambda_star)[,1])
test_res
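Finally, a hedged sketch of writing out a submission whose columns match sample_submission (Id and SalePrice). This assumes test retains Kaggle's Id column, and the output path data/submission.csv is an arbitrary choice:
# Sketch: build a submission data frame matching the structure of sample_submission
submission <- test_res %>%
  select(Id, SalePrice = y_hat_LASSO)
write_csv(submission, "data/submission.csv")
# Check the structure against sample_submission before uploading:
#glimpse(sample_submission)
#glimpse(submission)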