To check for missing values we will use complete.cases():
Credit_1 <- Credit
complete.cases(Credit_1) %>% sum()
## [1] 400
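All 400 rows are complete, so there is no missing data to handle. As a quick cross-check (a hedged alternative, not in the original), one could count the NA values directly, which should give zero:
sum(is.na(Credit_1))  # should be 0 if there are no missing values
Next we build the design matrix and the response vector: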
Credit_2 <- model.matrix(data = Credit_1, Income ~ .) %>% as.matrix()  # build the design matrix
Credit_2 <- Credit_2[, -c(1:2)]        # drop the intercept and ID columns
y <- Credit_1$Income %>% as.matrix()   # response: Income
Ridge_reg <- glmnet(x = Credit_2, y = y, alpha = 0)               # Ridge
elastic_net_reg_low <- glmnet(x = Credit_2, y = y, alpha = 0.2)   # elastic net, closer to Ridge
elastic_net_reg_high <- glmnet(x = Credit_2, y = y, alpha = 0.8)  # elastic net, closer to Lasso
Lasso_reg <- glmnet(x = Credit_2, y = y, alpha = 1)               # Lasso
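The four fits differ only in the mixing parameter \(\alpha\). For reference, glmnet minimizes the penalized least-squares objective

\[
\min_{\beta_0,\,\beta}\; \frac{1}{2n}\sum_{i=1}^{n}\bigl(y_i - \beta_0 - x_i^\top\beta\bigr)^2 \;+\; \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 + \alpha\lVert\beta\rVert_1\right],
\]

so \(\alpha = 0\) gives the Ridge penalty, \(\alpha = 1\) gives the Lasso penalty, and intermediate values mix the two.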
In the upcoming boxes we can see the number of non-zero (selected) coefficients along the regularization path of the Lasso, Ridge, elastic-net (with \(\alpha\) = 0.8) and elastic-net (with \(\alpha\) = 0.2) regressions, respectively:
Lasso_reg$df
## [1] 0 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
## [24] 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 6 6 7 7 7
## [47] 8 8 8 8 8 8 8 9 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10
## [70] 10 11 11 11 11
Ridge_reg$df
## [1] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
## [24] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
## [47] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
## [70] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
## [93] 11 11 11 11 11 11 11 11
elastic_net_reg_high$df
## [1] 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
## [24] 3 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 7 7 8 8 8
## [47] 9 9 9 9 8 8 8 9 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10
## [70] 10 10 10 11 11 11 11 11
elastic_net_reg_low$df
## [1] 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
## [24] 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 6 6 6 7 7
## [47] 7 8 9 9 9 9 9 9 9 9 9 9 10 10 10 10 11 11 11 11 11 11 11
## [70] 11 11 11 11 11 11 11 11 11 10 10 10 10 11 11 11 11 11 11 11 11 11 11
## [93] 11
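To compare the four fits side by side, a small hedged sketch (not part of the original code) can pair each \(\lambda\) with the number of non-zero coefficients of every model in one data frame:
# hypothetical helper: pair each lambda with its number of non-zero coefficients
df_path <- function(reg, name){
  data.frame(model = name, lambda = reg$lambda, nonzero = reg$df)
}
paths <- rbind(df_path(Lasso_reg, "Lasso"),
               df_path(Ridge_reg, "Ridge"),
               df_path(elastic_net_reg_high, "Elastic (alpha = 0.8)"),
               df_path(elastic_net_reg_low, "Elastic (alpha = 0.2)"))
head(paths)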
In the upcoming boxes we can see the graphs of the Lasso, Ridge, elastic-net (with \(\alpha\) = 0.8) and elastic-net (with \(\alpha\) = 0.2) regressions, plotting the coefficient values against \(\lambda\) on a log scale, respectively:
plotting <- function(reg){
  tmp <- coef(reg) %>% as.matrix() %>% as.data.frame()
  tmp$coef <- row.names(tmp)
  tmp <- reshape::melt(tmp, id = "coef")
  tmp$variable <- as.numeric(gsub("s", "", tmp$variable))
  # extract the lambda values
  tmp$lambda <- reg$lambda[tmp$variable + 1]
  # compute the L1 norm of the non-intercept coefficients
  tmp$norm <- apply(abs(coef(reg)[-1, ]), 2, sum)[tmp$variable + 1]
  ggplot(tmp[tmp$coef != "(Intercept)", ],
         aes(lambda, value, color = coef, linetype = coef)) +
    geom_line() +
    scale_x_log10() +
    xlab("Lambda (log scale)") +
    guides(color = guide_legend(title = ""),
           linetype = guide_legend(title = "")) +
    theme_bw() +
    theme(legend.key.width = unit(3, "lines"))
}
plotting(Lasso_reg)
plotting(Ridge_reg)
plotting(elastic_net_reg_high)
plotting(elastic_net_reg_low)
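The helper above also computes the L1 norm of the coefficients at each \(\lambda\) but does not display it; a hedged variant (not in the original) plots the same coefficient paths against that norm instead, which is another common way to show regularization paths:
plotting_norm <- function(reg){
  tmp <- coef(reg) %>% as.matrix() %>% as.data.frame()
  tmp$coef <- row.names(tmp)
  tmp <- reshape::melt(tmp, id = "coef")
  tmp$variable <- as.numeric(gsub("s", "", tmp$variable))
  # L1 norm of the non-intercept coefficients at each lambda
  tmp$norm <- apply(abs(coef(reg)[-1, ]), 2, sum)[tmp$variable + 1]
  ggplot(tmp[tmp$coef != "(Intercept)", ],
         aes(norm, value, color = coef, linetype = coef)) +
    geom_line() +
    xlab("L1 norm") +
    theme_bw()
}
plotting_norm(Lasso_reg)
Next, we split the data into training and test sets and use cross-validation to choose \(\lambda\):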
set.seed(123)
sample <- sample.split(Credit_1, SplitRatio = 0.75)  # sample.split() from the caTools package
train1 <- subset(Credit_1, sample == TRUE)
test1 <- subset(Credit_1, sample == FALSE)
y_train <- train1$Income
y_test <- test1$Income
train1 <- model.matrix(data = train1, Income ~ .) %>% as.matrix()
test1 <- model.matrix(data = test1, Income ~ .) %>% as.matrix()
train1 <- train1[, -1]  # drop the intercept column
test1 <- test1[, -1]
Ridge_reg_cv <- cv.glmnet(x = train1, y = y_train, nfolds = 10, alpha = 0)
Lasso_reg_cv <- cv.glmnet(x = train1, y = y_train, nfolds = 10, alpha = 1)
# Ridge_reg_cv$lambda.min
# Lasso_reg_cv$lambda.min
The minimizing \(\lambda\) (lambda.min) for the Ridge regression is 3.0432174, and for the Lasso regression it is 0.1824361.
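To see which predictors each model keeps at that \(\lambda\), the fitted coefficients can be extracted directly (a hedged sketch, not in the original):
# coefficients at the lambda that minimizes the cross-validated error
coef(Lasso_reg_cv, s = "lambda.min")
coef(Ridge_reg_cv, s = "lambda.min")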
Next, we will plot MSE against log(\(\lambda\)) for each model:
plot(Ridge_reg_cv)
plot(Lasso_reg_cv)
prediction_Ridge <- predict(Ridge_reg_cv, newx = test1,
                            s = Ridge_reg_cv$lambda.min)
prediction_Lasso <- predict(Lasso_reg_cv, newx = test1,
                            s = Lasso_reg_cv$lambda.min)
#mean((y_test - prediction_Ridge)^2)
#mean((y_test - prediction_Lasso)^2)
The MSE for the Ridge regression is 198.2677045, and for the Lasso regression it is 120.1863162.
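For context, a hedged baseline (not part of the original comparison) is an intercept-only model that always predicts the mean training income:
# baseline MSE: predict the mean of y_train for every test observation
mean((y_test - mean(y_train))^2)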
Moving on to the wine-quality data: as we will see, all the variables are ready for modeling (numeric) except for color:
winequality <- read_excel("C:/Users/dorgo/Documents/R/winequality.xlsx")
View(winequality)
lapply(winequality, class)
## $`fixed acidity`
## [1] "numeric"
##
## $`volatile acidity`
## [1] "numeric"
##
## $`citric acid`
## [1] "numeric"
##
## $`residual sugar`
## [1] "numeric"
##
## $chlorides
## [1] "numeric"
##
## $`free sulfur dioxide`
## [1] "numeric"
##
## $`total sulfur dioxide`
## [1] "numeric"
##
## $density
## [1] "numeric"
##
## $pH
## [1] "numeric"
##
## $sulphates
## [1] "numeric"
##
## $alcohol
## [1] "numeric"
##
## $color
## [1] "character"
##
## $quality
## [1] "numeric"
winequality_traits <- model.matrix(data = winequality, quality ~ .) %>%
  as.data.frame()
# clean the column names: remove parentheses and backticks
colnames(winequality_traits) <- gsub("\\(|\\)|`", "",
                                     colnames(winequality_traits))
winequality_traits <- winequality_traits %>%
  select(-Intercept)  # drop the intercept column
winequality <- cbind(winequality$quality, winequality_traits)
colnames(winequality)[1] <- "quality"
rm(winequality_traits)
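After these steps every column should be numeric, with color encoded as a dummy variable. A quick hedged check (not in the original):
# all columns should now be numeric
table(sapply(winequality, class))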
Now we will look at the summary of the PCR fit with 6 principal components:
principal_component <- pcr(quality ~ ., data = winequality,
                           scale = TRUE, ncomp = 6)
summary(principal_component)
## Data: X dimension: 6497 12
## Y dimension: 6497 1
## Fit method: svdpc
## Number of components considered: 6
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 31.745 52.82 65.83 73.92 79.95 85.07
## quality 1.687 10.25 14.42 17.25 20.84 22.71
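The same percentages of explained X variance can be obtained programmatically with explvar() from the pls package (a hedged sketch, not in the original):
# cumulative % of the X variance explained by the first 6 components
cumsum(explvar(principal_component))
Next we split the wine data into training and test sets: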
sample <- sample.split(winequality, SplitRatio = 0.75)
train1 <- subset(winequality, sample == TRUE)
test1 <- subset(winequality, sample == FALSE)
y_test <- test1$quality
x_test <- test1[, 2:13]
Now we will look at the summary of the PCR fit on the training set, with 10-fold cross-validation:
principal_component_CV <- pcr(quality~.,data = train1,
scale = TRUE,validation = "CV")
summary(principal_component_CV)
## Data: X dimension: 4497 12
## Y dimension: 4497 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.8663 0.8601 0.8259 0.8059 0.7934 0.7727 0.7653
## adjCV 0.8663 0.8601 0.8258 0.8058 0.7934 0.7726 0.7652
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 0.7563 0.7532 0.7335 0.7335 0.7333 0.7309
## adjCV 0.7559 0.7531 0.7333 0.7333 0.7332 0.7307
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.717 52.789 65.73 73.91 79.90 85.02 89.45
## quality 1.524 9.268 13.65 16.35 20.73 22.37 24.28
## 8 comps 9 comps 10 comps 11 comps 12 comps
## X 93.65 96.63 98.77 99.78 100.00
## quality 24.78 28.71 28.75 28.81 29.32
It is hard to spot the optimal number of components from the table alone, so we will use the function selectNcomp:
selectNcomp(principal_component_CV,method = "onesigma")
## [1] 9
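selectNcomp() also offers a permutation-based rule; as a hedged cross-check (not in the original) it can be run with method = "randomization", and plot = TRUE visualizes the selection:
# alternative selection rule based on randomization tests
selectNcomp(principal_component_CV, method = "randomization", plot = TRUE)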
Next we will plot the MSEP against the number of components:
validationplot(principal_component_CV, val.type = "MSEP")
# note: this fits a fresh PCR model on the test data and predicts on the same data
pcr_pred <- predict(pcr(quality ~ ., data = test1), ncomp = 9,
                    newdata = test1)
#mean((pcr_pred - y_test)^2)
The MSE on the test set is 0.5641212.
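For comparison, the cross-validated RMSEP at 9 components was about 0.7335, which corresponds to an MSE of roughly \(0.7335^2 \approx 0.538\), in the same range as this result.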