ml4econ

In order to check for missing values we will use complete.cases:

Credit_1 <- Credit
complete.cases(Credit_1) %>% sum()

## [1] 400

Credit_2 <- model.matrix(data = Credit_1,Income~.) %>% as.matrix()
Credit_2 <- Credit_2[,-c(1:2)]
y <- Credit_1$Income %>% as.matrix()

1

Ridge_reg <- glmnet(x = Credit_2,
       y = y,alpha = 0)
elastic_net_reg_low <- glmnet(x = Credit_2,
                    y = y,alpha = 0.2)
elastic_net_reg_high <- glmnet(x = Credit_2,
                          y = y,alpha = 0.8)
Lasso_reg <- glmnet(x = Credit_2,
                    y = y,alpha = 1)

In the upcoming boxes we can see the number of omitted variables in the Lasso, Ridge, elastic (with \(\alpha\) = 0.8) and elastic (with \(\alpha\) = 0.2) regressions, respectively:

Lasso_reg$df

##  [1]  0  1  1  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3
## [24]  3  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  5  5  6  6  7  7  7
## [47]  8  8  8  8  8  8  8  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10
## [70] 10 11 11 11 11

Ridge_reg$df

##   [1] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
##  [24] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
##  [47] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
##  [70] 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
##  [93] 11 11 11 11 11 11 11 11

elastic_net_reg_high$df

##  [1]  0  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3
## [24]  3  4  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  6  7  7  8  8  8
## [47]  9  9  9  9  8  8  8  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10
## [70] 10 10 10 11 11 11 11 11

elastic_net_reg_low$df

##  [1]  0  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3
## [24]  3  3  3  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  6  6  6  7  7
## [47]  7  8  9  9  9  9  9  9  9  9  9  9 10 10 10 10 11 11 11 11 11 11 11
## [70] 11 11 11 11 11 11 11 11 11 10 10 10 10 11 11 11 11 11 11 11 11 11 11
## [93] 11

In the upcoming boxes we can see the graphs of the Lasso, Ridge, elastic (with \(\alpha\) = 0.8) and elastic (with \(\alpha\) = 0.2) regressions with variables value against log(lambda), respectively:

plotting <- function(reg){
tmp <- coef(reg) %>% as.matrix() %>% as.data.frame()
tmp$coef <- row.names(tmp)
tmp <- reshape::melt(tmp, id = "coef")
tmp$variable <- as.numeric(gsub("s", "", tmp$variable))
# extract the lambda values
tmp$lambda <- reg$lambda[tmp$variable+1] 
tmp$norm <- apply(abs(coef(reg)[-1,]),
                  2, sum)[tmp$variable+1] # compute L1 norm

ggplot(tmp[tmp$coef != "(Intercept)",],
       aes(lambda, value, color = coef, linetype = coef)) + 
  geom_line() + 
  scale_x_log10() + 
  xlab("Lambda (log scale)") + 
  guides(color = guide_legend(title = ""), 
         linetype = guide_legend(title = "")) +
  theme_bw() + 
  theme(legend.key.width = unit(3,"lines"))
}
plotting(Lasso_reg)

plotting(Ridge_reg)

plotting(elastic_net_reg_high)

plotting(elastic_net_reg_low)

set.seed(123)
sample <-  sample.split(Credit_1,SplitRatio = 0.75)
train1 <- subset(Credit_1,sample ==TRUE)
test1  <- subset(Credit_1, sample==FALSE)
y_train <- train1$Income
y_test <- test1$Income
train1 <-  model.matrix(data = train1,Income~.) %>% as.matrix()
test1 <-  model.matrix(data = test1,Income~.) %>% as.matrix()
train1 <- train1[,-1]
test1 <- test1[,-1]

Ridge_reg_cv <- cv.glmnet(x = train1,
                    y = y_train,nfolds = 10,alpha = 0)
Lasso_reg_cv <- cv.glmnet(x = train1,
                          y = y_train,nfolds = 10,alpha = 1)
#Ridge_reg_cv$lambda.min
#Lasso_reg_cv$lambda.min

The min \(\lambda\) for the Ridge regression is: 3.0432174, and for the Lasso regression is 0.1824361.

Next, we will plot MSE against log(\(\lambda\)) for each model:

plot(Ridge_reg_cv)

plot(Lasso_reg_cv)

prediction_Ridge <- predict(Ridge_reg_cv,Ridge_reg_cv$lambda.min,
                            newx = test1)
prediction_Lasso <- predict(Lasso_reg_cv,Lasso_reg_cv$lambda.min,
                            newx = test1)

#mean((y_test - prediction_Ridge)^2)
#mean((y_test - prediction_Lasso)^2)

The MSE for the Ridge regression is 198.2677045 and for the Lasso regression is 120.1863162

2

As we will see - all the variables are ready except for the color:

winequality <- read_excel("C:/Users/dorgo/Documents/R/winequality.xlsx")
View(winequality)
lapply(winequality, class)

## $`fixed acidity`
## [1] "numeric"
## 
## $`volatile acidity`
## [1] "numeric"
## 
## $`citric acid`
## [1] "numeric"
## 
## $`residual sugar`
## [1] "numeric"
## 
## $chlorides
## [1] "numeric"
## 
## $`free sulfur dioxide`
## [1] "numeric"
## 
## $`total sulfur dioxide`
## [1] "numeric"
## 
## $density
## [1] "numeric"
## 
## $pH
## [1] "numeric"
## 
## $sulphates
## [1] "numeric"
## 
## $alcohol
## [1] "numeric"
## 
## $color
## [1] "character"
## 
## $quality
## [1] "numeric"

winequality_traits <- model.matrix(data = winequality, quality~.) %>% 
  as.data.frame()
colnames(winequality_traits) <- gsub("\\(|\\)|`",""
                                     ,colnames(winequality_traits))
winequality_traits <- winequality_traits %>% 
  select(-Intercept)
winequality <- cbind(winequality$quality,winequality_traits)
colnames(winequality)[1] <- "quality"
rm(winequality_traits)

Now we will see the summary of the pcr, with 6 PC’s:

principal_component <- pcr(quality~.,data = winequality,
                           scale = TRUE,,ncomp = 6)
summary(principal_component)

## Data:    X dimension: 6497 12 
##  Y dimension: 6497 1
## Fit method: svdpc
## Number of components considered: 6
## TRAINING: % variance explained
##          1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X         31.745    52.82    65.83    73.92    79.95    85.07
## quality    1.687    10.25    14.42    17.25    20.84    22.71

sample <-  sample.split(winequality,SplitRatio = 0.75)
train1 <- subset(winequality,sample ==TRUE)
test1  <- subset(winequality, sample==FALSE)
y_test <- test1$quality
x_test <- test1[,2:13]

Now we will see the summary of the pcr, with cross-validation:

principal_component_CV <- pcr(quality~.,data = train1,
                           scale = TRUE,validation = "CV")
summary(principal_component_CV)

## Data:    X dimension: 4497 12 
##  Y dimension: 4497 1
## Fit method: svdpc
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV          0.8663   0.8601   0.8259   0.8059   0.7934   0.7727   0.7653
## adjCV       0.8663   0.8601   0.8258   0.8058   0.7934   0.7726   0.7652
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV      0.7563   0.7532   0.7335    0.7335    0.7333    0.7309
## adjCV   0.7559   0.7531   0.7333    0.7333    0.7332    0.7307
## 
## TRAINING: % variance explained
##          1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X         31.717   52.789    65.73    73.91    79.90    85.02    89.45
## quality    1.524    9.268    13.65    16.35    20.73    22.37    24.28
##          8 comps  9 comps  10 comps  11 comps  12 comps
## X          93.65    96.63     98.77     99.78    100.00
## quality    24.78    28.71     28.75     28.81     29.32

It is hard to spot the optimal PC’s, so we will use the function selectNcomp:

selectNcomp(principal_component_CV,method = "onesigma")

## [1] 9

Next we will plot the MSEP against the number of components:

validationplot(principal_component_CV, val.type = "MSEP")

pcr_pred <- predict(pcr(quality~.,data = test1),ncomp = 9,
        newx = test1)
#mean((pcr_pred - y_test)^2)

The MSE in the test is: 0.5641212.

ml4econ_2

Dor Goldenberg

May 6, 2019

1

2