#install.packages("glmnet", repos = "http://cran.us.r-project.org")
library("glmnet")
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
#install.packages("ISLR", repos = "http://cran.us.r-project.org")
library(ISLR)
dat<-Credit
attach(dat)
#View(dat)
#summary(dat)
str(dat)
## 'data.frame': 400 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
…and check for NAs and irrelevant variables:
length(which(is.na(dat)))
## [1] 0
–> There are no missing values (the vector of missing-value positions has length 0).
find non-quantitative variables:
colnames(dat[,sapply(dat,class)=="factor"])
## [1] "Gender" "Student" "Married" "Ethnicity"
create dat1 with the factor variables recoded as dummy variables (the irrelevant ID column is dropped later, when the predictor matrix is built):
dat1 <- model.matrix(Income~., data = dat)  # factors become 0/1 dummy columns
dat1 <- dat1[,-1]                           # drop the intercept column
head(dat1)
## ID Limit Rating Cards Age Education GenderFemale StudentYes MarriedYes
## 1 1 3606 283 2 34 11 0 0 1
## 2 2 6645 483 3 82 15 1 1 1
## 3 3 7075 514 4 71 11 0 0 0
## 4 4 9504 681 3 36 11 1 0 0
## 5 5 4897 357 2 68 16 0 0 1
## 6 6 8047 569 4 77 10 0 0 0
## EthnicityAsian EthnicityCaucasian Balance
## 1 0 1 333
## 2 1 0 903
## 3 1 0 580
## 4 1 0 964
## 5 0 1 331
## 6 0 1 1151
The response variable y (Income) and the standardized predictor matrix x (dropping the ID column, which is not a real predictor):
y <- dat$Income
x <- scale(dat1[,-1])  # column 1 of dat1 is ID
lasso <- glmnet(x,y,alpha=1)
ridge <- glmnet(x,y,alpha=0)
elastic_net1 <- glmnet(x,y,alpha=0.01)
elastic_net2 <- glmnet(x,y,alpha=0.9)
I’ll plot the coefficients against log(lambda) for each model (the top axis shows the number of non-zero coefficients; recall that in glmnet alpha = 1 gives the lasso penalty, alpha = 0 gives ridge, and intermediate values mix the two):
print("Lasso:")
## [1] "Lasso:"
plot(lasso,xvar="lambda",label=TRUE)
print("Ridge:")
## [1] "Ridge:"
plot(ridge,xvar="lambda",label=TRUE)
print("Elastic net 1:")
## [1] "Elastic net 1:"
plot(elastic_net1,xvar="lambda",label=TRUE)
print("Elastic net 2:")
## [1] "Elastic net 2:"
plot(elastic_net2,xvar="lambda",label=TRUE)
As lambda gets smaller, the coefficients get bigger, but while the coefficient paths of the ridge model and the alpha = 0.01 elastic net are monotonic and smooth, those of the lasso and the alpha = 0.9 elastic net are neither monotonic nor smooth, and for some variables they even trend downward from some point. It is also clear that in the lasso model all coefficients eventually zero out, while in the ridge model none of them do. This is because the lasso constraint is built from absolute values, which gives the constraint region sharp corners, while the ridge constraint is quadratic, which gives a smooth, rounded region.
6. Use model$df to check how many non-zero coefficients there are for each lambda in the different methods.
plot(lasso$lambda,lasso$df)
plot(ridge$lambda,ridge$df)
plot(elastic_net1$lambda,elastic_net1$df)
plot(elastic_net2$lambda,elastic_net2$df)
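To see the same contrast directly in the numbers, we can compare the coefficients of the lasso and ridge fits at a common lambda (a minimal sketch; s = 1 is an arbitrary, untuned value):
# Coefficients of the lasso and ridge fits at the same (arbitrary) lambda:
# the lasso can set coefficients exactly to zero, ridge only shrinks them.
cbind(lasso = as.vector(coef(lasso, s = 1)),
      ridge = as.vector(coef(ridge, s = 1)))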
7. Use plot(model) to view a plot of the coefficients as a function of lambda. –> see section 5
# 25%/75% test/train split (note: no seed is set, so the split and the results below vary between runs)
index <- sample(x=1:nrow(x), size=.25*nrow(x))
test_x <- x[index,]
train_x <- x[-index,]
test_y <- y[index]
train_y <- y[-index]
ridge_cv <- cv.glmnet(train_x, train_y, alpha = 0, nfolds = 5)
lasso_cv <- cv.glmnet(train_x, train_y, alpha = 1, nfolds = 5)
ridge_cv$lambda.min
## [1] 2.873112
lasso_cv$lambda.min
## [1] 1.606302
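Out of curiosity, we can also inspect which coefficients the lasso keeps at its selected lambda (a minimal sketch; which predictors survive depends on the random split above):
# Coefficients at the CV-selected lambda; "." entries are exactly zero
coef(lasso_cv, s = "lambda.min")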
Ridge CV:
plot(ridge_cv)
Lasso CV:
plot(lasso_cv)
y_hat_ridge <- predict(ridge_cv, s = ridge_cv$lambda.min, newx = test_x)
y_hat_lasso <- predict(lasso_cv, s = lasso_cv$lambda.min, newx = test_x)
ridge_MSE <-mean((y_hat_ridge-test_y)^2)
lasso_MSE <-mean((y_hat_lasso-test_y)^2)
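For reference, the two test errors can be printed side by side (output not shown here, since it varies with the random split):
# Test MSE of each tuned model (depends on the un-seeded random split above)
c(ridge = ridge_MSE, lasso = lasso_MSE)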
It turned out that the ridge model has a smaller MSE, and so supposedly better predictions, but plotting the two sets of predictions against each other shows that they are quite similar:
plot(y_hat_lasso,y_hat_ridge)
abline(0,1,lwd=2)
And plotting both sets of predictions against the true values shows how similar the two models are in their predictions:
library(lattice)
xyplot(y_hat_lasso + y_hat_ridge ~ test_y, auto.key = list(points = FALSE, lines = TRUE))
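A single number tells the same story (a minimal sketch; the exact value depends on the random split):
# Correlation between the lasso and ridge test-set predictions
cor(as.vector(y_hat_lasso), as.vector(y_hat_ridge))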
In this exercise we will predict wine quality from various chemical properties of the wine, using the Wine data set.
1. Load the Wine data - view the data and the variable classes. (I converted the file to csv.)
wine_df <- read.csv('winequality.csv')
attach(wine_df)
str(wine_df)
## 'data.frame': 6497 obs. of 13 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ color : Factor w/ 2 levels "red","white": 1 1 1 1 1 1 1 1 1 1 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
Is the data ready for modeling? I’ll check for missing values, recode color as a dummy variable, and scale the data:
wine_df[!complete.cases(wine_df),]
## [1] fixed.acidity volatile.acidity citric.acid
## [4] residual.sugar chlorides free.sulfur.dioxide
## [7] total.sulfur.dioxide density pH
## [10] sulphates alcohol color
## [13] quality
## <0 rows> (or 0-length row.names)
wine_df$color <- ifelse(wine_df$color=='red', 1, 0)  # recode color as a 0/1 dummy (1 = red)
wine_df <- as.data.frame(scale(wine_df))  # standardize all columns (including the response, quality)
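As a quick sanity check (output omitted), every column should now be numeric with mean approximately 0 and standard deviation 1:
# All columns should be standardized after scale()
round(colMeans(wine_df), 3)
round(apply(wine_df, 2, sd), 3)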
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr_fit <- pcr(formula = quality ~ ., data = wine_df, scale = FALSE, ncomp = 10, validation = "none")
summary(pcr_fit)
## Data: X dimension: 6497 12
## Y dimension: 6497 1
## Fit method: svdpc
## Number of components considered: 10
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.745 52.82 65.83 73.92 79.95 85.07 89.54
## quality 1.687 10.25 14.42 17.25 20.84 22.71 23.73
## 8 comps 9 comps 10 comps
## X 93.73 96.66 98.80
## quality 24.75 29.21 29.21
As we can see, the percentage of variance explained increases with the number of components; with 7 components, almost 90% of the variance in X is explained (though only about 24% of the variance in quality).
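The cumulative explained X variance can also be read off directly with pls::explvar (a minimal sketch; this should reproduce the "% variance explained" row for X above):
# Cumulative percentage of X variance explained by the first 10 components
cumsum(explvar(pcr_fit))[1:10]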
# Coefficients of the 12 (already standardized) predictors for the models with 1-10 components
matrix(pcr_fit$coefficients, 12, 10, dimnames = list(names(wine_df)[-13]))
## [,1] [,2] [,3] [,4]
## fixed.acidity -0.017102272 -0.067281264 0.006277672 -0.021938750
## volatile.acidity -0.023879527 -0.030150812 -0.076981950 -0.114112895
## citric.acid 0.007599157 -0.030557711 0.065059398 0.110713988
## residual.sugar 0.016363984 -0.056828630 -0.085053352 -0.113251438
## chlorides -0.019650869 -0.064028210 -0.062913065 -0.022412586
## free.sulfur.dioxide 0.022981834 -0.008191289 -0.033871660 0.028290154
## total.sulfur.dioxide 0.027481889 -0.007707702 -0.028422006 0.007788352
## density -0.009450230 -0.112723400 -0.139103179 -0.151879381
## pH -0.012022122 0.023718201 -0.049743220 0.020825387
## sulphates -0.018805850 -0.041064559 -0.026635901 0.081985362
## alcohol -0.001315469 0.086259050 0.127482447 0.146372047
## color -0.031316085 -0.045414856 -0.053380629 -0.050590388
## [,5] [,6] [,7] [,8]
## fixed.acidity 0.01238323 0.050098215 0.01206224 -0.035519767
## volatile.acidity -0.14721930 -0.062114216 -0.09526155 -0.069449327
## citric.acid 0.14633680 0.111483650 0.06362073 0.113480150
## residual.sugar -0.03387647 -0.001317633 0.04217083 0.110987271
## chlorides -0.16512572 -0.193574786 -0.19175078 -0.124765570
## free.sulfur.dioxide -0.01012527 0.063852530 0.01500800 -0.008908081
## total.sulfur.dioxide -0.02298625 0.008413455 -0.01049935 -0.024466402
## density -0.08620532 -0.093615087 -0.09631981 -0.094980798
## pH 0.11813915 0.066363567 0.01287539 0.027783616
## sulphates 0.10647432 0.143058084 0.22144387 0.188428072
## alcohol 0.19387817 0.282918950 0.28516302 0.344895806
## color -0.03349278 -0.014329103 -0.03657424 -0.042864472
## [,9] [,10]
## fixed.acidity 0.025527358 0.03244886
## volatile.acidity -0.270949978 -0.27268692
## citric.acid -0.009181439 -0.01389809
## residual.sugar 0.161773615 0.16173608
## chlorides -0.037863332 -0.03377105
## free.sulfur.dioxide 0.134673508 0.12975061
## total.sulfur.dioxide -0.155774423 -0.14876906
## density -0.069344587 -0.06841929
## pH 0.034573950 0.03832698
## sulphates 0.112940367 0.11227417
## alcohol 0.418720635 0.42217826
## color 0.041715172 0.03659735
plot(pcr_fit)  # predicted vs. measured quality
set.seed(1234)
index <- sample(1:nrow(wine_df), size=.2*nrow(wine_df))
wine_train <- wine_df[-index,]
wine_test <- wine_df[index,]
wine_test_y <- wine_test$quality
wine_test_x <- wine_test[,-13]
pcr_cv <- pcr(formula = quality ~ ., data = wine_train, scale = FALSE, validation = "CV")
summary(pcr_cv)
## Data: X dimension: 5198 12
## Y dimension: 5198 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.9972 0.9905 0.9417 0.9212 0.9025 0.8851 0.8737
## adjCV 0.9972 0.9905 0.9416 0.9212 0.9024 0.8850 0.8735
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 0.8693 0.8635 0.8360 0.8361 0.8359 0.8346
## adjCV 0.8694 0.8634 0.8358 0.8359 0.8358 0.8344
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.552 52.63 65.63 73.83 79.93 85.09 89.57
## quality 1.371 10.83 14.76 18.18 21.45 23.50 24.30
## 8 comps 9 comps 10 comps 11 comps 12 comps
## X 93.74 96.64 98.79 99.79 100.00
## quality 25.33 30.14 30.15 30.20 30.51
The optimal number of principal components is probably 9: 96.64 percent of the variation in X is explained (the key idea being that a small number of principal components is often enough to explain most of the variability in the data), while the CV RMSEP is 0.8360, essentially at its minimum.
validationplot(pcr_cv)
8 components explain about 94% of the variability in X and could also be taken as the optimal number, though 9 components explain more without a big change in the CV error.
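pls also provides selectNcomp() to automate this choice from the CV curve (a sketch using the "onesigma" heuristic; its answer may be more parsimonious than the eyeballed 9):
# Smallest ncomp whose CV error is within one standard error of the minimum
selectNcomp(pcr_cv, method = "onesigma", plot = FALSE)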
pcr_all <- predict(pcr_cv, wine_test_x, ncomp=12)
mean((pcr_all-wine_test_y)^2)
## [1] 0.7551776
pcr_9 <- predict(pcr_cv, wine_test_x, ncomp=9)
mean((pcr_9-wine_test_y)^2)
## [1] 0.76262
pcr_8 <- predict(pcr_cv, wine_test_x, ncomp=8)
mean((pcr_8-wine_test_y)^2)
## [1] 0.8037395
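The same comparison can be run over every possible number of components in one sweep (a minimal sketch; the values depend on the seeded train/test split above):
# Test MSE as a function of the number of components retained
test_mse <- sapply(1:12, function(k)
  mean((predict(pcr_cv, wine_test_x, ncomp = k) - wine_test_y)^2))
round(test_mse, 4)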
THE END.