data("BostonHousing")
###Boston Housing dataset has 506 observations and 14 variables
data <- BostonHousing
str(data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(222)
view(data)
ind <- sample(2, nrow(data), replace = T, prob = c(0.7, 0.3))
train_data <- data[ind==1,]
test_data <- data[ind==2,]
view(test_data)
view(train_data)
x <- data.matrix(train_data[,1:13])
y <- train_data$medv
l_ridge <- glmnet(x, y, family="gaussian", alpha=0)
plot(l_ridge, xvar = 'lambda', label=T)
cv_out_ridge = cv.glmnet(x, y, alpha =0)
plot (cv_out_ridge)
names(cv_out_ridge)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "call" "name" "glmnet.fit" "lambda.min"
## [11] "lambda.1se"
lambda_min <- cv_out_ridge$lambda.min
lambda_min
## [1] 0.6609873
lambda_1se<- cv_out_ridge$lambda.1se
lambda_1se
## [1] 4.663135
l_ridge <- glmnet(x, y, family="gaussian", alpha=0)
plot(l_ridge, xvar = 'lambda', label=T)
cv_out_ridge = cv.glmnet(x, y, alpha =0)
plot (cv_out_ridge)
names(cv_out_ridge)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "call" "name" "glmnet.fit" "lambda.min"
## [11] "lambda.1se"
lambda_min <- cv_out_ridge$lambda.min
lambda_min
## [1] 0.6609873
lambda_1se<- cv_out_ridge$lambda.1se
lambda_1se
## [1] 4.663135
plot(l_ridge, xvar = 'lambda', label=T)
abline(v = log(cv_out_ridge$lambda.1se), col = "red", lty = "dashed")
abline(v = log(cv_out_ridge$lambda.min), col = "blue", lty = "dashed")
# set lambda to one of these values and build the model
l_ridge_final <- glmnet(x, y, family="gaussian", lambda = lambda_1se, alpha=0)
coef(l_ridge_final)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 16.932104599
## crim -0.088883221
## zn 0.021173911
## indus -0.076004871
## chas 1.597809301
## nox -3.808180660
## rm 3.864271100
## age -0.016786220
## dis -0.413660680
## rad 0.003558169
## tax -0.003711038
## ptratio -0.658480768
## b 0.006195020
## lstat -0.270391027
plot(coef(l_ridge_final))
# alternate plot of the coeffs
coef(l_ridge_final) %>%
tidy() %>%
filter(row != "(Intercept)") %>%
top_n(25, wt = abs(value)) %>%
ggplot(aes(value, reorder(row, value))) +
geom_point() +
ggtitle("Top 25 influential variables") +
xlab("Coefficient") +
ylab(NULL)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")
p1 <- predict(l_ridge_final, x) #remember to use x and not train_data
rmse_l_ridge_final <- sqrt(mean((train_data$medv-p1)^2))
rmse_l_ridge_final
## [1] 4.590626
cv_out_lasso = cv.glmnet(x, y, alpha = 1)
plot (cv_out_lasso)
names(cv_out_lasso)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "call" "name" "glmnet.fit" "lambda.min"
## [11] "lambda.1se"
# two lambda values may be noted. 'lambda.min', 'lambda.1se'- lambda for error within 1 standard deviation
lambda_min <- cv_out_lasso$lambda.min
lambda_min
## [1] 0.02267497
lambda_1se<- cv_out_lasso$lambda.1se
lambda_1se
## [1] 0.3367161
l_lasso <- glmnet(x, y, family="gaussian", alpha=1)
plot(l_lasso, xvar = 'lambda', label=T)
abline(v = log(cv_out_lasso$lambda.1se), col = "red", lty = "dashed")
abline(v = log(cv_out_lasso$lambda.min), col = "blue", lty = "dashed")
l_lasso_final <- glmnet(x, y, family="gaussian", lambda = lambda_1se, alpha=1)
coef(l_lasso_final)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 9.317372353
## crim -0.083088453
## zn .
## indus .
## chas 0.734904190
## nox -2.571132768
## rm 5.374982341
## age -0.007170991
## dis -0.232202852
## rad .
## tax -0.002333689
## ptratio -0.813096482
## b 0.005645356
## lstat -0.356054885
plot(coef(l_lasso_final))