Check out the following dataset and its description. We will compare OLS, Ridge, and Lasso regression for predicting house prices (medv).
#Boston Housing Data
library(mlbench)
data(BostonHousing)
?BostonHousing
## starting httpd help server ... done
data <- BostonHousing
str(data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
head(data)
## crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(data)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio b lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
# 1. We want to build a prediction model for medv as a function of all of the
# other variables
# 70-30 split
# divide the data into training and test sets
set.seed(100)
index = sample(1:nrow(data), 0.7*nrow(data))
train = data[index,] # Create the training data
test = data[-index,] # Create the test data
dim(train)
## [1] 354 14
dim(test)
## [1] 152 14
# linear model
lin_mod = lm(medv ~ ., data = train )
model_summ <- summary(lin_mod)
model_summ
##
## Call:
## lm(formula = medv ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8674 -2.5849 -0.3951 1.7570 24.8739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.822004 5.788951 6.188 1.75e-09 ***
## crim -0.121715 0.030950 -3.933 0.000102 ***
## zn 0.049343 0.014799 3.334 0.000950 ***
## indus 0.034271 0.065892 0.520 0.603332
## chas1 2.812441 0.981354 2.866 0.004417 **
## nox -11.527333 4.080098 -2.825 0.005003 **
## rm 3.567397 0.477861 7.465 7.04e-13 ***
## age -0.011754 0.014923 -0.788 0.431430
## dis -1.470180 0.221921 -6.625 1.36e-10 ***
## rad 0.269251 0.069304 3.885 0.000123 ***
## tax -0.010241 0.003965 -2.583 0.010219 *
## ptratio -0.926630 0.145096 -6.386 5.58e-10 ***
## b 0.008313 0.002788 2.982 0.003074 **
## lstat -0.640238 0.057700 -11.096 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.267 on 340 degrees of freedom
## Multiple R-squared: 0.7787, Adjusted R-squared: 0.7703
## F-statistic: 92.04 on 13 and 340 DF, p-value: < 2.2e-16
# calculate the in-sample (training) MSE:
MSE_OLS <- mean(model_summ$residuals^2)
MSE_OLS
## [1] 17.48866
library(ggplot2)
y_predict_ols <- predict(lin_mod, newdata = test) # predictions on the test set
df <- data.frame(y = test$medv, y_predict_ols = y_predict_ols)
ols <- ggplot(df, aes(x=y, y=y_predict_ols)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
ols
## `geom_smooth()` using formula = 'y ~ x'
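Note that MSE_OLS above is an in-sample error computed from the training residuals, whereas the Ridge, Lasso, and tree-based models below are evaluated on the test set. For a like-for-like comparison, the out-of-sample MSE of the OLS model can be computed as well; this is only a sketch (MSE_OLS_test is an illustrative name) and its value is not reproduced here.
# out-of-sample (test-set) MSE of the OLS model, comparable to the MSEs below
ols_test_pred <- predict(lin_mod, newdata = test)
MSE_OLS_test <- mean((ols_test_pred - test$medv)^2)
MSE_OLS_test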
x_train <- data.matrix(train[,-14]) # all variables except the dependent variable
y_train <- train$medv
x_test <- data.matrix(test[,-14]) # all variables except the dependent variable
y_test <- test$medv
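A small caveat on the design matrices: data.matrix() converts the factor chas into its underlying integer codes (1/2). An alternative that produces a proper 0/1 dummy variable is model.matrix(); the sketch below shows that variant, but it is not what was used for the results that follow.
# alternative design matrices with a 0/1 dummy for chas (intercept column dropped)
x_train_mm <- model.matrix(medv ~ ., data = train)[, -1]
x_test_mm <- model.matrix(medv ~ ., data = test)[, -1]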
# Ridge Regression
# tries to shrink coefficients but keeps all variables in the model.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
lambdas <- 10^seq(2, -3, by = -.1)
round(lambdas, 3)
## [1] 100.000 79.433 63.096 50.119 39.811 31.623 25.119 19.953 15.849
## [10] 12.589 10.000 7.943 6.310 5.012 3.981 3.162 2.512 1.995
## [19] 1.585 1.259 1.000 0.794 0.631 0.501 0.398 0.316 0.251
## [28] 0.200 0.158 0.126 0.100 0.079 0.063 0.050 0.040 0.032
## [37] 0.025 0.020 0.016 0.013 0.010 0.008 0.006 0.005 0.004
## [46] 0.003 0.003 0.002 0.002 0.001 0.001
# nlambda: determines the number of regularization parameters to be tested
# alpha: the elastic-net mixing parameter; for ridge regression, alpha is zero
# family: determines the distribution family to be used; since this is a
#         regression model, the (default) Gaussian family is used
# lambda: determines the lambda values to be tried
ridge_mod = glmnet(x_train, y_train, alpha = 0, lambda = lambdas)
cv_fit <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambdas)
opt_lambda_R <- cv_fit$lambda.min
opt_lambda_R
## [1] 0.1258925
plot(cv_fit)
ridge_pred <- predict(ridge_mod, newx = x_test, s = opt_lambda_R)
MSE_Ridge <- mean((ridge_pred - y_test)^2)
MSE_Ridge
## [1] 33.98975
df <- data.frame(cbind(y_test,ridge_pred))
ridge <- ggplot(df, aes(x=y_test, y=ridge_pred)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
ridge
## `geom_smooth()` using formula = 'y ~ x'
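To see that ridge regression indeed keeps every predictor in the model while shrinking the coefficients, the coefficients at the selected lambda can be inspected; a minimal sketch, with the values not reproduced here.
# ridge coefficients at the optimal lambda: all predictors remain non-zero
coef(ridge_mod, s = opt_lambda_R)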
lambdas <- 10^seq(2, -3, by = -.1)
# Setting alpha = 1 implements lasso regression
lasso_mod = glmnet(x_train, y_train, alpha = 1, lambda = lambdas)
cv_fit_L <- cv.glmnet(x_train, y_train, alpha = 1, lambda = lambdas)
opt_lambda_L <- cv_fit_L$lambda.min
opt_lambda_L
## [1] 0.007943282
plot(cv_fit_L)
lasso_pred <- predict(lasso_mod, newx = x_test, s = opt_lambda_L)
MSE_Lasso <- mean((lasso_pred - y_test)^2)
MSE_Lasso
## [1] 34.14267
library(ggplot2)
df <- data.frame(cbind(y_test,lasso_pred))
lasso <- ggplot(df, aes(x=y_test, y=lasso_pred)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
lasso
## `geom_smooth()` using formula = 'y ~ x'
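In contrast to ridge, the lasso can set coefficients exactly to zero and thereby performs variable selection. The coefficients at the selected lambda show which predictors (if any, at this small lambda) are dropped; again only a sketch, values not reproduced here.
# lasso coefficients at the optimal lambda: some entries may be exactly zero
coef(lasso_mod, s = opt_lambda_L)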
For this exercise, we will again use the Boston Housing data to predict the housing prices (medv).
library(mlbench)
data(BostonHousing)
?BostonHousing
data <- BostonHousing
# Decision trees
# Step 1: Divide the data into training and test data
set.seed(100) # make results reproducible
index = sample(1:nrow(data), 0.7*nrow(data))
train = data[index,] # Create the training data
test = data[-index,] # Create the test data
# Step 2: Building and plotting the tree
# use the "rpart" library for model building (recursive partitioning)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
reg_tree <- rpart(medv ~ ., data = train)
reg_tree
## n= 354
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 354 27978.6100 21.99294
## 2) lstat>=7.685 256 7168.6790 18.19336
## 4) lstat>=16.215 108 1757.6670 14.02222
## 8) dis< 1.9733 56 694.0793 11.74643 *
## 9) dis>=1.9733 52 461.2023 16.47308 *
## 5) lstat< 16.215 148 2160.8060 21.23716
## 10) rm< 6.6045 134 1268.9740 20.54776 *
## 11) rm>=6.6045 14 218.5721 27.83571 *
## 3) lstat< 7.685 98 7459.7270 31.91837
## 6) rm< 7.445 82 3321.7890 29.11951
## 12) age< 90.05 75 1672.7870 27.94667
## 24) rm< 6.7905 46 326.0646 24.98913 *
## 25) rm>=6.7905 29 306.1283 32.63793 *
## 13) age>=90.05 7 440.4686 41.68571 *
## 7) rm>=7.445 16 203.5175 46.26250 *
prp(reg_tree, digits = 4, extra = 1)
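# Step 3: Predict on the test set and compute the MSE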
tree_pred <- predict(reg_tree, test)
MSE_trees <- mean((tree_pred - y_test)^2)
MSE_trees
## [1] 40.42522
# Step 4: Pruning --------------------------------------------------------------
# The complexity measure combines the size of a tree with how well the tree
# fits the target variable. If the next best split in growing a tree does not
# improve the tree's fit by a certain amount, rpart will terminate the growing
# process.
# The cp (complexity parameter) argument is one of the parameters used to
# control the complexity of the tree: the smaller the cp value, the larger
# (more complex) the tree rpart will attempt to fit. The default is 0.01.
# (A prune() sketch based on the cross-validated cp follows further below.)
reg_tree_1 <- rpart(medv ~ ., data = train, cp = 0.001)
reg_tree_1
## n= 354
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 354 27978.61000 21.992940
## 2) lstat>=7.685 256 7168.67900 18.193360
## 4) lstat>=16.215 108 1757.66700 14.022220
## 8) dis< 1.9733 56 694.07930 11.746430
## 16) tax>=551.5 42 432.97900 10.604760
## 32) nox>=0.675 29 183.74210 9.531034
## 64) crim>=9.75613 20 80.69750 8.475000 *
## 65) crim< 9.75613 9 31.17556 11.877780 *
## 33) nox< 0.675 13 141.22000 13.000000 *
## 17) tax< 551.5 14 42.12857 15.171430 *
## 9) dis>=1.9733 52 461.20230 16.473080
## 18) crim>=0.65402 38 196.95890 15.294740
## 36) b< 364.455 23 69.35652 14.443480 *
## 37) b>=364.455 15 85.38000 16.600000 *
## 19) crim< 0.65402 14 68.26857 19.671430 *
## 5) lstat< 16.215 148 2160.80600 21.237160
## 10) rm< 6.6045 134 1268.97400 20.547760
## 20) nox>=0.5125 85 690.86400 19.760000
## 40) lstat>=14.4 21 164.89140 17.842860
## 80) rm>=6.1215 9 68.30222 15.644440 *
## 81) rm< 6.1215 12 20.46917 19.491670 *
## 41) lstat< 14.4 64 423.46230 20.389060
## 82) indus< 12.91 37 236.37680 19.581080
## 164) b< 391.225 8 22.36000 17.600000 *
## 165) b>=391.225 29 173.95790 20.127590
## 330) rm< 6.0335 18 125.34940 19.294440 *
## 331) rm>=6.0335 11 15.66909 21.490910 *
## 83) indus>=12.91 27 129.82960 21.496300 *
## 21) nox< 0.5125 49 433.86000 21.914290
## 42) dis>=4.4649 27 115.38070 20.718520
## 84) rm< 5.9295 12 22.76917 19.041670 *
## 85) rm>=5.9295 15 31.87600 22.060000 *
## 43) dis< 4.4649 22 232.49270 23.381820
## 86) ptratio>=17.9 14 40.89714 22.314290 *
## 87) ptratio< 17.9 8 147.72000 25.250000 *
## 11) rm>=6.6045 14 218.57210 27.835710 *
## 3) lstat< 7.685 98 7459.72700 31.918370
## 6) rm< 7.445 82 3321.78900 29.119510
## 12) age< 90.05 75 1672.78700 27.946670
## 24) rm< 6.7905 46 326.06460 24.989130
## 48) rm< 6.5325 28 84.19857 23.707140 *
## 49) rm>=6.5325 18 124.26500 26.983330 *
## 25) rm>=6.7905 29 306.12830 32.637930
## 50) rm< 6.9125 8 65.12000 30.100000 *
## 51) rm>=6.9125 21 169.84950 33.604760
## 102) zn< 27.5 7 76.99714 31.842860 *
## 103) zn>=27.5 14 60.25714 34.485710 *
## 13) age>=90.05 7 440.46860 41.685710 *
## 7) rm>=7.445 16 203.51750 46.262500 *
prp(reg_tree_1, digits = 4, extra = 1)
tree_pred_1 <- predict(reg_tree_1, test)
MSE_trees_1 <- mean((tree_pred_1 - y_test)^2)
MSE_trees_1
## [1] 35.68706
The MSE of the OLS model was 17.4887, which is lower than the MSE of the decision trees; note, however, that the OLS MSE was computed on the training residuals, whereas the tree MSEs are test-set errors, so this comparison favors OLS.
Diff_MSE <- MSE_trees_1-MSE_OLS
Diff_MSE
## [1] 18.1984
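rpart also performs 10-fold cross-validation while growing the tree, so instead of fixing cp by hand, the cp value with the smallest cross-validated error can be read from the cp table and used to prune the large tree back. The sketch below illustrates this; best_cp and pruned_tree are illustrative names and the resulting MSE is not reported here.
# choose cp by cross-validated error and prune the large tree back
printcp(reg_tree_1) # cp table with the xerror column
best_cp <- reg_tree_1$cptable[which.min(reg_tree_1$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(reg_tree_1, cp = best_cp)
pruned_pred <- predict(pruned_tree, test)
mean((pruned_pred - test$medv)^2) # test-set MSE of the pruned tree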
# Variable importance
plot(reg_tree$variable.importance, xlab="variable",
     ylab="Importance", xaxt = "n", pch=20)
axis(1, at=seq_along(reg_tree$variable.importance),
     labels=names(reg_tree$variable.importance), las=2, cex.axis=0.7)
# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# mtry: Number of variables randomly sampled as candidates at each split.
# ntree: Number of trees to grow.
# proximity: if proximity=TRUE when randomForest is called, a matrix of
#            proximity measures among the inputs is returned (based on the
#            frequency that pairs of data points end up in the same terminal nodes).
random_forest <- randomForest(medv ~ ., ntree=500, mtry=2, data = train, proximity=TRUE)
random_forest
##
## Call:
## randomForest(formula = medv ~ ., data = train, ntree = 500, mtry = 2, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 11.52723
## % Var explained: 85.42
RF_pred <- predict(random_forest, test)
# Calculate the mean squared error (MSE)
MSE_RF <- mean((RF_pred - test$medv)^2)
MSE_RF
## [1] 22.49241
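Above, mtry was fixed at 2 by hand. The randomForest package also provides tuneRF(), which searches over mtry using the out-of-bag error; a minimal sketch under largely default settings, with the selected value not reported here.
# search for a good mtry based on the OOB error (regression default starts at p/3)
set.seed(100)
tuned_mtry <- tuneRF(train[, -14], train$medv, ntreeTry = 500,
                     stepFactor = 1.5, improve = 0.01, trace = TRUE, plot = TRUE)
tuned_mtry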
# Variable importance
varImpPlot(random_forest, sort = T, n.var = 10, main = "Top 10 - Variable Importance")
importance(random_forest)
## IncNodePurity
## crim 2053.1236
## zn 947.4067
## indus 2400.8780
## chas 250.2841
## nox 2142.1948
## rm 5679.1174
## age 1145.1261
## dis 1579.3465
## rad 615.9872
## tax 1301.0040
## ptratio 1939.3532
## b 820.8560
## lstat 5684.7071
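Finally, the test-set MSEs computed above can be collected into a single table for an at-a-glance comparison; this is only a sketch using the objects defined above, and the resulting table is not shown.
# gather the test-set MSEs of all fitted models into one data frame
results <- data.frame(
  model = c("Ridge", "Lasso", "Tree (default cp)", "Tree (cp = 0.001)", "Random forest"),
  test_MSE = c(MSE_Ridge, MSE_Lasso, MSE_trees, MSE_trees_1, MSE_RF)
)
results[order(results$test_MSE), ]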