Check out the following dataset and its description. We will compare OLS, Ridge, and Lasso regression for predicting house prices (medv).
#Boston Housing Data
library(mlbench)
data(BostonHousing)
?BostonHousing
## starting httpd help server ... done
data <- BostonHousing
str(data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
head(data)
## crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(data)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio b lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
# 1. We want to build a prediction model for medv as a function of all of the
# other variables
# 70-30 split
# divide the data into training and test sets
set.seed(100)
index = sample(1:nrow(data), 0.7*nrow(data))
train = data[index,] # Create the training data
test = data[-index,] # Create the test data
dim(train)
## [1] 354 14
dim(test)
## [1] 152 14
# linear model
lin_mod = lm(medv ~ ., data = train )
model_summ <- summary(lin_mod)
model_summ
##
## Call:
## lm(formula = medv ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8674 -2.5849 -0.3951 1.7570 24.8739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.822004 5.788951 6.188 1.75e-09 ***
## crim -0.121715 0.030950 -3.933 0.000102 ***
## zn 0.049343 0.014799 3.334 0.000950 ***
## indus 0.034271 0.065892 0.520 0.603332
## chas1 2.812441 0.981354 2.866 0.004417 **
## nox -11.527333 4.080098 -2.825 0.005003 **
## rm 3.567397 0.477861 7.465 7.04e-13 ***
## age -0.011754 0.014923 -0.788 0.431430
## dis -1.470180 0.221921 -6.625 1.36e-10 ***
## rad 0.269251 0.069304 3.885 0.000123 ***
## tax -0.010241 0.003965 -2.583 0.010219 *
## ptratio -0.926630 0.145096 -6.386 5.58e-10 ***
## b 0.008313 0.002788 2.982 0.003074 **
## lstat -0.640238 0.057700 -11.096 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.267 on 340 degrees of freedom
## Multiple R-squared: 0.7787, Adjusted R-squared: 0.7703
## F-statistic: 92.04 on 13 and 340 DF, p-value: < 2.2e-16
# calculate the in-sample (training) MSE:
MSE_OLS <- mean(model_summ$residuals^2)
MSE_OLS
## [1] 17.48866
library(ggplot2)
y_predict_ols <- predict(lin_mod, newdata = test) # predictions on the test set
df <- data.frame(y = test$medv, y_predict_ols = y_predict_ols)
ols <- ggplot(df, aes(x=y, y=y_predict_ols)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
ols
## `geom_smooth()` using formula = 'y ~ x'
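Note that MSE_OLS above is an in-sample error computed from the training residuals, whereas the Ridge, Lasso, and tree-based models below are evaluated on the test set. For a like-for-like comparison, the out-of-sample MSE of the OLS model can be computed as well; this is only a sketch (MSE_OLS_test is an illustrative name) and its value is not reproduced here.
# out-of-sample (test-set) MSE of the OLS model, comparable to the MSEs below
ols_test_pred <- predict(lin_mod, newdata = test)
MSE_OLS_test <- mean((ols_test_pred - test$medv)^2)
MSE_OLS_test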
x_train <- data.matrix(train[,-14]) # all variables except the dependent variable
y_train <- train$medv
x_test <- data.matrix(test[,-14]) # all variables except the dependent variable
y_test <- test$medv
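A small caveat on the design matrices: data.matrix() converts the factor chas into its underlying integer codes (1/2). An alternative that produces a proper 0/1 dummy variable is model.matrix(); the sketch below shows that variant, but it is not what was used for the results that follow.
# alternative design matrices with a 0/1 dummy for chas (intercept column dropped)
x_train_mm <- model.matrix(medv ~ ., data = train)[, -1]
x_test_mm <- model.matrix(medv ~ ., data = test)[, -1]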
# Ridge Regression
# tries to shrink coefficients but keeps all variables in the model.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
lambdas <- 10^seq(2, -3, by = -.1)
round(lambdas, 3)
## [1] 100.000 79.433 63.096 50.119 39.811 31.623 25.119 19.953 15.849
## [10] 12.589 10.000 7.943 6.310 5.012 3.981 3.162 2.512 1.995
## [19] 1.585 1.259 1.000 0.794 0.631 0.501 0.398 0.316 0.251
## [28] 0.200 0.158 0.126 0.100 0.079 0.063 0.050 0.040 0.032
## [37] 0.025 0.020 0.016 0.013 0.010 0.008 0.006 0.005 0.004
## [46] 0.003 0.003 0.002 0.002 0.001 0.001
# nlambda: determines the number of regularization parameters to be tested
# alpha: the elastic-net mixing parameter; for ridge regression, alpha is zero
# family: determines the distribution family to be used; since this is a
#         regression model, the (default) Gaussian family is used
# lambda: determines the lambda values to be tried
ridge_mod = glmnet(x_train, y_train, alpha = 0, lambda = lambdas)
cv_fit <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambdas)
opt_lambda_R <- cv_fit$lambda.min
opt_lambda_R
## [1] 0.1258925
plot(cv_fit)
ridge_pred <- predict(ridge_mod, newx = x_test, s = opt_lambda_R)
MSE_Ridge <- mean((ridge_pred - y_test)^2)
MSE_Ridge
## [1] 33.98975
df <- data.frame(cbind(y_test,ridge_pred))
ridge <- ggplot(df, aes(x=y_test, y=ridge_pred)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
ridge
## `geom_smooth()` using formula = 'y ~ x'
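To see that ridge regression indeed keeps every predictor in the model while shrinking the coefficients, the coefficients at the selected lambda can be inspected; a minimal sketch, with the values not reproduced here.
# ridge coefficients at the optimal lambda: all predictors remain non-zero
coef(ridge_mod, s = opt_lambda_R)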
lambdas <- 10^seq(2, -3, by = -.1)
# Setting alpha = 1 implements lasso regression
lasso_mod = glmnet(x_train, y_train, alpha = 1, lambda = lambdas)
cv_fit_L <- cv.glmnet(x_train, y_train, alpha = 1, lambda = lambdas)
opt_lambda_L <- cv_fit_L$lambda.min
opt_lambda_L
## [1] 0.007943282
plot(cv_fit_L)
lasso_pred <- predict(lasso_mod, newx = x_test, s = opt_lambda_L)
MSE_Lasso <- mean((lasso_pred - y_test)^2)
MSE_Lasso
## [1] 34.14267
library(ggplot2)
df <- data.frame(cbind(y_test,lasso_pred))
lasso <- ggplot(df, aes(x=y_test, y=lasso_pred)) +
geom_point() +
geom_smooth(method=lm, se=FALSE)
lasso
## `geom_smooth()` using formula = 'y ~ x'
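In contrast to ridge, the lasso can set coefficients exactly to zero and thereby performs variable selection. The coefficients at the selected lambda show which predictors (if any, at this small lambda) are dropped; again only a sketch, values not reproduced here.
# lasso coefficients at the optimal lambda: some entries may be exactly zero
coef(lasso_mod, s = opt_lambda_L)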
For this exercise, we will again use the Boston Housing data to predict the housing prices (medv).
library(mlbench)
data(BostonHousing)
?BostonHousing
data <- BostonHousing
# Decision trees
# Step 1: Divide the data into training and test data
set.seed(100) # make results reproducible
index = sample(1:nrow(data), 0.7*nrow(data))
train = data[index,] # Create the training data
test = data[-index,] # Create the test data
# Step 2: Building and plotting the tree
# use the "rpart" library for model building (recursive partitioning)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
reg_tree <- rpart(medv ~ ., data = train)
reg_tree
## n= 354
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 354 27978.6100 21.99294
## 2) lstat>=7.685 256 7168.6790 18.19336
## 4) lstat>=16.215 108 1757.6670 14.02222
## 8) dis< 1.9733 56 694.0793 11.74643 *
## 9) dis>=1.9733 52 461.2023 16.47308 *
## 5) lstat< 16.215 148 2160.8060 21.23716
## 10) rm< 6.6045 134 1268.9740 20.54776 *
## 11) rm>=6.6045 14 218.5721 27.83571 *
## 3) lstat< 7.685 98 7459.7270 31.91837
## 6) rm< 7.445 82 3321.7890 29.11951
## 12) age< 90.05 75 1672.7870 27.94667
## 24) rm< 6.7905 46 326.0646 24.98913 *
## 25) rm>=6.7905 29 306.1283 32.63793 *
## 13) age>=90.05 7 440.4686 41.68571 *
## 7) rm>=7.445 16 203.5175 46.26250 *
prp(reg_tree, digits = 4, extra = 1)
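# Step 3: Predict on the test set and compute the MSE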
tree_pred <- predict(reg_tree, test)
MSE_trees <- mean((tree_pred - y_test)^2)
MSE_trees
## [1] 40.42522
# Step 4: Pruning --------------------------------------------------------------
# The complexity measure combines the size of a tree with how well the tree
# fits the target variable. If the next best split in growing a tree does not
# improve the tree's fit by a certain amount, rpart will terminate the growing
# process.
# The cp (complexity parameter) argument is one of the parameters used to
# control the complexity of the tree: the smaller the cp value, the larger
# (more complex) the tree rpart will attempt to fit. The default is 0.01.
# (A prune() sketch based on the cross-validated cp follows further below.)
reg_tree_1 <- rpart(medv ~ ., data = train, cp = 0.001)
reg_tree_1
## n= 354
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 354 27978.61000 21.992940
## 2) lstat>=7.685 256 7168.67900 18.193360
## 4) lstat>=16.215 108 1757.66700 14.022220
## 8) dis< 1.9733 56 694.07930 11.746430
## 16) tax>=551.5 42 432.97900 10.604760
## 32) nox>=0.675 29 183.74210 9.531034
## 64) crim>=9.75613 20 80.69750 8.475000 *
## 65) crim< 9.75613 9 31.17556 11.877780 *
## 33) nox< 0.675 13 141.22000 13.000000 *
## 17) tax< 551.5 14 42.12857 15.171430 *
## 9) dis>=1.9733 52 461.20230 16.473080
## 18) crim>=0.65402 38 196.95890 15.294740
## 36) b< 364.455 23 69.35652 14.443480 *
## 37) b>=364.455 15 85.38000 16.600000 *
## 19) crim< 0.65402 14 68.26857 19.671430 *
## 5) lstat< 16.215 148 2160.80600 21.237160
## 10) rm< 6.6045 134 1268.97400 20.547760
## 20) nox>=0.5125 85 690.86400 19.760000
## 40) lstat>=14.4 21 164.89140 17.842860
## 80) rm>=6.1215 9 68.30222 15.644440 *
## 81) rm< 6.1215 12 20.46917 19.491670 *
## 41) lstat< 14.4 64 423.46230 20.389060
## 82) indus< 12.91 37 236.37680 19.581080
## 164) b< 391.225 8 22.36000 17.600000 *
## 165) b>=391.225 29 173.95790 20.127590
## 330) rm< 6.0335 18 125.34940 19.294440 *
## 331) rm>=6.0335 11 15.66909 21.490910 *
## 83) indus>=12.91 27 129.82960 21.496300 *
## 21) nox< 0.5125 49 433.86000 21.914290
## 42) dis>=4.4649 27 115.38070 20.718520
## 84) rm< 5.9295 12 22.76917 19.041670 *
## 85) rm>=5.9295 15 31.87600 22.060000 *
## 43) dis< 4.4649 22 232.49270 23.381820
## 86) ptratio>=17.9 14 40.89714 22.314290 *
## 87) ptratio< 17.9 8 147.72000 25.250000 *
## 11) rm>=6.6045 14 218.57210 27.835710 *
## 3) lstat< 7.685 98 7459.72700 31.918370
## 6) rm< 7.445 82 3321.78900 29.119510
## 12) age< 90.05 75 1672.78700 27.946670
## 24) rm< 6.7905 46 326.06460 24.989130
## 48) rm< 6.5325 28 84.19857 23.707140 *
## 49) rm>=6.5325 18 124.26500 26.983330 *
## 25) rm>=6.7905 29 306.12830 32.637930
## 50) rm< 6.9125 8 65.12000 30.100000 *
## 51) rm>=6.9125 21 169.84950 33.604760
## 102) zn< 27.5 7 76.99714 31.842860 *
## 103) zn>=27.5 14 60.25714 34.485710 *
## 13) age>=90.05 7 440.46860 41.685710 *
## 7) rm>=7.445 16 203.51750 46.262500 *
prp(reg_tree_1, digits = 4, extra = 1)
tree_pred_1 <- predict(reg_tree_1, test)
MSE_trees_1 <- mean((tree_pred_1 - y_test)^2)
MSE_trees_1
## [1] 35.68706
The MSE of the OLS model was 17.4887, which is lower than the MSE of the decision trees; note, however, that the OLS MSE was computed on the training residuals, whereas the tree MSEs are test-set errors, so this comparison favors OLS.
Diff_MSE <- MSE_trees_1-MSE_OLS
Diff_MSE
## [1] 18.1984
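rpart also performs 10-fold cross-validation while growing the tree, so instead of fixing cp by hand, the cp value with the smallest cross-validated error can be read from the cp table and used to prune the large tree back. The sketch below illustrates this; best_cp and pruned_tree are illustrative names and the resulting MSE is not reported here.
# choose cp by cross-validated error and prune the large tree back
printcp(reg_tree_1) # cp table with the xerror column
best_cp <- reg_tree_1$cptable[which.min(reg_tree_1$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(reg_tree_1, cp = best_cp)
pruned_pred <- predict(pruned_tree, test)
mean((pruned_pred - test$medv)^2) # test-set MSE of the pruned tree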
# Variable importance
plot(reg_tree$variable.importance, xlab="variable",
     ylab="Importance", xaxt = "n", pch=20)
axis(1, at=seq_along(reg_tree$variable.importance),
     labels=names(reg_tree$variable.importance), las=2, cex.axis=0.7)
# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# mtry: Number of variables randomly sampled as candidates at each split.
# ntree: Number of trees to grow.
# proximity: if proximity=TRUE when randomForest is called, a matrix of
#            proximity measures among the inputs is returned (based on the
#            frequency that pairs of data points end up in the same terminal nodes).
random_forest <- randomForest(medv ~ ., ntree=500, mtry=2, data = train, proximity=TRUE)
random_forest
##
## Call:
## randomForest(formula = medv ~ ., data = train, ntree = 500, mtry = 2, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 11.52723
## % Var explained: 85.42
RF_pred <- predict(random_forest, test)
# Calculate the mean squared error (MSE)
MSE_RF <- mean((RF_pred - test$medv)^2)
MSE_RF
## [1] 22.49241
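Above, mtry was fixed at 2 by hand. The randomForest package also provides tuneRF(), which searches over mtry using the out-of-bag error; a minimal sketch under largely default settings, with the selected value not reported here.
# search for a good mtry based on the OOB error (regression default starts at p/3)
set.seed(100)
tuned_mtry <- tuneRF(train[, -14], train$medv, ntreeTry = 500,
                     stepFactor = 1.5, improve = 0.01, trace = TRUE, plot = TRUE)
tuned_mtry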
# Variable importance
varImpPlot(random_forest, sort = T, n.var = 10, main = "Top 10 - Variable Importance")
importance(random_forest)
## IncNodePurity
## crim 2053.1236
## zn 947.4067
## indus 2400.8780
## chas 250.2841
## nox 2142.1948
## rm 5679.1174
## age 1145.1261
## dis 1579.3465
## rad 615.9872
## tax 1301.0040
## ptratio 1939.3532
## b 820.8560
## lstat 5684.7071
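Finally, the test-set MSEs computed above can be collected into a single table for an at-a-glance comparison; this is only a sketch using the objects defined above, and the resulting table is not shown.
# gather the test-set MSEs of all fitted models into one data frame
results <- data.frame(
  model = c("Ridge", "Lasso", "Tree (default cp)", "Tree (cp = 0.001)", "Random forest"),
  test_MSE = c(MSE_Ridge, MSE_Lasso, MSE_trees, MSE_trees_1, MSE_RF)
)
results[order(results$test_MSE), ]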