# Cleaning environment and console
rm(list = ls())
cat("\014")
# Calling packages
library(rpart)
library(rpart.plot)
library(forecast)
library(pander)
library(dummies)
library(caret)
library(neuralnet)
library(ellipse)
library(corrplot)
# Importing data
house <- read.csv("BostonHousing.csv", sep = ",", header = TRUE)
attach(house)
str(house)
## 'data.frame': 506 obs. of 14 variables:
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RM : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : int 296 242 242 222 222 222 311 311 311 311 ...
## $ PTRATIO : num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ LSTAT : num 4.98 9.14 4.03 2.94 5.33 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ CAT..MEDV: int 0 0 1 1 1 0 0 0 0 0 ...
pander(head(house))
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX |
|---|---|---|---|---|---|---|---|---|---|
| 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.09 | 1 | 296 |
| 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.967 | 2 | 242 |
| 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.967 | 2 | 242 |
| 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.062 | 3 | 222 |
| 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.062 | 3 | 222 |
| 0.02985 | 0 | 2.18 | 0 | 0.458 | 6.43 | 58.7 | 6.062 | 3 | 222 |
| PTRATIO | LSTAT | MEDV | CAT..MEDV |
|---|---|---|---|
| 15.3 | 4.98 | 24 | 0 |
| 17.8 | 9.14 | 21.6 | 0 |
| 17.8 | 4.03 | 34.7 | 1 |
| 18.7 | 2.94 | 33.4 | 1 |
| 18.7 | 5.33 | 36.2 | 1 |
| 18.7 | 5.21 | 28.7 | 0 |
# Partitioning (60%-40%)
house_train.obs <- sample(rownames(house), dim(house)[1]*0.6)
house_train.set <- house[house_train.obs,]
house_valid.obs <- setdiff(rownames(house), house_train.obs)
house_valid.set <- house[house_valid.obs,]
Build our model : we need data for this, so we use only a part of the total dataset, which we call “training set”. During this phase, we are “tuning” the parameters and adjusting things so that we can explain the training set correctly.
Verify that the model is good at predicting “new” data : if we use, to test the model, the same data which was used to build it, we will find that the predictions are almost perfect. However, if we apply the model to new and unseen data, then most probably the predictions won’t be as good. This is why we must keep one partition of the entire dataset to be used for validation (even sometimes we keep a third part called “test set”).
# Running the regression :
MLR <- lm(MEDV ~ CRIM + CHAS + RM, data = house_train.set)
summary(MLR)
##
## Call:
## lm(formula = MEDV ~ CRIM + CHAS + RM, data = house_train.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0751 -2.8530 -0.1695 2.9024 25.3980
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31.16106 3.30130 -9.439 < 2e-16 ***
## CRIM -0.31484 0.04307 -7.310 2.46e-12 ***
## CHAS 9.99984 1.53024 6.535 2.74e-10 ***
## RM 8.57376 0.51690 16.587 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.741 on 299 degrees of freedom
## Multiple R-squared: 0.617, Adjusted R-squared: 0.6132
## F-statistic: 160.6 on 3 and 299 DF, p-value: < 2.2e-16
# Equation for predicting MEDV (using the estimated coefficients) :
equation <- -34.10504 + -0.20280*CRIM + 6.84439*CHAS + 9.08959*RM
# Adding regression outcome into the data frame :
house$Prediction <- round(equation, 4)
head(house$Prediction)
## [1] 25.6577 24.2537 31.1981 29.4973 30.8443 24.3350
1- The relationship between outcome and predictors is linear.
2- The residuals are normally distributed.
3- The residuals have mean = zero and constant variance.
pairs(house_train.set[, c(1, 4, 6, 13)])
# Log transformation :
house_train.set$log_CRIM <- log(house_train.set$CRIM)
# Ploting to see the effect :
plot(house_train.set$MEDV, house_train.set$log_CRIM)
# New model with variable tranformation :
MLR_transf <- lm(MEDV ~ log_CRIM + CHAS + RM, data = house_train.set)
summary(MLR_transf)
##
## Call:
## lm(formula = MEDV ~ log_CRIM + CHAS + RM, data = house_train.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0701 -3.4913 -0.7244 2.7162 25.6250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.6627 3.2019 -9.576 < 2e-16 ***
## log_CRIM -1.2940 0.1517 -8.530 7.42e-16 ***
## CHAS 11.4437 1.4970 7.644 2.88e-13 ***
## RM 8.1563 0.5124 15.917 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.589 on 299 degrees of freedom
## Multiple R-squared: 0.6369, Adjusted R-squared: 0.6333
## F-statistic: 174.8 on 3 and 299 DF, p-value: < 2.2e-16
plot(MLR_transf)
predicted_price = -34.3656 + -0.8609*log(0.1) + 7.3472*0 + 8.8831*6
predicted_price
## [1] 20.9153
corrplot(cor(house))
# Creating a model with no predictors :
MLR_nopred <- lm(MEDV ~ 1, data = house_train.set[, -c(1, 14)])
# We are taking away column 1 (because it corresponds to the untransformed CRIM) and column 14 (because it contains information about the OUTCOME variable MEDV)!
# Creating a FULL model (all predictors)
MLR_full <- lm(MEDV ~., data = house_train.set[, -c(1, 14)])
# Running the FORWARD selection :
MLR_forward <- step(MLR_nopred, scope = list(lower = MLR_nopred, upper = MLR_full), direction = "forward")
## Start: AIC=1347.8
## MEDV ~ 1
##
## Df Sum of Sq RSS AIC
## + LSTAT 1 14831.2 10897 1089.5
## + RM 1 12654.3 13074 1144.7
## + PTRATIO 1 6224.9 19503 1265.9
## + NOX 1 5659.2 20069 1274.5
## + INDUS 1 5603.2 20125 1275.4
## + log_CRIM 1 5410.6 20317 1278.3
## + TAX 1 5355.9 20372 1279.1
## + AGE 1 4068.4 21659 1297.7
## + RAD 1 3975.7 21752 1298.9
## + ZN 1 3601.6 22126 1304.1
## + CHAS 1 2562.7 23165 1318.0
## + DIS 1 2214.5 23513 1322.5
## <none> 25728 1347.8
##
## Step: AIC=1089.49
## MEDV ~ LSTAT
##
## Df Sum of Sq RSS AIC
## + RM 1 2592.21 8304.4 1009.2
## + CHAS 1 1308.73 9587.9 1052.7
## + PTRATIO 1 1156.73 9739.9 1057.5
## + DIS 1 348.22 10548.4 1081.7
## + AGE 1 247.91 10648.7 1084.5
## + ZN 1 96.89 10799.7 1088.8
## + TAX 1 79.26 10817.4 1089.3
## <none> 10896.6 1089.5
## + log_CRIM 1 21.44 10875.2 1090.9
## + RAD 1 18.93 10877.7 1091.0
## + NOX 1 2.57 10894.1 1091.4
## + INDUS 1 0.19 10896.4 1091.5
##
## Step: AIC=1009.18
## MEDV ~ LSTAT + RM
##
## Df Sum of Sq RSS AIC
## + CHAS 1 1126.97 7177.5 966.99
## + PTRATIO 1 831.53 7472.9 979.21
## + TAX 1 225.72 8078.7 1002.83
## + DIS 1 218.09 8086.3 1003.11
## + RAD 1 195.55 8108.9 1003.96
## + AGE 1 63.50 8240.9 1008.85
## <none> 8304.4 1009.18
## + ZN 1 11.96 8292.5 1010.74
## + INDUS 1 11.19 8293.2 1010.77
## + log_CRIM 1 6.77 8297.7 1010.93
## + NOX 1 0.14 8304.3 1011.17
##
## Step: AIC=966.99
## MEDV ~ LSTAT + RM + CHAS
##
## Df Sum of Sq RSS AIC
## + PTRATIO 1 697.22 6480.2 938.02
## + TAX 1 338.63 6838.8 954.34
## + RAD 1 302.20 6875.3 955.95
## + log_CRIM 1 74.49 7103.0 965.82
## + DIS 1 69.39 7108.1 966.04
## + NOX 1 49.78 7127.7 966.88
## + ZN 1 47.94 7129.5 966.95
## <none> 7177.5 966.99
## + INDUS 1 8.55 7168.9 968.62
## + AGE 1 0.00 7177.5 968.99
##
## Step: AIC=938.02
## MEDV ~ LSTAT + RM + CHAS + PTRATIO
##
## Df Sum of Sq RSS AIC
## + DIS 1 173.128 6307.1 931.82
## + TAX 1 94.323 6385.9 935.58
## + RAD 1 48.263 6432.0 937.76
## <none> 6480.2 938.02
## + NOX 1 38.403 6441.8 938.22
## + AGE 1 10.981 6469.3 939.51
## + INDUS 1 4.731 6475.5 939.80
## + ZN 1 3.429 6476.8 939.86
## + log_CRIM 1 1.984 6478.3 939.93
##
## Step: AIC=931.82
## MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS
##
## Df Sum of Sq RSS AIC
## + NOX 1 411.69 5895.4 913.36
## + TAX 1 233.83 6073.3 922.37
## + RAD 1 123.92 6183.2 927.80
## + log_CRIM 1 99.68 6207.4 928.99
## + INDUS 1 53.48 6253.6 931.24
## + AGE 1 49.35 6257.8 931.44
## + ZN 1 43.80 6263.3 931.71
## <none> 6307.1 931.82
##
## Step: AIC=913.36
## MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS + NOX
##
## Df Sum of Sq RSS AIC
## + TAX 1 48.708 5846.7 912.85
## + ZN 1 39.015 5856.4 913.35
## <none> 5895.4 913.36
## + AGE 1 12.092 5883.3 914.74
## + RAD 1 10.249 5885.2 914.84
## + INDUS 1 0.402 5895.0 915.34
## + log_CRIM 1 0.110 5895.3 915.36
##
## Step: AIC=912.85
## MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS + NOX + TAX
##
## Df Sum of Sq RSS AIC
## + ZN 1 70.099 5776.6 911.20
## <none> 5846.7 912.85
## + log_CRIM 1 24.753 5822.0 913.56
## + RAD 1 20.564 5826.1 913.78
## + AGE 1 18.029 5828.7 913.91
## + INDUS 1 7.822 5838.9 914.44
##
## Step: AIC=911.2
## MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS + NOX + TAX + ZN
##
## Df Sum of Sq RSS AIC
## + log_CRIM 1 56.250 5720.4 910.23
## <none> 5776.6 911.20
## + RAD 1 22.932 5753.7 911.99
## + INDUS 1 11.241 5765.4 912.61
## + AGE 1 9.117 5767.5 912.72
##
## Step: AIC=910.23
## MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS + NOX + TAX + ZN + log_CRIM
##
## Df Sum of Sq RSS AIC
## <none> 5720.4 910.23
## + AGE 1 13.4720 5706.9 911.52
## + INDUS 1 11.6571 5708.7 911.61
## + RAD 1 1.2841 5719.1 912.16
summary(MLR_forward)
##
## Call:
## lm(formula = MEDV ~ LSTAT + RM + CHAS + PTRATIO + DIS + NOX +
## TAX + ZN + log_CRIM, data = house_train.set[, -c(1, 14)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2491 -2.5433 -0.4525 2.2283 20.7290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.254689 5.952793 4.914 1.48e-06 ***
## LSTAT -0.574477 0.057239 -10.036 < 2e-16 ***
## RM 4.671773 0.496481 9.410 < 2e-16 ***
## CHAS 8.045255 1.218528 6.602 1.90e-10 ***
## PTRATIO -0.709177 0.154686 -4.585 6.74e-06 ***
## DIS -1.238169 0.230663 -5.368 1.62e-07 ***
## NOX -15.350337 4.481796 -3.425 0.000703 ***
## TAX -0.008104 0.003070 -2.640 0.008742 **
## ZN 0.037956 0.016639 2.281 0.023255 *
## log_CRIM 0.473780 0.279122 1.697 0.090685 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.419 on 293 degrees of freedom
## Multiple R-squared: 0.7777, Adjusted R-squared: 0.7708
## F-statistic: 113.9 on 9 and 293 DF, p-value: < 2.2e-16
# Running the BACKWARD selection :
MLR_backward <- step(MLR_full, direction = "backward")
## Start: AIC=914.67
## MEDV ~ ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX +
## PTRATIO + LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## - RAD 1 2.56 5693.6 912.81
## - AGE 1 12.33 5703.4 913.33
## - INDUS 1 15.65 5706.7 913.51
## - log_CRIM 1 32.71 5723.8 914.41
## <none> 5691.1 914.67
## - ZN 1 91.49 5782.6 917.51
## - TAX 1 102.54 5793.6 918.09
## - NOX 1 219.25 5910.3 924.13
## - PTRATIO 1 390.47 6081.5 932.78
## - DIS 1 474.73 6165.8 936.95
## - CHAS 1 839.65 6530.7 954.37
## - LSTAT 1 1639.58 7330.6 989.38
## - RM 1 1665.07 7356.1 990.44
##
## Step: AIC=912.81
## MEDV ~ ZN + INDUS + CHAS + NOX + RM + AGE + DIS + TAX + PTRATIO +
## LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## - INDUS 1 13.26 5706.9 911.52
## - AGE 1 15.07 5708.7 911.61
## <none> 5693.6 912.81
## - log_CRIM 1 61.35 5755.0 914.06
## - ZN 1 94.93 5788.6 915.82
## - TAX 1 156.44 5850.1 919.02
## - NOX 1 218.57 5912.2 922.23
## - PTRATIO 1 409.44 6103.1 931.85
## - DIS 1 476.41 6170.0 935.16
## - CHAS 1 850.73 6544.4 953.01
## - LSTAT 1 1637.35 7331.0 987.40
## - RM 1 1731.36 7425.0 991.26
##
## Step: AIC=911.52
## MEDV ~ ZN + CHAS + NOX + RM + AGE + DIS + TAX + PTRATIO + LSTAT +
## log_CRIM
##
## Df Sum of Sq RSS AIC
## - AGE 1 13.47 5720.4 910.23
## <none> 5706.9 911.52
## - log_CRIM 1 60.60 5767.5 912.72
## - ZN 1 91.19 5798.1 914.32
## - TAX 1 143.58 5850.5 917.04
## - NOX 1 205.42 5912.3 920.23
## - PTRATIO 1 407.76 6114.7 930.43
## - DIS 1 545.49 6252.4 937.18
## - CHAS 1 864.51 6571.4 952.26
## - LSTAT 1 1624.88 7331.8 985.43
## - RM 1 1730.60 7437.5 989.77
##
## Step: AIC=910.23
## MEDV ~ ZN + CHAS + NOX + RM + DIS + TAX + PTRATIO + LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## <none> 5720.4 910.23
## - log_CRIM 1 56.25 5776.6 911.20
## - ZN 1 101.60 5822.0 913.56
## - TAX 1 136.04 5856.4 915.35
## - NOX 1 229.03 5949.4 920.13
## - PTRATIO 1 410.36 6130.7 929.22
## - DIS 1 562.55 6282.9 936.65
## - CHAS 1 851.07 6571.4 950.26
## - RM 1 1728.68 7449.0 988.24
## - LSTAT 1 1966.60 7687.0 997.77
summary(MLR_backward)
##
## Call:
## lm(formula = MEDV ~ ZN + CHAS + NOX + RM + DIS + TAX + PTRATIO +
## LSTAT + log_CRIM, data = house_train.set[, -c(1, 14)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2491 -2.5433 -0.4525 2.2283 20.7290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.254689 5.952793 4.914 1.48e-06 ***
## ZN 0.037956 0.016639 2.281 0.023255 *
## CHAS 8.045255 1.218528 6.602 1.90e-10 ***
## NOX -15.350337 4.481796 -3.425 0.000703 ***
## RM 4.671773 0.496481 9.410 < 2e-16 ***
## DIS -1.238169 0.230663 -5.368 1.62e-07 ***
## TAX -0.008104 0.003070 -2.640 0.008742 **
## PTRATIO -0.709177 0.154686 -4.585 6.74e-06 ***
## LSTAT -0.574477 0.057239 -10.036 < 2e-16 ***
## log_CRIM 0.473780 0.279122 1.697 0.090685 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.419 on 293 degrees of freedom
## Multiple R-squared: 0.7777, Adjusted R-squared: 0.7708
## F-statistic: 113.9 on 9 and 293 DF, p-value: < 2.2e-16
# Running BOTH selections :
MLR_both <- step(MLR_full, direction = "both")
## Start: AIC=914.67
## MEDV ~ ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX +
## PTRATIO + LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## - RAD 1 2.56 5693.6 912.81
## - AGE 1 12.33 5703.4 913.33
## - INDUS 1 15.65 5706.7 913.51
## - log_CRIM 1 32.71 5723.8 914.41
## <none> 5691.1 914.67
## - ZN 1 91.49 5782.6 917.51
## - TAX 1 102.54 5793.6 918.09
## - NOX 1 219.25 5910.3 924.13
## - PTRATIO 1 390.47 6081.5 932.78
## - DIS 1 474.73 6165.8 936.95
## - CHAS 1 839.65 6530.7 954.37
## - LSTAT 1 1639.58 7330.6 989.38
## - RM 1 1665.07 7356.1 990.44
##
## Step: AIC=912.81
## MEDV ~ ZN + INDUS + CHAS + NOX + RM + AGE + DIS + TAX + PTRATIO +
## LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## - INDUS 1 13.26 5706.9 911.52
## - AGE 1 15.07 5708.7 911.61
## <none> 5693.6 912.81
## - log_CRIM 1 61.35 5755.0 914.06
## + RAD 1 2.56 5691.1 914.67
## - ZN 1 94.93 5788.6 915.82
## - TAX 1 156.44 5850.1 919.02
## - NOX 1 218.57 5912.2 922.23
## - PTRATIO 1 409.44 6103.1 931.85
## - DIS 1 476.41 6170.0 935.16
## - CHAS 1 850.73 6544.4 953.01
## - LSTAT 1 1637.35 7331.0 987.40
## - RM 1 1731.36 7425.0 991.26
##
## Step: AIC=911.52
## MEDV ~ ZN + CHAS + NOX + RM + AGE + DIS + TAX + PTRATIO + LSTAT +
## log_CRIM
##
## Df Sum of Sq RSS AIC
## - AGE 1 13.47 5720.4 910.23
## <none> 5706.9 911.52
## - log_CRIM 1 60.60 5767.5 912.72
## + INDUS 1 13.26 5693.6 912.81
## + RAD 1 0.17 5706.7 913.51
## - ZN 1 91.19 5798.1 914.32
## - TAX 1 143.58 5850.5 917.04
## - NOX 1 205.42 5912.3 920.23
## - PTRATIO 1 407.76 6114.7 930.43
## - DIS 1 545.49 6252.4 937.18
## - CHAS 1 864.51 6571.4 952.26
## - LSTAT 1 1624.88 7331.8 985.43
## - RM 1 1730.60 7437.5 989.77
##
## Step: AIC=910.23
## MEDV ~ ZN + CHAS + NOX + RM + DIS + TAX + PTRATIO + LSTAT + log_CRIM
##
## Df Sum of Sq RSS AIC
## <none> 5720.4 910.23
## - log_CRIM 1 56.25 5776.6 911.20
## + AGE 1 13.47 5706.9 911.52
## + INDUS 1 11.66 5708.7 911.61
## + RAD 1 1.28 5719.1 912.16
## - ZN 1 101.60 5822.0 913.56
## - TAX 1 136.04 5856.4 915.35
## - NOX 1 229.03 5949.4 920.13
## - PTRATIO 1 410.36 6130.7 929.22
## - DIS 1 562.55 6282.9 936.65
## - CHAS 1 851.07 6571.4 950.26
## - RM 1 1728.68 7449.0 988.24
## - LSTAT 1 1966.60 7687.0 997.77
summary(MLR_both)
##
## Call:
## lm(formula = MEDV ~ ZN + CHAS + NOX + RM + DIS + TAX + PTRATIO +
## LSTAT + log_CRIM, data = house_train.set[, -c(1, 14)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2491 -2.5433 -0.4525 2.2283 20.7290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.254689 5.952793 4.914 1.48e-06 ***
## ZN 0.037956 0.016639 2.281 0.023255 *
## CHAS 8.045255 1.218528 6.602 1.90e-10 ***
## NOX -15.350337 4.481796 -3.425 0.000703 ***
## RM 4.671773 0.496481 9.410 < 2e-16 ***
## DIS -1.238169 0.230663 -5.368 1.62e-07 ***
## TAX -0.008104 0.003070 -2.640 0.008742 **
## PTRATIO -0.709177 0.154686 -4.585 6.74e-06 ***
## LSTAT -0.574477 0.057239 -10.036 < 2e-16 ***
## log_CRIM 0.473780 0.279122 1.697 0.090685 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.419 on 293 degrees of freedom
## Multiple R-squared: 0.7777, Adjusted R-squared: 0.7708
## F-statistic: 113.9 on 9 and 293 DF, p-value: < 2.2e-16
# Running the model with the selected predictors
MLR_selected <- lm(MEDV ~ log_CRIM + CHAS + RM + ZN + NOX + DIS + PTRATIO + LSTAT + TAX, data = house_train.set[, -c(1, 14)])
summary(MLR_selected)
##
## Call:
## lm(formula = MEDV ~ log_CRIM + CHAS + RM + ZN + NOX + DIS + PTRATIO +
## LSTAT + TAX, data = house_train.set[, -c(1, 14)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2491 -2.5433 -0.4525 2.2283 20.7290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.254689 5.952793 4.914 1.48e-06 ***
## log_CRIM 0.473780 0.279122 1.697 0.090685 .
## CHAS 8.045255 1.218528 6.602 1.90e-10 ***
## RM 4.671773 0.496481 9.410 < 2e-16 ***
## ZN 0.037956 0.016639 2.281 0.023255 *
## NOX -15.350337 4.481796 -3.425 0.000703 ***
## DIS -1.238169 0.230663 -5.368 1.62e-07 ***
## PTRATIO -0.709177 0.154686 -4.585 6.74e-06 ***
## LSTAT -0.574477 0.057239 -10.036 < 2e-16 ***
## TAX -0.008104 0.003070 -2.640 0.008742 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.419 on 293 degrees of freedom
## Multiple R-squared: 0.7777, Adjusted R-squared: 0.7708
## F-statistic: 113.9 on 9 and 293 DF, p-value: < 2.2e-16
# Creating the log transformation in the validations set :
house_valid.set$log_CRIM = log(house_valid.set$CRIM)
# Predicting the validation data using the initial proposed model :
pred_initial = predict(MLR_transf, house_valid.set)
accuracy(pred_initial, house_valid.set$MEDV)
## ME RMSE MAE MPE MAPE
## Test set 0.3355111 7.32952 4.763903 -2.926922 22.66992
# Predicting the validation data using the new model :
pred_selected = predict(MLR_selected, house_valid.set)
accuracy(pred_selected, house_valid.set$MEDV)
## ME RMSE MAE MPE MAPE
## Test set 0.4453469 5.90066 3.918759 -0.1941773 18.9326
# Comparing with FULL model :
pred_full = predict(MLR_full, house_valid.set)
accuracy(pred_full, house_valid.set$MEDV)
## ME RMSE MAE MPE MAPE
## Test set 0.4771218 5.93313 3.933339 0.03933146 18.98019
cars <- read.csv("ToyotaCorolla.csv", sep = ",", header = TRUE)
str(cars)
## 'data.frame': 1436 obs. of 39 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Model : chr "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" " TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" ...
## $ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
## $ Age_08_04 : int 23 23 24 26 30 32 27 30 27 23 ...
## $ Mfg_Month : int 10 10 9 7 3 1 6 3 6 10 ...
## $ Mfg_Year : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
## $ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
## $ Fuel_Type : chr "Diesel" "Diesel" "Diesel" "Diesel" ...
## $ HP : int 90 90 90 90 90 90 90 90 192 69 ...
## $ Met_Color : int 1 1 1 0 0 0 1 1 0 0 ...
## $ Color : chr "Blue" "Silver" "Blue" "Black" ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CC : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Gears : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Quarterly_Tax : int 210 210 210 210 210 210 210 210 100 185 ...
## $ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
## $ Mfr_Guarantee : int 0 0 1 1 1 0 0 1 0 0 ...
## $ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Guarantee_Period : int 3 3 3 3 3 3 3 3 3 3 ...
## $ ABS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_2 : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Airco : int 0 1 0 0 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Boardcomputer : int 1 1 1 1 1 1 1 1 0 1 ...
## $ CD_Player : int 0 1 0 0 0 0 0 1 0 0 ...
## $ Central_Lock : int 1 1 0 0 1 1 1 1 1 0 ...
## $ Powered_Windows : int 1 0 0 0 1 1 1 1 1 0 ...
## $ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Radio : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Mistlamps : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Sport_Model : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Backseat_Divider : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Metallic_Rim : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Radio_cassette : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Parking_Assistant: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
pander(head(cars))
| Id | Model | Price | Age_08_04 | Mfg_Month | Mfg_Year |
|---|---|---|---|---|---|
| 1 | TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors | 13500 | 23 | 10 | 2002 |
| 2 | TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors | 13750 | 23 | 10 | 2002 |
| 3 | TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors | 13950 | 24 | 9 | 2002 |
| 4 | TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors | 14950 | 26 | 7 | 2002 |
| 5 | TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors | 13750 | 30 | 3 | 2002 |
| 6 | TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors | 12950 | 32 | 1 | 2002 |
| KM | Fuel_Type | HP | Met_Color | Color | Automatic | CC | Doors |
|---|---|---|---|---|---|---|---|
| 46986 | Diesel | 90 | 1 | Blue | 0 | 2000 | 3 |
| 72937 | Diesel | 90 | 1 | Silver | 0 | 2000 | 3 |
| 41711 | Diesel | 90 | 1 | Blue | 0 | 2000 | 3 |
| 48000 | Diesel | 90 | 0 | Black | 0 | 2000 | 3 |
| 38500 | Diesel | 90 | 0 | Black | 0 | 2000 | 3 |
| 61000 | Diesel | 90 | 0 | White | 0 | 2000 | 3 |
| Cylinders | Gears | Quarterly_Tax | Weight | Mfr_Guarantee | BOVAG_Guarantee |
|---|---|---|---|---|---|
| 4 | 5 | 210 | 1165 | 0 | 1 |
| 4 | 5 | 210 | 1165 | 0 | 1 |
| 4 | 5 | 210 | 1165 | 1 | 1 |
| 4 | 5 | 210 | 1165 | 1 | 1 |
| 4 | 5 | 210 | 1170 | 1 | 1 |
| 4 | 5 | 210 | 1170 | 0 | 1 |
| Guarantee_Period | ABS | Airbag_1 | Airbag_2 | Airco | Automatic_airco |
|---|---|---|---|---|---|
| 3 | 1 | 1 | 1 | 0 | 0 |
| 3 | 1 | 1 | 1 | 1 | 0 |
| 3 | 1 | 1 | 1 | 0 | 0 |
| 3 | 1 | 1 | 1 | 0 | 0 |
| 3 | 1 | 1 | 1 | 1 | 0 |
| 3 | 1 | 1 | 1 | 1 | 0 |
| Boardcomputer | CD_Player | Central_Lock | Powered_Windows | Power_Steering |
|---|---|---|---|---|
| 1 | 0 | 1 | 1 | 1 |
| 1 | 1 | 1 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 1 | 1 | 1 |
| 1 | 0 | 1 | 1 | 1 |
| Radio | Mistlamps | Sport_Model | Backseat_Divider | Metallic_Rim |
|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 |
| 0 | 0 | 0 | 1 | 0 |
| 0 | 0 | 0 | 1 | 0 |
| 0 | 0 | 0 | 1 | 0 |
| 0 | 1 | 0 | 1 | 0 |
| 0 | 1 | 0 | 1 | 0 |
| Radio_cassette | Parking_Assistant | Tow_Bar |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 0 | 0 |
set.seed(1)
# Partitioning the data (60% - 40%)
train.obs <- sample(rownames(cars), dim(cars)[1]*0.6)
train.set <- cars[train.obs, c(3,4,7,8,9,12,14,17,19,21,25,26,28,30,34,39)]
valid.obs <- setdiff(rownames(cars), train.obs)
valid.set <- cars[valid.obs, c(3,4,7,8,9,12,14,17,19,21,25,26,28,30,34,39)]
# We only considered the variables which were determined in the prompt!
tree <- rpart(
Price ~ .,
data = train.set,
method = "anova",
control = rpart.control(
cp = 0.001,
minbucket = 1,
maxdepth = 30
)
)
plot_tree = prp(
tree,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10
)
# Checking the variable importance
sort(tree$variable.importance, decreasing = TRUE)
## Age_08_04 Automatic_airco KM Quarterly_Tax
## 9885745349 2987651311 2886703313 1885088412
## HP Guarantee_Period CD_Player Fuel_Type
## 1665897142 430364046 357332146 206914123
## Airco Powered_Windows Doors Mfr_Guarantee
## 122474786 109250988 63966015 21300178
## Automatic Sport_Model
## 19776457 3395399
# First, let's create two vectors, one for the predicted values, and another for the actual values :
predicted_train <- predict(tree, train.set)
actual_train <- train.set$Price
# And lastly, we make use of the RSME formula to calculate it :
RMSE_train = sqrt(mean((predicted_train-actual_train)^2))
RMSE_train
## [1] 986.5121
predicted_valid <- predict(tree, valid.set)
actual_valid <- valid.set$Price
RMSE_valid = sqrt(mean((predicted_valid-actual_valid)^2))
RMSE_valid
## [1] 1192.877
par(mfrow = c(1, 2))
boxplot(
predicted_train,
actual_train,
names = c("Predicted", "Actual"),
ylab = "Price",
xlab = "Training Data"
)
boxplot(
predicted_valid,
actual_valid,
names = c("Predicted", "Actual"),
ylab = "Price",
xlab = "Validation Data"
)
smaller_tree <- rpart(Price ~ ., data = train.set, method = "anova")
plot_smaller_tree = prp(
smaller_tree,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10
)
predicted_valid_small <- predict(smaller_tree, valid.set)
RMSE_valid_small = sqrt(mean((predicted_valid_small-actual_valid)^2))
RMSE_valid_small
## [1] 1341.264
predicted_train_small <- predict(smaller_tree, train.set)
RMSE_train_small = sqrt(mean((predicted_train_small-actual_train)^2))
RMSE_train_small
## [1] 1396.864
RMSE_train_small is bigger than
RMSE_valid_small, which is logical since we pruned the
tree. Now the tree is smaller and it does not overfit the training data
as much as before.xerror).options(scipen = 999)
# Categorizing the price variable into 20 bins
train.set$Price_Bins <- cut(train.set$Price, breaks = 20,
labels=c("1" , "2", "3", "4", "5", "6" , "7", "8", "9","10", "11", "12" ,"13", "14", "15", "16", "17", "18", "19", "20"))
# Taking "continuous" price away
new_train.set = train.set[, -1]
# And now, running the classification tree :
class_tree <- rpart(Price_Bins ~ ., data = new_train.set, method = "class")
plot_class_tree = prp(
class_tree,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10
)
The number of records in each terminal node. Although the numbers are roughly the same in both trees, sometimes they are NOT the same. Nevertheless, in some instances there is no difference at all.
The values for the prices. This is quite expected, since now inside each node we do not have a big number but a small one, indicating not the price per se, but the category in which it is located.
# Creating the new car
new_car <- data.frame(Age_08_04 = 77, KM = 117,000, Fuel_Type = "Petrol", HP = 110, Automatic = 0, Doors = 5, Quarterly_Tax = 100, Mfr_Guarantee = 0, Guarantee_Period = 3, Airco = 1, Automatic_airco = 0, CD_Player = 0, Powered_Windows = 0, Sport_Model = 0, Tow_Bar = 1)
# Predicting the new car's price with the regression tree :
predict(smaller_tree, new_car)
## 1
## 7875.686
# Predicting the new car's price with the regression tree :
predict(class_tree, new_car)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 0.02654867 0.2256637 0.4867257 0.2522124 0.008849558 0 0 0 0 0 0 0 0 0
## 15 16 17 18 19 20
## 1 0 0 0 0 0 0
plot(train.set$Price, train.set$Price_Bins, xlab = "Price", ylab = "Bin number")
price_vs_bins = table(train.set$Price, train.set$Price_Bins)
which(price_vs_bins[, 3]!=0)
## 7200 7250 7350 7400 7450 7490 7495 7500 7600 7750 7795 7800 7900 7950 7995 8000
## 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 8100 8200 8250 8400 8450 8495 8500
## 44 45 46 47 48 49 50
# Just a measure of error (RMSE) :
predicted_train_class <- predict(class_tree, new_train.set)
actual_class <- new_train.set$Price_Bins
RMSE_train_class = sqrt(mean((predicted_train_class-actual_class)^2))
RMSE_train_class
## [1] NA
# The result is `NA`...
set.seed(1)
# First, remove the "Price_Bins" variable :
train.set_nobins <- train.set[, -17]
# Now, we dummify the categorical predictors ("Doors", "Fuel_Type" and "Guarantee_Period") :
train.set.dumm <-
cbind(train.set[1:3], dummy(train.set$Fuel_Type, sep = "_"), train.set[5:6], dummy(train.set$Doors, sep = "_"), train.set[8:9], dummy(train.set$Guarantee_Period, sep = "_"), train.set[11:16])
names(train.set.dumm)[4:6] <- c("Fuel_Type_CNG", "Fuel_Type_Diesel", "Fuel_Type_Petrol")
names(train.set.dumm)[9:12] <- c("Doors_2", "Doors_3", "Doors_4", "Doors_5")
names(train.set.dumm)[15:21] <- c("Guarantee_Period_3", "Guarantee_Period_6", "Guarantee_Period_12", "Guarantee_Period_13", "Guarantee_Period_24", "Guarantee_Period_28", "Guarantee_Period_36")
# "Min-max" normalization :
norm.values.train <- caret::preProcess(train.set_nobins[, c(1:3, 5, 8)], method = "range")
train.set_norm <- predict(norm.values.train, train.set.dumm)
set.seed(1)
# We dummify the categorical predictors ("Doors" and "Guarantee") :
valid.set.dumm <-
cbind(valid.set[1:3], dummy(valid.set$Fuel_Type, sep = "_"), valid.set[5:6], dummy(valid.set$Doors, sep = "_"), valid.set[8:9], dummy(valid.set$Guarantee_Period, sep = "_"), valid.set[11:16])
names(valid.set.dumm)[4:6] <- c("Fuel_Type_CNG", "Fuel_Type_Diesel", "Fuel_Type_Petrol")
names(valid.set.dumm)[9:12] <- c("Doors_2", "Doors_3", "Doors_4", "Doors_5")
names(valid.set.dumm)[15:21] <- c("Guarantee_Period_3", "Guarantee_Period_6", "Guarantee_Period_12", "Guarantee_Period_13", "Guarantee_Period_24", "Guarantee_Period_28", "Guarantee_Period_36")
# "Min-max" normalization :
norm.values.valid <- caret::preProcess(valid.set[, c(1:3, 5, 8)], method = "range")
valid.set_norm <- predict(norm.values.valid, valid.set.dumm)
set.seed(1)
net.train <- neuralnet(Price ~., data = train.set_norm, hidden = 2)
# IMPORTANT NOTE : inside hidden, "2" alone means "ONE layer + 2 nodes"
# Similarly, "c(2,3)" means "TWO layers, with respectively "2" and "3" nodes
# Here we can see the prediction results :
train.norm.pred <- predict(object = net.train, newdata = train.set_norm[,-1])
head(train.norm.pred)
## [,1]
## 1017 0.1628378
## 679 0.1643410
## 129 0.4921746
## 930 0.1816020
## 471 0.2282467
## 299 0.3578104
# Let's plot the neural net :
plot(net.train, rep = "best")
# Creating a "denormalization" function :
denormalize <- function(x, y) {
return (x*(max(y) - min(y)) + min(y))
}
# Choosing only the ORIGINAL prices :
only_price_train = train.set.dumm[, 1]
# Applying the function :
denorm.train = denormalize(train.norm.pred, only_price_train)
head(denorm.train)
## [,1]
## 1017 8933.883
## 679 8976.199
## 129 18204.716
## 930 9462.095
## 471 10775.145
## 299 14422.362
postResample:# First, create a (nx2) data frame with predictions and actual numbers :
df.train = data.frame("Prediction" = denorm.train, "Actual" = only_price_train)
# Now, compute the metrics :
RMSE_train_1 = postResample(denorm.train, only_price_train)
RMSE_train_1
## RMSE Rsquared MAE
## 1056.6133443 0.9198639 790.2279073
# Creating the data frame for comparisons, and already adding the first RMSE :
comparing_df = data.frame("Network_Type" = c("1 layer, 2 nodes", "1 layer, 5 nodes", "2 layers, 5 nodes in each"), "RMSE_train" = as.numeric(c(round(RMSE_train_1[1], 3), "NA", "NA")), "RMSE_valid" = as.numeric(c("NA", "NA", "NA")))
## Warning in data.frame(Network_Type = c("1 layer, 2 nodes", "1 layer, 5 nodes", :
## NAs introducidos por coerción
## Warning in data.frame(Network_Type = c("1 layer, 2 nodes", "1 layer, 5 nodes", :
## NAs introducidos por coerción
pander(comparing_df)
| Network_Type | RMSE_train | RMSE_valid |
|---|---|---|
| 1 layer, 2 nodes | 1057 | NA |
| 1 layer, 5 nodes | NA | NA |
| 2 layers, 5 nodes in each | NA | NA |
set.seed(1)
net.valid <- neuralnet(Price ~., data = valid.set_norm, hidden = 2)
valid.norm.pred <- predict(object = net.valid, newdata = valid.set_norm[,-1])
head(valid.norm.pred)
## [,1]
## 1 0.5912632
## 2 0.5211129
## 4 0.5382568
## 9 0.7391759
## 10 0.4411255
## 12 0.9224447
# Let's plot the neural net :
plot(net.valid, rep = "best")
# Applying the denormalization function :
only_price_valid = valid.set.dumm[, 1]
denorm.valid = denormalize(valid.norm.pred, only_price_valid)
head(denorm.valid)
## [,1]
## 1 16306.62
## 2 14994.81
## 4 15315.40
## 9 19072.59
## 10 13499.05
## 12 22499.71
# Creating a (nx2) data frame with predictions and actual numbers :
df.valid = data.frame("Prediction" = denorm.valid, "Actual" = only_price_valid)
# Now, compute the metrics :
RMSE_valid_1 = postResample(denorm.valid, only_price_valid)
RMSE_valid_1
## RMSE Rsquared MAE
## 927.6498088 0.9279509 726.5968885
comparing_df[1, 3] <- RMSE_valid_1[1]
pander(comparing_df)
| Network_Type | RMSE_train | RMSE_valid |
|---|---|---|
| 1 layer, 2 nodes | 1057 | 927.6 |
| 1 layer, 5 nodes | NA | NA |
| 2 layers, 5 nodes in each | NA | NA |
# Creating the function for the training data :
RMSE.for.train <- function(x) {
set.seed(1)
net.train <-
neuralnet(Price ~ ., data = train.set_norm, hidden = x)
train.norm.pred <-
predict(object = net.train, newdata = train.set_norm[, -1])
only_price_train = train.set.dumm[, 1]
denorm.train = denormalize(train.norm.pred, only_price_train)
df.train = data.frame("Prediction" = denorm.train, "Actual" = only_price_train)
RMSE_train = postResample(denorm.train, only_price_train)
return (RMSE_train[1])
}
# And same for the validation data :
RMSE.for.valid <- function(x) {
set.seed(1)
net.valid <-
neuralnet(Price ~ ., data = valid.set_norm, hidden = x)
valid.norm.pred <-
predict(object = net.valid, newdata = valid.set_norm[, -1])
only_price_train = valid.set.dumm[, 1]
denorm.valid = denormalize(valid.norm.pred, only_price_train)
df.valid = data.frame("Prediction" = denorm.valid, "Actual" = only_price_valid)
RMSE_valid = postResample(denorm.valid, only_price_valid)
return (RMSE_valid[1])
}
comparing_df data frame using
our function!# One layer, 5 nodes :
comparing_df[2, 2] <- RMSE.for.train(5)
comparing_df[2, 3] <- RMSE.for.valid(5)
# Two layers, 5 nodes each :
comparing_df[3, 2] <- RMSE.for.train(c(5, 5))
comparing_df[3, 3] <- RMSE.for.valid(c(5, 5))
# The FINAL table!
pander(comparing_df)
| Network_Type | RMSE_train | RMSE_valid |
|---|---|---|
| 1 layer, 2 nodes | 1057 | 927.6 |
| 1 layer, 5 nodes | 905.9 | 783.2 |
| 2 layers, 5 nodes in each | 948.8 | 815.1 |
pander(comparing_df)
| Network_Type | RMSE_train | RMSE_valid |
|---|---|---|
| 1 layer, 2 nodes | 1057 | 927.6 |
| 1 layer, 5 nodes | 905.9 | 783.2 |
| 2 layers, 5 nodes in each | 948.8 | 815.1 |
pander(comparing_df)
| Network_Type | RMSE_train | RMSE_valid |
|---|---|---|
| 1 layer, 2 nodes | 1057 | 927.6 |
| 1 layer, 5 nodes | 905.9 | 783.2 |
| 2 layers, 5 nodes in each | 948.8 | 815.1 |