Random Forest para casas Boston

Cargar librerías

library(randomForest) # Para Random Forest
library(caret) # Para hacer divisiones
library(readr)
library(dplyr)
library(reshape)    # Para renombrar columnas

Cargar los datos

# Print the current working directory; the relative path given to
# read.csv() is resolved against it.
cat("Directorio de trabajo", getwd())
## Directorio de trabajo C:/Users/Usuario/Documents/Mis clases ITD/Semestre Enero Junio 2020/Analisis Inteligente de Datos/markdown
# Read the Boston housing CSV from the sibling "datos" directory into a
# data frame, then preview its first six rows.
datos <- read.csv(file = "../datos/BostonHousing.csv")
head(x = datos, n = 6L)
##      CRIM ZN INDUS CHAS   NOX    RM  AGE    DIS RAD TAX PTRATIO      B LSTAT
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   MEDV
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

Explorar datos

# Compact structure of the data frame: dimensions plus each column's type
# and first values.
str(datos)
## 'data.frame':    506 obs. of  14 variables:
##  $ CRIM   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ ZN     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ INDUS  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOX    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ RM     : num  6.58 6.42 7.18 7 7.15 ...
##  $ AGE    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ DIS    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ RAD    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ TAX    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ PTRATIO: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ B      : num  397 397 393 395 397 ...
##  $ LSTAT  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ MEDV   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(datos)
##       CRIM                ZN             INDUS            CHAS        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       NOX               RM             AGE              DIS        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       RAD              TAX           PTRATIO            B         
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      LSTAT            MEDV      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

Hacer conjunto de entrenamiento y conjunto de validación

# Build the train/validation split from the response MEDV. The fixed seed
# makes the partition reproducible; p = 0.7 sends 70% of the rows to the
# training set, and list = FALSE returns the row indices as a matrix
# rather than a list.
set.seed(2020)
entrena <- createDataPartition(y = datos$MEDV, p = 0.7, list = FALSE)

# Rows picked by the partition train the model; all remaining rows are
# held out for validation.
datos.Entrena <- datos[entrena, ]
datos.Valida <- datos[-entrena, ]

# Preview the first 15 training rows.
head(datos.Entrena, n = 15)
##       CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO      B
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8 396.90
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7 394.63
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7 394.12
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5 311    15.2 386.71
## 12 0.11747 12.5  7.87    0 0.524 6.009  82.9 6.2267   5 311    15.2 396.90
## 13 0.09378 12.5  7.87    0 0.524 5.889  39.0 5.4509   5 311    15.2 390.50
## 14 0.62976  0.0  8.14    0 0.538 5.949  61.8 4.7075   4 307    21.0 396.90
## 15 0.63796  0.0  8.14    0 0.538 6.096  84.5 4.4619   4 307    21.0 380.02
## 16 0.62739  0.0  8.14    0 0.538 5.834  56.5 4.4986   4 307    21.0 395.62
##    LSTAT MEDV
## 1   4.98 24.0
## 2   9.14 21.6
## 3   4.03 34.7
## 4   2.94 33.4
## 5   5.33 36.2
## 6   5.21 28.7
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 10 17.10 18.9
## 12 13.27 18.9
## 13 15.71 21.7
## 14  8.26 20.4
## 15 10.26 18.2
## 16  8.47 19.9
# Preview the first 15 validation rows.
head(datos.Valida, 15)
##       CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO      B
## 11 0.22489 12.5  7.87    0 0.524 6.377  94.3 6.3467   5 311    15.2 392.52
## 18 0.78420  0.0  8.14    0 0.538 5.990  81.7 4.2579   4 307    21.0 386.75
## 19 0.80271  0.0  8.14    0 0.538 5.456  36.6 3.7965   4 307    21.0 288.99
## 22 0.85204  0.0  8.14    0 0.538 5.965  89.2 4.0123   4 307    21.0 392.53
## 28 0.95577  0.0  8.14    0 0.538 6.047  88.8 4.4534   4 307    21.0 306.38
## 32 1.35472  0.0  8.14    0 0.538 6.072 100.0 4.1750   4 307    21.0 376.73
## 33 1.38799  0.0  8.14    0 0.538 5.950  82.0 3.9900   4 307    21.0 232.60
## 35 1.61282  0.0  8.14    0 0.538 6.096  96.9 3.7598   4 307    21.0 248.31
## 36 0.06417  0.0  5.96    0 0.499 5.933  68.2 3.3603   5 279    19.2 396.90
## 42 0.12744  0.0  6.91    0 0.448 6.770   2.9 5.7209   3 233    17.9 385.41
## 43 0.14150  0.0  6.91    0 0.448 6.169   6.6 5.7209   3 233    17.9 383.37
## 46 0.17142  0.0  6.91    0 0.448 5.682  33.8 5.1004   3 233    17.9 396.90
## 48 0.22927  0.0  6.91    0 0.448 6.030  85.5 5.6894   3 233    17.9 392.74
## 53 0.05360 21.0  5.64    0 0.439 6.511  21.1 6.8147   4 243    16.8 396.90
## 60 0.10328 25.0  5.13    0 0.453 5.927  47.2 6.9320   8 284    19.7 396.90
##    LSTAT MEDV
## 11 20.45 15.0
## 18 14.67 17.5
## 19 11.69 20.2
## 22 13.83 19.6
## 28 17.28 14.8
## 32 13.04 14.5
## 33 27.71 13.2
## 35 20.34 13.5
## 36  9.68 18.9
## 42  4.84 26.6
## 43  5.81 25.3
## 46 10.21 19.3
## 48 18.80 16.6
## 53  5.28 25.0
## 60  9.22 19.6

Construir el modelo de Random Forest

# Fit a 1000-tree random forest with the first 13 columns as predictors
# and column 14 as the response. The validation split is supplied as
# xtest/ytest so the fit also reports test-set error. importance = TRUE
# computes the variable-importance measures, and keep.forest = TRUE
# retains the trees so predict() can be called on the fit afterwards.
modelo <- randomForest(
  x = datos.Entrena[, 1:13],
  y = datos.Entrena[, 14],
  ntree = 1000,
  xtest = datos.Valida[, 1:13],
  ytest = datos.Valida[, 14],
  importance = TRUE,
  keep.forest = TRUE
)

# Print the fit summary (forest type, tree count, training and test error).
modelo
## 
## Call:
##  randomForest(x = datos.Entrena[, 1:13], y = datos.Entrena[, 14],      xtest = datos.Valida[, 1:13], ytest = datos.Valida[, 14],      ntree = 1000, importance = TRUE, keep.forest = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.97907
##                     % Var explained: 86.49
##                        Test set MSE: 9.14
##                     % Var explained: 87.72
# Importance matrix stored on the fitted model: one row per predictor,
# with %IncMSE and IncNodePurity columns.
modelo$importance
##            %IncMSE IncNodePurity
## CRIM     9.1209731     2014.4919
## ZN       0.6899801      249.6783
## INDUS    5.1589892     1287.9252
## CHAS     0.7447949      207.2129
## NOX     11.0733940     2458.6715
## RM      31.2493666     8230.4467
## AGE      3.9633101      866.2132
## DIS      7.6970953     1981.2095
## RAD      1.2821717      257.0120
## TAX      4.5394163      944.1180
## PTRATIO  6.7907138     1760.4709
## B        2.0575237      702.9492
## LSTAT   66.2273159     9997.7758

Graficando el modelo

# Predicted vs. actual MEDV on the training split. Points lying on the
# identity line drawn by abline() are exact predictions.
plot(
  x = datos.Entrena$MEDV,
  y = predict(modelo, newdata = datos.Entrena),
  xlab = "Actual",
  ylab = "Predichos"
)
abline(a = 0, b = 1)

# Same comparison on the held-out validation split.
plot(
  x = datos.Valida$MEDV,
  y = predict(modelo, newdata = datos.Valida),
  xlab = "Actual",
  ylab = "Predichos"
)
abline(a = 0, b = 1)

Predicciones específicas

# First six validation rows, including the actual MEDV values, for
# comparison with the model's predictions.
head(datos.Valida)
##       CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO      B
## 11 0.22489 12.5  7.87    0 0.524 6.377  94.3 6.3467   5 311    15.2 392.52
## 18 0.78420  0.0  8.14    0 0.538 5.990  81.7 4.2579   4 307    21.0 386.75
## 19 0.80271  0.0  8.14    0 0.538 5.456  36.6 3.7965   4 307    21.0 288.99
## 22 0.85204  0.0  8.14    0 0.538 5.965  89.2 4.0123   4 307    21.0 392.53
## 28 0.95577  0.0  8.14    0 0.538 6.047  88.8 4.4534   4 307    21.0 306.38
## 32 1.35472  0.0  8.14    0 0.538 6.072 100.0 4.1750   4 307    21.0 376.73
##    LSTAT MEDV
## 11 20.45 15.0
## 18 14.67 17.5
## 19 11.69 20.2
## 22 13.83 19.6
## 28 17.28 14.8
## 32 13.04 14.5
# Model predictions for the first six validation rows, labelled by the
# original row names.
head(predict(modelo, newdata = datos.Valida))
##       11       18       19       22       28       32 
## 21.18267 17.97345 18.29781 18.73298 15.64002 18.57603