We have data on a number of houses. Let's look at the number of NAs in each column.
library(tidyverse)
library(caret)
library(caTools)
library(rpart)
library(randomForest)
library(rattle)
library(DataExplorer)
library(scales)
library(kableExtra)
data <- read.csv("C:/Users/luisa/Downloads/house_prices.csv",
fileEncoding = "UTF-8-BOM")
kbl(colSums(is.na(data))) %>%
  kable_paper() %>%
  scroll_box(width = "900px", height = "200px")

| Variable | NAs |
|---|---|
| Id | 0 |
| MSSubClass | 0 |
| MSZoning | 0 |
| LotFrontage | 259 |
| LotArea | 0 |
| Street | 0 |
| Alley | 1369 |
| LotShape | 0 |
| LandContour | 0 |
| Utilities | 0 |
| LotConfig | 0 |
| LandSlope | 0 |
| Neighborhood | 0 |
| Condition1 | 0 |
| Condition2 | 0 |
| BldgType | 0 |
| HouseStyle | 0 |
| OverallQual | 0 |
| OverallCond | 0 |
| YearBuilt | 0 |
| YearRemodAdd | 0 |
| RoofStyle | 0 |
| RoofMatl | 0 |
| Exterior1st | 0 |
| Exterior2nd | 0 |
| MasVnrType | 8 |
| MasVnrArea | 8 |
| ExterQual | 0 |
| ExterCond | 0 |
| Foundation | 0 |
| BsmtQual | 37 |
| BsmtCond | 37 |
| BsmtExposure | 38 |
| BsmtFinType1 | 37 |
| BsmtFinSF1 | 0 |
| BsmtFinType2 | 38 |
| BsmtFinSF2 | 0 |
| BsmtUnfSF | 0 |
| TotalBsmtSF | 0 |
| Heating | 0 |
| HeatingQC | 0 |
| CentralAir | 0 |
| Electrical | 1 |
| X1stFlrSF | 0 |
| X2ndFlrSF | 0 |
| LowQualFinSF | 0 |
| GrLivArea | 0 |
| BsmtFullBath | 0 |
| BsmtHalfBath | 0 |
| FullBath | 0 |
| HalfBath | 0 |
| BedroomAbvGr | 0 |
| KitchenAbvGr | 0 |
| KitchenQual | 0 |
| TotRmsAbvGrd | 0 |
| Functional | 0 |
| Fireplaces | 0 |
| FireplaceQu | 690 |
| GarageType | 81 |
| GarageYrBlt | 81 |
| GarageFinish | 81 |
| GarageCars | 0 |
| GarageArea | 0 |
| GarageQual | 81 |
| GarageCond | 81 |
| PavedDrive | 0 |
| WoodDeckSF | 0 |
| OpenPorchSF | 0 |
| EnclosedPorch | 0 |
| X3SsnPorch | 0 |
| ScreenPorch | 0 |
| PoolArea | 0 |
| PoolQC | 1453 |
| Fence | 1179 |
| MiscFeature | 1406 |
| MiscVal | 0 |
| MoSold | 0 |
| YrSold | 0 |
| SaleType | 0 |
| SaleCondition | 0 |
| SalePrice | 0 |
We clean the data:
# Drop the Id column, which carries no predictive information
datos <- data[,-1]
# In these columns NA encodes the absence of the feature rather than a
# missing value, so recode it as an explicit level (or 0 for LotFrontage)
datos$Alley[is.na(datos$Alley)] <- "No Alley"
datos$LotFrontage[is.na(datos$LotFrontage)] <- 0
datos$BsmtQual[is.na(datos$BsmtQual)] <- "No Basement"
datos$BsmtCond[is.na(datos$BsmtCond)] <- "No Basement"
datos$BsmtExposure[is.na(datos$BsmtExposure)] <- "No Basement"
datos$BsmtFinType1[is.na(datos$BsmtFinType1)] <- "No Basement"
datos$BsmtFinType2[is.na(datos$BsmtFinType2)] <- "No Basement"
datos$FireplaceQu[is.na(datos$FireplaceQu)] <- "No Fireplace"
datos$PoolQC[is.na(datos$PoolQC)] <- "No Pool"
datos$Fence[is.na(datos$Fence)] <- "No Fence"
datos$MiscFeature[is.na(datos$MiscFeature)] <- "None"
# Drop the rows that still contain genuine NAs
datos <- na.omit(datos)

And let's look at the structure of the new data.
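The structure summary below appears without its generating chunk; presumably it comes from `introduce()` in DataExplorer (already loaded above), which reports exactly these fields:

# Presumed call (chunk not echoed): summarize rows, columns,
# discrete/continuous counts, and missingness with DataExplorer
introduce(datos)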
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 1370 80 43 37 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 1370 109600 710968
We now fit a regression tree to SalePrice, including all the variables.
set.seed(123)
split <- sample.split(datos$SalePrice, SplitRatio = 0.7)
train <- subset(datos, split == TRUE)
test <- subset(datos, split == FALSE)
tree <- rpart(SalePrice ~ ., data = train, method = 'anova',
              control = rpart.control(minsplit = 50, cp = 0.0001))
printcp(tree)
##
## Regression tree:
## rpart(formula = SalePrice ~ ., data = train, method = "anova",
## control = rpart.control(minsplit = 50, cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] BedroomAbvGr BsmtFinSF1 BsmtFullBath BsmtUnfSF Fireplaces
## [6] GarageArea GarageType GrLivArea Neighborhood OpenPorchSF
## [11] OverallCond OverallQual SaleType TotalBsmtSF WoodDeckSF
## [16] X1stFlrSF YearBuilt YearRemodAdd
##
## Root node error: 7.45e+12/1051 = 7088443518
##
## n= 1051
##
## CP nsplit rel error xerror xstd
## 1 0.47491499 0 1.00000 1.00161 0.088449
## 2 0.10614012 1 0.52509 0.52767 0.048232
## 3 0.07447327 2 0.41894 0.42188 0.046634
## 4 0.03115704 3 0.34447 0.36190 0.033022
## 5 0.02327429 4 0.31331 0.33568 0.032788
## 6 0.02118925 5 0.29004 0.30742 0.030139
## 7 0.01124349 6 0.26885 0.28797 0.029582
## 8 0.01003729 7 0.25761 0.27962 0.029649
## 9 0.00921973 8 0.24757 0.27326 0.029599
## 10 0.00795091 9 0.23835 0.27014 0.029610
## 11 0.00778853 10 0.23040 0.26634 0.029543
## 12 0.00727191 11 0.22261 0.26471 0.029520
## 13 0.00461859 12 0.21534 0.25238 0.029210
## 14 0.00399566 13 0.21072 0.25375 0.029496
## 15 0.00395063 14 0.20672 0.25084 0.029477
## 16 0.00372239 15 0.20277 0.25141 0.029492
## 17 0.00294424 16 0.19905 0.24818 0.029470
## 18 0.00192409 17 0.19611 0.24503 0.029438
## 19 0.00168560 18 0.19418 0.24318 0.029424
## 20 0.00147635 19 0.19250 0.24215 0.029394
## 21 0.00136776 20 0.19102 0.24238 0.029391
## 22 0.00127960 21 0.18965 0.24066 0.029388
## 23 0.00122897 22 0.18837 0.23991 0.029386
## 24 0.00108783 23 0.18715 0.23942 0.029388
## 25 0.00097237 24 0.18606 0.23772 0.029380
## 26 0.00093992 25 0.18509 0.23741 0.029376
## 27 0.00093871 26 0.18415 0.23729 0.029377
## 28 0.00080569 27 0.18321 0.23671 0.029383
## 29 0.00058327 28 0.18240 0.23592 0.029384
## 30 0.00054965 29 0.18182 0.23539 0.029382
## 31 0.00050335 30 0.18127 0.23534 0.029382
## 32 0.00041654 31 0.18076 0.23520 0.029387
## 33 0.00035842 32 0.18035 0.23467 0.029389
## 34 0.00010000 33 0.17999 0.23391 0.029391
From the above we keep only the variables the tree actually uses, and look at the importance of the variables in this model.
tree <- rpart(SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath + BsmtUnfSF +
Fireplaces + GarageArea + GarageType + GrLivArea + Neighborhood +
OpenPorchSF + OverallCond + OverallQual + SaleType + TotalBsmtSF +
WoodDeckSF + X1stFlrSF + YearBuilt + YearRemodAdd, data = train,
method = 'anova',
control = rpart.control(minsplit = 50, cp = 0.0001))
tibble(cbind(names(tree$variable.importance), comma(tree$variable.importance)))
## # A tibble: 18 x 1
## `cbind(names(tree$variable.importance), comma(tree$variable.i~ [,2]
## <chr> <chr>
## 1 OverallQual 4,908,293,032~
## 2 Neighborhood 2,034,486,543~
## 3 TotalBsmtSF 1,551,045,779~
## 4 GrLivArea 1,313,629,719~
## 5 X1stFlrSF 1,241,547,657~
## 6 GarageArea 1,188,848,207~
## 7 YearBuilt 474,955,490,8~
## 8 BsmtFinSF1 280,268,404,0~
## 9 BedroomAbvGr 234,863,274,2~
## 10 YearRemodAdd 196,982,455,4~
## 11 BsmtUnfSF 191,459,506,7~
## 12 SaleType 112,830,519,3~
## 13 WoodDeckSF 97,421,825,716
## 14 GarageType 70,395,502,873
## 15 Fireplaces 60,818,916,189
## 16 OpenPorchSF 30,822,259,401
## 17 BsmtFullBath 22,418,879,992
## 18 OverallCond 18,339,098,572
We can see that the five most important variables in the tree are: overall material and finish quality (OverallQual), the neighborhood (Neighborhood), total basement area in square feet (TotalBsmtSF), above-ground living area in square feet (GrLivArea), and first-floor area in square feet (X1stFlrSF). In other words, the higher the material quality, basement area, living area, and first-floor area, combined with being in a good neighborhood, the higher the sale price of a house.
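As a quick check, these top entries can be read straight off the fitted object, since rpart stores variable.importance in decreasing order (a one-liner not in the original output):

head(tree$variable.importance, 5)  # top 5 variables by importance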
The model shows a decent, though not excellent, level of accuracy. This is most visible in the \(R^2\) and the relative error: for an anova tree the relative error is \(1 - R^2\), so the cross-validated xerror of about 0.26 at the bottom of the table corresponds to an \(R^2\) of roughly 0.74.
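The complexity-parameter table below appears without its chunk; presumably it is the output of `printcp()` on the refit tree:

printcp(tree)  # presumed call; the chunk was not echoed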
##
## Regression tree:
## rpart(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath +
## BsmtUnfSF + Fireplaces + GarageArea + GarageType + GrLivArea +
## Neighborhood + OpenPorchSF + OverallCond + OverallQual +
## SaleType + TotalBsmtSF + WoodDeckSF + X1stFlrSF + YearBuilt +
## YearRemodAdd, data = train, method = "anova", control = rpart.control(minsplit = 50,
## cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] BedroomAbvGr BsmtFinSF1 BsmtFullBath BsmtUnfSF Fireplaces
## [6] GarageArea GarageType GrLivArea Neighborhood OpenPorchSF
## [11] OverallCond OverallQual SaleType TotalBsmtSF WoodDeckSF
## [16] X1stFlrSF YearBuilt YearRemodAdd
##
## Root node error: 7.45e+12/1051 = 7088443518
##
## n= 1051
##
## CP nsplit rel error xerror xstd
## 1 0.47491499 0 1.00000 1.00090 0.088438
## 2 0.10614012 1 0.52509 0.52857 0.048258
## 3 0.07447327 2 0.41894 0.42266 0.046677
## 4 0.03115704 3 0.34447 0.38675 0.035382
## 5 0.02327429 4 0.31331 0.35427 0.035104
## 6 0.02118925 5 0.29004 0.33240 0.033840
## 7 0.01124349 6 0.26885 0.31318 0.033354
## 8 0.01003729 7 0.25761 0.30578 0.033410
## 9 0.00921973 8 0.24757 0.29685 0.033383
## 10 0.00795091 9 0.23835 0.29217 0.033404
## 11 0.00778853 10 0.23040 0.29099 0.033380
## 12 0.00727191 11 0.22261 0.28514 0.033274
## 13 0.00461859 12 0.21534 0.27665 0.033174
## 14 0.00399566 13 0.21072 0.27419 0.033093
## 15 0.00395063 14 0.20672 0.27291 0.033156
## 16 0.00372239 15 0.20277 0.27220 0.033148
## 17 0.00294424 16 0.19905 0.26870 0.033138
## 18 0.00192409 17 0.19611 0.26522 0.033117
## 19 0.00168560 18 0.19418 0.26399 0.033115
## 20 0.00147635 19 0.19250 0.26411 0.033137
## 21 0.00136776 20 0.19102 0.26432 0.033135
## 22 0.00127960 21 0.18965 0.26427 0.033140
## 23 0.00122897 22 0.18837 0.26411 0.033141
## 24 0.00108783 23 0.18715 0.26375 0.033142
## 25 0.00097237 24 0.18606 0.26261 0.033145
## 26 0.00093992 25 0.18509 0.26240 0.033146
## 27 0.00093871 26 0.18415 0.26240 0.033146
## 28 0.00080569 27 0.18321 0.26187 0.033146
## 29 0.00058327 28 0.18240 0.26108 0.033147
## 30 0.00054965 29 0.18182 0.26056 0.033143
## 31 0.00050335 30 0.18127 0.26027 0.033145
## 32 0.00041654 31 0.18076 0.26088 0.033160
## 33 0.00035842 32 0.18035 0.26078 0.033162
## 34 0.00010000 33 0.17999 0.26008 0.033164
Now for the predictions: we plot the observed sale prices against the predicted ones, coloring each point by the difference between them.
pred <- predict(tree, newdata = test[,-80])
names(pred) <- NULL
dif <- abs(test$SalePrice - pred)
df_1 <- data.frame(Observed = test$SalePrice, Predicted = pred,
                   Difference = dif)
summary(df_1$Difference)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 192 8143 15716 23977 30500 244830
ggplot(df_1, aes(Observed, Predicted, color = Difference)) + geom_point() +
  scale_color_gradient(low = "lavenderblush", high = "red", labels = comma, limits = c(0, 250000)) +
  scale_y_continuous(labels = comma, limits = c(0, 420000)) +
  scale_x_continuous(labels = comma, limits = c(0, 420000)) +
  geom_abline() + theme_minimal()

And, out of curiosity, this is what our tree looks like (barely legible):
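The figure itself is not reproduced here; since rattle is loaded above, the tree was presumably drawn with something like:

fancyRpartPlot(tree)  # presumed plotting call from the rattle package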
Now we fit a random forest to the same training data.
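The fitting chunk is not echoed, but the call appears in the output below; a sketch of the presumed chunk (the seed value is an assumption):

set.seed(123)  # assumed; the original seed for this chunk is not shown
rtree <- randomForest(SalePrice ~ ., data = train)
rtree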
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 913825819
## % Var explained: 87.11
Let's look at the importance of the variables in this model.
imp <- data.frame(Variable = rownames(rtree$importance), rtree$importance)
imp <- arrange(imp, desc(IncNodePurity))
rownames(imp) <- NULL
tibble(cbind(imp$Variable, comma(imp$IncNodePurity)))
## # A tibble: 79 x 1
## `cbind(imp$Variable, comma(imp$IncNodePurity))`[,1] [,2]
## <chr> <chr>
## 1 OverallQual 1,960,711,015,744
## 2 GrLivArea 1,069,094,809,943
## 3 GarageCars 595,351,646,999
## 4 TotalBsmtSF 528,831,258,998
## 5 X1stFlrSF 337,355,520,461
## 6 ExterQual 335,495,230,123
## 7 YearBuilt 266,614,185,772
## 8 GarageArea 226,717,426,909
## 9 X2ndFlrSF 216,642,666,916
## 10 BsmtQual 175,814,521,922
## # ... with 69 more rows
Once again, among the five most important variables we find overall material and finish quality (OverallQual), above-ground living area in square feet (GrLivArea), total basement area in square feet (TotalBsmtSF), and first-floor area in square feet (X1stFlrSF); unlike the regression tree, the random forest ranks garage size in car capacity (GarageCars) as important instead of the neighborhood.
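For a quick visual check of these rankings, randomForest's `varImpPlot()` can plot the same importances (a sketch, not part of the original output):

varImpPlot(rtree, n.var = 10, main = "Variable importance (random forest)")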
Finally, here is the same plot of predicted versus observed prices.
predf <- predict(rtree, newdata = test[,-80])
names(predf) <- NULL
difr <- abs(test$SalePrice - predf)
dfrf <- data.frame(Observed = test$SalePrice, Predicted = predf,
Difference = difr)
summary(dfrf$Difference)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 87.45 4867.77 11038.45 15774.68 19783.85 279045.89
ggplot(dfrf, aes(Observed, Predicted, color = Difference)) + geom_point() +
  scale_color_gradient(low = "lavenderblush", high = "red", labels = comma, limits = c(0, 300000)) +
  scale_y_continuous(labels = comma, limits = c(0, 450000)) +
  scale_x_continuous(labels = comma, limits = c(0, 450000)) +
  geom_abline() + theme_minimal()

We can see that the random forest predicts prices better overall: the absolute difference between the actual and predicted prices is smaller than with the regression tree. The random forest does, however, have one value it predicts worse, which is exactly where the maximum difference occurs.
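To put a number on that claim, one could compare overall error metrics using the prediction vectors already computed (a sketch, not part of the original analysis):

# RMSE and MAE on the test set for both models
metrics <- data.frame(
  Model = c("Regression tree", "Random forest"),
  RMSE = c(sqrt(mean((test$SalePrice - pred)^2)),
           sqrt(mean((test$SalePrice - predf)^2))),
  MAE = c(mean(abs(test$SalePrice - pred)),
          mean(abs(test$SalePrice - predf))))
metrics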
This is even clearer in the following boxplot.
# Label each set of differences by the model that produced it
df <- data.frame(Difference = c(df_1$Difference, dfrf$Difference),
                 Type = c(rep("Tree", nrow(df_1)), rep("Random Forest", nrow(dfrf))))
ggplot(df, aes(x = Type, y = Difference, fill = Type)) + geom_boxplot() +
  geom_jitter(width = 0.05, alpha = 0.1) + theme_minimal() +
  scale_y_continuous(labels = comma, limits = c(0, 300000))