We have data on a number of houses. Let's look at the number of NAs in each column.
library(tidyverse)
library(caret)
library(caTools)
library(rpart)
library(randomForest)
library(rattle)
library(DataExplorer)
library(scales)
library(kableExtra)
data <- read.csv("C:/Users/luisa/Downloads/house_prices.csv",
fileEncoding = "UTF-8-BOM")
kbl(colSums(is.na(data))) %>%
  kable_paper() %>%
  scroll_box(width = "900px", height = "200px")

| Variable | NAs |
|---|---|
| Id | 0 |
| MSSubClass | 0 |
| MSZoning | 0 |
| LotFrontage | 259 |
| LotArea | 0 |
| Street | 0 |
| Alley | 1369 |
| LotShape | 0 |
| LandContour | 0 |
| Utilities | 0 |
| LotConfig | 0 |
| LandSlope | 0 |
| Neighborhood | 0 |
| Condition1 | 0 |
| Condition2 | 0 |
| BldgType | 0 |
| HouseStyle | 0 |
| OverallQual | 0 |
| OverallCond | 0 |
| YearBuilt | 0 |
| YearRemodAdd | 0 |
| RoofStyle | 0 |
| RoofMatl | 0 |
| Exterior1st | 0 |
| Exterior2nd | 0 |
| MasVnrType | 8 |
| MasVnrArea | 8 |
| ExterQual | 0 |
| ExterCond | 0 |
| Foundation | 0 |
| BsmtQual | 37 |
| BsmtCond | 37 |
| BsmtExposure | 38 |
| BsmtFinType1 | 37 |
| BsmtFinSF1 | 0 |
| BsmtFinType2 | 38 |
| BsmtFinSF2 | 0 |
| BsmtUnfSF | 0 |
| TotalBsmtSF | 0 |
| Heating | 0 |
| HeatingQC | 0 |
| CentralAir | 0 |
| Electrical | 1 |
| X1stFlrSF | 0 |
| X2ndFlrSF | 0 |
| LowQualFinSF | 0 |
| GrLivArea | 0 |
| BsmtFullBath | 0 |
| BsmtHalfBath | 0 |
| FullBath | 0 |
| HalfBath | 0 |
| BedroomAbvGr | 0 |
| KitchenAbvGr | 0 |
| KitchenQual | 0 |
| TotRmsAbvGrd | 0 |
| Functional | 0 |
| Fireplaces | 0 |
| FireplaceQu | 690 |
| GarageType | 81 |
| GarageYrBlt | 81 |
| GarageFinish | 81 |
| GarageCars | 0 |
| GarageArea | 0 |
| GarageQual | 81 |
| GarageCond | 81 |
| PavedDrive | 0 |
| WoodDeckSF | 0 |
| OpenPorchSF | 0 |
| EnclosedPorch | 0 |
| X3SsnPorch | 0 |
| ScreenPorch | 0 |
| PoolArea | 0 |
| PoolQC | 1453 |
| Fence | 1179 |
| MiscFeature | 1406 |
| MiscVal | 0 |
| MoSold | 0 |
| YrSold | 0 |
| SaleType | 0 |
| SaleCondition | 0 |
| SalePrice | 0 |
We clean the data:
# Drop the Id column, which carries no predictive information
datos <- data[,-1]
# In these columns NA encodes the absence of the feature rather than a
# missing value, so recode it as an explicit level (or 0 for LotFrontage)
datos$Alley[is.na(datos$Alley)] <- "No Alley"
datos$LotFrontage[is.na(datos$LotFrontage)] <- 0
datos$BsmtQual[is.na(datos$BsmtQual)] <- "No Basement"
datos$BsmtCond[is.na(datos$BsmtCond)] <- "No Basement"
datos$BsmtExposure[is.na(datos$BsmtExposure)] <- "No Basement"
datos$BsmtFinType1[is.na(datos$BsmtFinType1)] <- "No Basement"
datos$BsmtFinType2[is.na(datos$BsmtFinType2)] <- "No Basement"
datos$FireplaceQu[is.na(datos$FireplaceQu)] <- "No Fireplace"
datos$PoolQC[is.na(datos$PoolQC)] <- "No Pool"
datos$Fence[is.na(datos$Fence)] <- "No Fence"
datos$MiscFeature[is.na(datos$MiscFeature)] <- "None"
# Drop the rows that still contain genuine NAs
datos <- na.omit(datos)

And let's look at the structure of the new data.
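The structure summary below appears without its generating chunk; presumably it comes from `introduce()` in DataExplorer (already loaded above), which reports exactly these fields:

# Presumed call (chunk not echoed): summarize rows, columns,
# discrete/continuous counts, and missingness with DataExplorer
introduce(datos)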
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 1370 80 43 37 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 1370 109600 710968
We now fit a regression tree to SalePrice, including all the variables.
set.seed(123)
split <- sample.split(datos$SalePrice, SplitRatio = 0.7)
train <- subset(datos, split == TRUE)
test <- subset(datos, split == FALSE)
tree <- rpart(SalePrice ~ ., data = train, method = 'anova',
              control = rpart.control(minsplit = 50, cp = 0.0001))
printcp(tree)
##
## Regression tree:
## rpart(formula = SalePrice ~ ., data = train, method = "anova",
## control = rpart.control(minsplit = 50, cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] BedroomAbvGr BsmtFinSF1 BsmtFullBath BsmtUnfSF Fireplaces
## [6] GarageArea GarageType GrLivArea Neighborhood OpenPorchSF
## [11] OverallCond OverallQual SaleType TotalBsmtSF WoodDeckSF
## [16] X1stFlrSF YearBuilt YearRemodAdd
##
## Root node error: 7.45e+12/1051 = 7088443518
##
## n= 1051
##
## CP nsplit rel error xerror xstd
## 1 0.47491499 0 1.00000 1.00161 0.088449
## 2 0.10614012 1 0.52509 0.52767 0.048232
## 3 0.07447327 2 0.41894 0.42188 0.046634
## 4 0.03115704 3 0.34447 0.36190 0.033022
## 5 0.02327429 4 0.31331 0.33568 0.032788
## 6 0.02118925 5 0.29004 0.30742 0.030139
## 7 0.01124349 6 0.26885 0.28797 0.029582
## 8 0.01003729 7 0.25761 0.27962 0.029649
## 9 0.00921973 8 0.24757 0.27326 0.029599
## 10 0.00795091 9 0.23835 0.27014 0.029610
## 11 0.00778853 10 0.23040 0.26634 0.029543
## 12 0.00727191 11 0.22261 0.26471 0.029520
## 13 0.00461859 12 0.21534 0.25238 0.029210
## 14 0.00399566 13 0.21072 0.25375 0.029496
## 15 0.00395063 14 0.20672 0.25084 0.029477
## 16 0.00372239 15 0.20277 0.25141 0.029492
## 17 0.00294424 16 0.19905 0.24818 0.029470
## 18 0.00192409 17 0.19611 0.24503 0.029438
## 19 0.00168560 18 0.19418 0.24318 0.029424
## 20 0.00147635 19 0.19250 0.24215 0.029394
## 21 0.00136776 20 0.19102 0.24238 0.029391
## 22 0.00127960 21 0.18965 0.24066 0.029388
## 23 0.00122897 22 0.18837 0.23991 0.029386
## 24 0.00108783 23 0.18715 0.23942 0.029388
## 25 0.00097237 24 0.18606 0.23772 0.029380
## 26 0.00093992 25 0.18509 0.23741 0.029376
## 27 0.00093871 26 0.18415 0.23729 0.029377
## 28 0.00080569 27 0.18321 0.23671 0.029383
## 29 0.00058327 28 0.18240 0.23592 0.029384
## 30 0.00054965 29 0.18182 0.23539 0.029382
## 31 0.00050335 30 0.18127 0.23534 0.029382
## 32 0.00041654 31 0.18076 0.23520 0.029387
## 33 0.00035842 32 0.18035 0.23467 0.029389
## 34 0.00010000 33 0.17999 0.23391 0.029391
From the above we keep only the variables the tree actually uses, and look at the importance of the variables in this model.
tree <- rpart(SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath + BsmtUnfSF +
Fireplaces + GarageArea + GarageType + GrLivArea + Neighborhood +
OpenPorchSF + OverallCond + OverallQual + SaleType + TotalBsmtSF +
WoodDeckSF + X1stFlrSF + YearBuilt + YearRemodAdd, data = train,
method = 'anova',
control = rpart.control(minsplit = 50, cp = 0.0001))
tibble(cbind(names(tree$variable.importance), comma(tree$variable.importance)))
## # A tibble: 18 x 1
## `cbind(names(tree$variable.importance), comma(tree$variable.i~ [,2]
## <chr> <chr>
## 1 OverallQual 4,908,293,032~
## 2 Neighborhood 2,034,486,543~
## 3 TotalBsmtSF 1,551,045,779~
## 4 GrLivArea 1,313,629,719~
## 5 X1stFlrSF 1,241,547,657~
## 6 GarageArea 1,188,848,207~
## 7 YearBuilt 474,955,490,8~
## 8 BsmtFinSF1 280,268,404,0~
## 9 BedroomAbvGr 234,863,274,2~
## 10 YearRemodAdd 196,982,455,4~
## 11 BsmtUnfSF 191,459,506,7~
## 12 SaleType 112,830,519,3~
## 13 WoodDeckSF 97,421,825,716
## 14 GarageType 70,395,502,873
## 15 Fireplaces 60,818,916,189
## 16 OpenPorchSF 30,822,259,401
## 17 BsmtFullBath 22,418,879,992
## 18 OverallCond 18,339,098,572
We can see that the five most important variables in the tree are: overall material and finish quality (OverallQual), the neighborhood (Neighborhood), total basement area in square feet (TotalBsmtSF), above-ground living area in square feet (GrLivArea), and first-floor area in square feet (X1stFlrSF). In other words, the higher the material quality, basement area, living area, and first-floor area, combined with being in a good neighborhood, the higher the sale price of a house.
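As a quick check, these top entries can be read straight off the fitted object, since rpart stores variable.importance in decreasing order (a one-liner not in the original output):

head(tree$variable.importance, 5)  # top 5 variables by importance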
The model shows a decent, though not excellent, level of accuracy. This is most visible in the \(R^2\) and the relative error: for an anova tree the relative error is \(1 - R^2\), so the cross-validated xerror of about 0.26 at the bottom of the table corresponds to an \(R^2\) of roughly 0.74.
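The complexity-parameter table below appears without its chunk; presumably it is the output of `printcp()` on the refit tree:

printcp(tree)  # presumed call; the chunk was not echoed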
##
## Regression tree:
## rpart(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath +
## BsmtUnfSF + Fireplaces + GarageArea + GarageType + GrLivArea +
## Neighborhood + OpenPorchSF + OverallCond + OverallQual +
## SaleType + TotalBsmtSF + WoodDeckSF + X1stFlrSF + YearBuilt +
## YearRemodAdd, data = train, method = "anova", control = rpart.control(minsplit = 50,
## cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] BedroomAbvGr BsmtFinSF1 BsmtFullBath BsmtUnfSF Fireplaces
## [6] GarageArea GarageType GrLivArea Neighborhood OpenPorchSF
## [11] OverallCond OverallQual SaleType TotalBsmtSF WoodDeckSF
## [16] X1stFlrSF YearBuilt YearRemodAdd
##
## Root node error: 7.45e+12/1051 = 7088443518
##
## n= 1051
##
## CP nsplit rel error xerror xstd
## 1 0.47491499 0 1.00000 1.00090 0.088438
## 2 0.10614012 1 0.52509 0.52857 0.048258
## 3 0.07447327 2 0.41894 0.42266 0.046677
## 4 0.03115704 3 0.34447 0.38675 0.035382
## 5 0.02327429 4 0.31331 0.35427 0.035104
## 6 0.02118925 5 0.29004 0.33240 0.033840
## 7 0.01124349 6 0.26885 0.31318 0.033354
## 8 0.01003729 7 0.25761 0.30578 0.033410
## 9 0.00921973 8 0.24757 0.29685 0.033383
## 10 0.00795091 9 0.23835 0.29217 0.033404
## 11 0.00778853 10 0.23040 0.29099 0.033380
## 12 0.00727191 11 0.22261 0.28514 0.033274
## 13 0.00461859 12 0.21534 0.27665 0.033174
## 14 0.00399566 13 0.21072 0.27419 0.033093
## 15 0.00395063 14 0.20672 0.27291 0.033156
## 16 0.00372239 15 0.20277 0.27220 0.033148
## 17 0.00294424 16 0.19905 0.26870 0.033138
## 18 0.00192409 17 0.19611 0.26522 0.033117
## 19 0.00168560 18 0.19418 0.26399 0.033115
## 20 0.00147635 19 0.19250 0.26411 0.033137
## 21 0.00136776 20 0.19102 0.26432 0.033135
## 22 0.00127960 21 0.18965 0.26427 0.033140
## 23 0.00122897 22 0.18837 0.26411 0.033141
## 24 0.00108783 23 0.18715 0.26375 0.033142
## 25 0.00097237 24 0.18606 0.26261 0.033145
## 26 0.00093992 25 0.18509 0.26240 0.033146
## 27 0.00093871 26 0.18415 0.26240 0.033146
## 28 0.00080569 27 0.18321 0.26187 0.033146
## 29 0.00058327 28 0.18240 0.26108 0.033147
## 30 0.00054965 29 0.18182 0.26056 0.033143
## 31 0.00050335 30 0.18127 0.26027 0.033145
## 32 0.00041654 31 0.18076 0.26088 0.033160
## 33 0.00035842 32 0.18035 0.26078 0.033162
## 34 0.00010000 33 0.17999 0.26008 0.033164
Now for the predictions: we plot the observed sale prices against the predicted ones, coloring each point by the difference between them.
pred <- predict(tree, newdata = test[,-80])
names(pred) <- NULL
dif <- abs(test$SalePrice - pred)
df_1 <- data.frame(Observed = test$SalePrice, Predicted = pred,
                   Difference = dif)
summary(df_1$Difference)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 192 8143 15716 23977 30500 244830
ggplot(df_1, aes(Observed, Predicted, color = Difference)) + geom_point() +
  scale_color_gradient(low = "lavenderblush", high = "red", labels = comma, limits = c(0, 250000)) +
  scale_y_continuous(labels = comma, limits = c(0, 420000)) +
  scale_x_continuous(labels = comma, limits = c(0, 420000)) +
  geom_abline() + theme_minimal()

And, out of curiosity, this is what our tree looks like (barely legible):
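The figure itself is not reproduced here; since rattle is loaded above, the tree was presumably drawn with something like:

fancyRpartPlot(tree)  # presumed plotting call from the rattle package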
Now we fit a random forest to the same training data.
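The fitting chunk is not echoed, but the call appears in the output below; a sketch of the presumed chunk (the seed value is an assumption):

set.seed(123)  # assumed; the original seed for this chunk is not shown
rtree <- randomForest(SalePrice ~ ., data = train)
rtree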
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 913825819
## % Var explained: 87.11
Let's look at the importance of the variables in this model.
imp <- data.frame(Variable = rownames(rtree$importance), rtree$importance)
imp <- arrange(imp, desc(IncNodePurity))
rownames(imp) <- NULL
tibble(cbind(imp$Variable, comma(imp$IncNodePurity)))
## # A tibble: 79 x 1
## `cbind(imp$Variable, comma(imp$IncNodePurity))`[,1] [,2]
## <chr> <chr>
## 1 OverallQual 1,960,711,015,744
## 2 GrLivArea 1,069,094,809,943
## 3 GarageCars 595,351,646,999
## 4 TotalBsmtSF 528,831,258,998
## 5 X1stFlrSF 337,355,520,461
## 6 ExterQual 335,495,230,123
## 7 YearBuilt 266,614,185,772
## 8 GarageArea 226,717,426,909
## 9 X2ndFlrSF 216,642,666,916
## 10 BsmtQual 175,814,521,922
## # ... with 69 more rows
Once again, among the five most important variables we find overall material and finish quality (OverallQual), above-ground living area in square feet (GrLivArea), total basement area in square feet (TotalBsmtSF), and first-floor area in square feet (X1stFlrSF); unlike the regression tree, the random forest ranks garage size in car capacity (GarageCars) as important instead of the neighborhood.
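For a quick visual check of these rankings, randomForest's `varImpPlot()` can plot the same importances (a sketch, not part of the original output):

varImpPlot(rtree, n.var = 10, main = "Variable importance (random forest)")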
Finally, here is the same plot of predicted versus observed prices.
predf <- predict(rtree, newdata = test[,-80])
names(predf) <- NULL
difr <- abs(test$SalePrice - predf)
dfrf <- data.frame(Observed = test$SalePrice, Predicted = predf,
Difference = difr)
summary(dfrf$Difference)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 87.45 4867.77 11038.45 15774.68 19783.85 279045.89
ggplot(dfrf, aes(Observed, Predicted, color = Difference)) + geom_point() +
  scale_color_gradient(low = "lavenderblush", high = "red", labels = comma, limits = c(0, 300000)) +
  scale_y_continuous(labels = comma, limits = c(0, 450000)) +
  scale_x_continuous(labels = comma, limits = c(0, 450000)) +
  geom_abline() + theme_minimal()

We can see that the random forest predicts prices better overall: the absolute difference between the actual and predicted prices is smaller than with the regression tree. The random forest does, however, have one value it predicts worse, which is exactly where the maximum difference occurs.
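To put a number on that claim, one could compare overall error metrics using the prediction vectors already computed (a sketch, not part of the original analysis):

# RMSE and MAE on the test set for both models
metrics <- data.frame(
  Model = c("Regression tree", "Random forest"),
  RMSE = c(sqrt(mean((test$SalePrice - pred)^2)),
           sqrt(mean((test$SalePrice - predf)^2))),
  MAE = c(mean(abs(test$SalePrice - pred)),
          mean(abs(test$SalePrice - predf))))
metrics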
This is even clearer in the following boxplot.
# Label each set of differences by the model that produced it
df <- data.frame(Difference = c(df_1$Difference, dfrf$Difference),
                 Type = c(rep("Tree", nrow(df_1)), rep("Random Forest", nrow(dfrf))))
ggplot(df, aes(x = Type, y = Difference, fill = Type)) + geom_boxplot() +
  geom_jitter(width = 0.05, alpha = 0.1) + theme_minimal() +
  scale_y_continuous(labels = comma, limits = c(0, 300000))