# Arboles de regresion de prediccion de casas
# install.packages("rpart", "rpart.plot", "caret")
library(rpart) # Arboles
library(rpart.plot) # Visualizar y represenar árboles
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(caret) # Para llevar a cabo particiones de conjuntos de datos en caso de...
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr) # Para select, filter, mutate, arange ....
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr) # Para leer datos
library(ggplot2) # Para grafica mas vistosas
library(reshape) # Para renombrar columnas
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
datos <- read_csv("C:/Users/david/Desktop/AID/DATOS/melb_data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## CouncilArea = col_character(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
head(datos)
## # A tibble: 6 x 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postcode
## <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 Abbot~ 85 Tur~ 2 h 1.48e6 S Biggin 3/12~ 2.5 3067
## 2 Abbot~ 25 Blo~ 2 h 1.03e6 S Biggin 4/02~ 2.5 3067
## 3 Abbot~ 5 Char~ 3 h 1.46e6 SP Biggin 4/03~ 2.5 3067
## 4 Abbot~ 40 Fed~ 3 h 8.50e5 PI Biggin 4/03~ 2.5 3067
## 5 Abbot~ 55a Pa~ 4 h 1.60e6 VB Nelson 4/06~ 2.5 3067
## 6 Abbot~ 129 Ch~ 2 h 9.41e5 S Jellis 7/05~ 2.5 3067
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## # Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## # Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
tail(datos)
## # A tibble: 6 x 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postcode
## <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 Westm~ 9 Blac~ 3 h 5.82e5 S Red 26/0~ 16.5 3049
## 2 Wheel~ 12 Str~ 4 h 1.25e6 S Barry 26/0~ 16.7 3150
## 3 Willi~ 77 Mer~ 3 h 1.03e6 SP Willia~ 26/0~ 6.8 3016
## 4 Willi~ 83 Pow~ 3 h 1.17e6 S Raine 26/0~ 6.8 3016
## 5 Willi~ 96 Ver~ 4 h 2.50e6 PI Sweeney 26/0~ 6.8 3016
## 6 Yarra~ 6 Agne~ 4 h 1.28e6 SP Village 26/0~ 6.3 3013
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## # Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## # Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
datos.Num <- select(datos, Price, Rooms, Distance, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, Propertycount)
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of 10 variables:
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Rooms : num 2 2 3 3 4 2 3 2 1 2 ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
## - attr(*, "spec")=
## .. cols(
## .. Suburb = col_character(),
## .. Address = col_character(),
## .. Rooms = col_double(),
## .. Type = col_character(),
## .. Price = col_double(),
## .. Method = col_character(),
## .. SellerG = col_character(),
## .. Date = col_character(),
## .. Distance = col_double(),
## .. Postcode = col_double(),
## .. Bedroom2 = col_double(),
## .. Bathroom = col_double(),
## .. Car = col_double(),
## .. Landsize = col_double(),
## .. BuildingArea = col_double(),
## .. YearBuilt = col_double(),
## .. CouncilArea = col_character(),
## .. Lattitude = col_double(),
## .. Longtitude = col_double(),
## .. Regionname = col_character(),
## .. Propertycount = col_double()
## .. )
mediana.BA <- median(datos.Num$BuildingArea, na.rm = TRUE) # summary(datos.Num$BuildingArea)[3], como otra alternativa
mediana.YB <- median(datos.Num$YearBuilt, na.rm = TRUE) # summary(datos.Num$YearBuilt)[3], , como otra alternativa
mediana.C <- median(datos.Num$Car, na.rm = TRUE) # summary(datos.Num$Car)[3], , como otra alternativa
####Actualizar mutate() los NA por la medianas *Las vaiables que tienen NAs
head(datos.Num, 10) # Los primeros 10, se observan NAs
## # A tibble: 10 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 NA NA
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 NA NA
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 NA NA
## 7 1.88e6 3 2.5 4 2 0 245 210 1910
## 8 1.64e6 2 2.5 2 1 2 256 107 1890
## 9 3.00e5 1 2.5 1 1 1 0 NA NA
## 10 1.10e6 2 2.5 3 1 2 220 75 1900
## # ... with 1 more variable: Propertycount <dbl>
datos.Num<- datos.Num %>%
mutate (BuildingArea = ifelse(is.na(BuildingArea), mediana.BA, BuildingArea))
datos.Num <- datos.Num %>%
mutate (YearBuilt = ifelse(is.na(YearBuilt), mediana.YB, YearBuilt))
datos.Num <- datos.Num %>%
mutate (Car = ifelse(is.na(Car), mediana.C, Car))
head(datos.Num, 10) # # Los primeros 10, YA NO se observan NAs
## # A tibble: 10 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 126 1970
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 126 1970
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 126 1970
## 7 1.88e6 3 2.5 4 2 0 245 210 1910
## 8 1.64e6 2 2.5 2 1 2 256 107 1890
## 9 3.00e5 1 2.5 1 1 1 0 126 1970
## 10 1.10e6 2 2.5 3 1 2 220 75 1900
## # ... with 1 more variable: Propertycount <dbl>
set.seed(2020) # Semilla
entrena <- createDataPartition(datos.Num$Price, p=0.8, list = FALSE)
head(entrena)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 5
## [5,] 8
## [6,] 9
nrow(entrena)
## [1] 10865
# Los registros que no estén en entrena serán los de validación
head(datos.Num[-entrena,])
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 126 1970
## 3 1.88e6 3 2.5 4 2 0 245 210 1910
## 4 1.20e6 3 2.5 3 2 1 113 110 1880
## 5 1.33e6 4 2.5 4 2 2 780 135 1900
## 6 9.00e5 3 2.5 3 2 2 0 126 2010
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrena,])
## [1] 2715
# Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 126 1970
## 2 1.03e6 2 2.5 2 1 0 156 79 1900
## 3 1.46e6 3 2.5 3 2 0 134 150 1900
## 4 8.50e5 3 2.5 3 2 1 94 126 1970
## 5 1.60e6 4 2.5 3 1 2 120 142 2014
## 6 9.41e5 2 2.5 2 1 0 181 126 1970
## # ... with 1 more variable: Propertycount <dbl>
# Ahora a determinar conjuntos de datos de entrenamiento y luego head()
datos.Entrena <- datos.Num[entrena,]
head(datos.Entrena)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.48e6 2 2.5 2 1 1 202 126 1970
## 2 1.46e6 3 2.5 3 2 0 134 150 1900
## 3 8.50e5 3 2.5 3 2 1 94 126 1970
## 4 1.60e6 4 2.5 3 1 2 120 142 2014
## 5 1.64e6 2 2.5 2 1 2 256 107 1890
## 6 3.00e5 1 2.5 1 1 1 0 126 1970
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrena)
## Price Rooms Distance Bedroom2
## Min. : 85000 Min. : 1.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 650000 1st Qu.: 2.000 1st Qu.: 6.10 1st Qu.: 2.000
## Median : 903000 Median : 3.000 Median : 9.20 Median : 3.000
## Mean :1074197 Mean : 2.937 Mean :10.12 Mean : 2.911
## 3rd Qu.:1330000 3rd Qu.: 3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :9000000 Max. :10.000 Max. :48.10 Max. :10.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 178.0 1st Qu.: 122.0
## Median :1.000 Median : 2.000 Median : 438.0 Median : 126.0
## Mean :1.531 Mean : 1.606 Mean : 559.2 Mean : 140.6
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 650.0 3rd Qu.: 130.0
## Max. :8.000 Max. :10.000 Max. :433014.0 Max. :44515.0
## YearBuilt Propertycount
## Min. :1196 Min. : 249
## 1st Qu.:1960 1st Qu.: 4380
## Median :1970 Median : 6567
## Mean :1967 Mean : 7451
## 3rd Qu.:1975 3rd Qu.:10331
## Max. :2018 Max. :21650
# y conjunto de datos de validación y luego head()
datos.Valida <- datos.Num[-entrena,]
head(datos.Valida)
## # A tibble: 6 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 126 1970
## 3 1.88e6 3 2.5 4 2 0 245 210 1910
## 4 1.20e6 3 2.5 3 2 1 113 110 1880
## 5 1.33e6 4 2.5 4 2 2 780 135 1900
## 6 9.00e5 3 2.5 3 2 2 0 126 2010
## # ... with 1 more variable: Propertycount <dbl>
modelo <- lm(Price ~ ., datos.Entrena)
modelo
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Coefficients:
## (Intercept) Rooms Distance Bedroom2 Bathroom
## 9.593e+06 2.161e+05 -3.139e+04 3.086e+04 2.516e+05
## Car Landsize BuildingArea YearBuilt Propertycount
## 6.198e+04 3.212e+00 3.283e+01 -4.783e+03 -1.371e+00
summary(modelo)
##
## Call:
## lm(formula = Price ~ ., data = datos.Entrena)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3455726 -275944 -81270 189356 8320281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.593e+06 3.224e+05 29.760 < 2e-16 ***
## Rooms 2.161e+05 1.626e+04 13.289 < 2e-16 ***
## Distance -3.139e+04 8.601e+02 -36.499 < 2e-16 ***
## Bedroom2 3.086e+04 1.621e+04 1.904 0.05694 .
## Bathroom 2.516e+05 8.501e+03 29.591 < 2e-16 ***
## Car 6.198e+04 5.343e+03 11.601 < 2e-16 ***
## Landsize 3.212e+00 1.054e+00 3.046 0.00232 **
## BuildingArea 3.283e+01 1.059e+01 3.101 0.00194 **
## YearBuilt -4.783e+03 1.641e+02 -29.148 < 2e-16 ***
## Propertycount -1.371e+00 1.052e+00 -1.304 0.19233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 477500 on 10855 degrees of freedom
## Multiple R-squared: 0.4369, Adjusted R-squared: 0.4365
## F-statistic: 935.9 on 9 and 10855 DF, p-value: < 2.2e-16
set.seed(2020) # Semilla
arbol <- rpart(formula = Price ~ ., data = datos.Entrena)
arbol
## n= 10865
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 10865 4.395022e+15 1074197.0
## 2) Rooms< 3.5 8171 1.906637e+15 925833.9
## 4) Rooms< 2.5 3478 4.325219e+14 721699.6
## 8) Landsize< 85.5 1401 6.828120e+13 556731.4 *
## 9) Landsize>=85.5 2077 3.003950e+14 832975.7 *
## 5) Rooms>=2.5 4693 1.221775e+15 1077119.0
## 10) Distance>=10.35 2448 4.399691e+14 895027.9 *
## 11) Distance< 10.35 2245 6.121293e+14 1275674.0 *
## 3) Rooms>=3.5 2694 1.763011e+15 1524190.0
## 6) Distance>=11.45 1239 3.202846e+14 1149013.0 *
## 7) Distance< 11.45 1455 1.119820e+15 1843670.0
## 14) Landsize< 708.5 1085 6.003640e+14 1658835.0
## 28) BuildingArea< 269 961 4.172618e+14 1571550.0 *
## 29) BuildingArea>=269 124 1.190380e+14 2335297.0 *
## 15) Landsize>=708.5 370 3.736890e+14 2385685.0
## 30) Bathroom< 3.5 334 2.722381e+14 2258367.0 *
## 31) Bathroom>=3.5 36 4.580559e+13 3566917.0 *
prp(arbol, type = 2, nn = TRUE,
fallen.leaves = TRUE, faclen = 4,
varlen = 8, shadow.col = "gray")
#### Ver las importancia de las variables en el modelo ctable * LA tabla significa resultados comprensibles de los árboles con diferentes números de nodos, el promedio y la desviación STd para cada uno de los árboles con tamaño especificaco * CP Factor de complejidad el árbol * Número de divisiones en el mejor árbol * El error relativo * El XError otro error * STD La desviació estándard
arbol$cptable
## CP nsplit rel error xerror xstd
## 1 0.16504448 0 1.0000000 1.0001594 0.03302636
## 2 0.07347106 1 0.8349555 0.8354398 0.02881189
## 3 0.05741496 2 0.7614845 0.7620931 0.02663619
## 4 0.03860646 3 0.7040695 0.7047829 0.02585386
## 5 0.03316630 4 0.6654630 0.6701232 0.02603533
## 6 0.01457653 5 0.6322967 0.6526007 0.02531599
## 7 0.01452682 6 0.6177202 0.6335403 0.02480111
## 8 0.01266097 7 0.6031934 0.6241236 0.02475134
## 9 0.01000000 8 0.5905324 0.6080163 0.02403389
plotcp(arbol)
buen modelo
arbol.Recortado <- prune(arbol, cp = 0.01417645)
prp(arbol.Recortado, type = 2, nn = TRUE,
fallen.leaves = TRUE, faclen = 4,
varlen = 8, shadow.col = "gray")
summary(datos.Valida)
## Price Rooms Distance Bedroom2
## Min. : 170000 Min. :1.000 Min. : 0.70 Min. : 0.000
## 1st Qu.: 648000 1st Qu.:2.000 1st Qu.: 6.25 1st Qu.: 2.000
## Median : 903000 Median :3.000 Median : 9.20 Median : 3.000
## Mean :1081633 Mean :2.943 Mean :10.19 Mean : 2.928
## 3rd Qu.:1329000 3rd Qu.:3.000 3rd Qu.:13.00 3rd Qu.: 3.000
## Max. :8000000 Max. :8.000 Max. :45.90 Max. :20.000
## Bathroom Car Landsize BuildingArea
## Min. :0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 169.5 1st Qu.:123.0
## Median :1.000 Median : 2.000 Median : 448.0 Median :126.0
## Mean :1.547 Mean : 1.636 Mean : 555.4 Mean :135.7
## 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.: 652.5 3rd Qu.:128.0
## Max. :8.000 Max. :10.000 Max. :76000.0 Max. :719.0
## YearBuilt Propertycount
## Min. :1850 Min. : 389
## 1st Qu.:1960 1st Qu.: 4385
## Median :1970 Median : 6543
## Mean :1967 Mean : 7470
## 3rd Qu.:1970 3rd Qu.:10331
## Max. :2017 Max. :21650
prediccion.price <- predict(arbol, newdata = datos.Valida
)
# La predicción para la casa 1
datos.Valida[1,]
## # A tibble: 1 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## # ... with 1 more variable: Propertycount <dbl>
prediccion.price[1]
## 1
## 832975.7
nuevo.Dato <- data.frame( "Price" = c(0,0,0,0,0)
,"Rooms"= c(3,2,4,5,2)
, "Distance" = c(7,6,8,9,5)
, "Bedroom2" = c(3,4,2,3,2)
, "Bathroom" = c(2,1,3,4,3)
, "Car"= c(3,2,3,4,5)
, "Landsize"= c(400,450,460,480,500)
, "BuildingArea"= c(120,130,150,180,190)
, "YearBuilt"= c(1930,1940,1950,1960,1970)
,"Propertycount"= c(5000,5400,5600,5800,6000)
)
nuevo.Dato
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 0 3 7 3 2 3 400 120 1930
## 2 0 2 6 4 1 2 450 130 1940
## 3 0 4 8 2 3 3 460 150 1950
## 4 0 5 9 3 4 4 480 180 1960
## 5 0 2 5 2 3 5 500 190 1970
## Propertycount
## 1 5000
## 2 5400
## 3 5600
## 4 5800
## 5 6000
prediccion.price <- predict(arbol, newdata = nuevo.Dato
)
prediccion1.price <- predict(modelo, newdata = nuevo.Dato
)
# La predicción para la casa 1
datos.Valida[]
## # A tibble: 2,715 x 10
## Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.03e6 2 2.5 2 1 0 156 79 1900
## 2 9.41e5 2 2.5 2 1 0 181 126 1970
## 3 1.88e6 3 2.5 4 2 0 245 210 1910
## 4 1.20e6 3 2.5 3 2 1 113 110 1880
## 5 1.33e6 4 2.5 4 2 2 780 135 1900
## 6 9.00e5 3 2.5 3 2 2 0 126 2010
## 7 1.32e6 2 2.5 2 1 0 147 85 1900
## 8 1.18e6 2 2.5 2 1 0 162 91 1900
## 9 1.03e6 2 2.5 2 1 1 172 81 1930
## 10 4.40e5 2 13.5 2 1 1 192 126 1970
## # ... with 2,705 more rows, and 1 more variable: Propertycount <dbl>
prediccion.price[]
## 1 2 3 4 5
## 1275674.4 832975.7 1571549.7 1571549.7 832975.7
prediccion1.price[]
## 1 2 3 4 5
## 1571030 1055750 1881143 2363205 1572299