Las librerías

library(rpart)
library(readr)
library(rpart.plot)
library(dplyr)
library(ggplot2)
library(reshape)
library(caret)  # Para dividir conjunto de datos

Los datos

# Set `ruta` to the directory that contains melb_data.csv.
ruta <- "D:/inspiron 1545/ESCUELA/ITD/OCTAVO/Analisis Inteligente de Datos/R/datos"
# Read the file through its full path instead of calling setwd(), so loading
# the data does not change the session's working directory as a side effect.
datos <- read_csv(file.path(ruta, "melb_data.csv"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   CouncilArea = col_character(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
head(datos)
## # A tibble: 6 x 21
##   Suburb Address Rooms Type   Price Method SellerG Date  Distance Postcode
##   <chr>  <chr>   <dbl> <chr>  <dbl> <chr>  <chr>   <chr>    <dbl>    <dbl>
## 1 Abbot~ 85 Tur~     2 h     1.48e6 S      Biggin  3/12~      2.5     3067
## 2 Abbot~ 25 Blo~     2 h     1.03e6 S      Biggin  4/02~      2.5     3067
## 3 Abbot~ 5 Char~     3 h     1.46e6 SP     Biggin  4/03~      2.5     3067
## 4 Abbot~ 40 Fed~     3 h     8.50e5 PI     Biggin  4/03~      2.5     3067
## 5 Abbot~ 55a Pa~     4 h     1.60e6 VB     Nelson  4/06~      2.5     3067
## 6 Abbot~ 129 Ch~     2 h     9.41e5 S      Jellis  7/05~      2.5     3067
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## #   Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## #   Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>
tail(datos)
## # A tibble: 6 x 21
##   Suburb Address Rooms Type   Price Method SellerG Date  Distance Postcode
##   <chr>  <chr>   <dbl> <chr>  <dbl> <chr>  <chr>   <chr>    <dbl>    <dbl>
## 1 Westm~ 9 Blac~     3 h     5.82e5 S      Red     26/0~     16.5     3049
## 2 Wheel~ 12 Str~     4 h     1.25e6 S      Barry   26/0~     16.7     3150
## 3 Willi~ 77 Mer~     3 h     1.03e6 SP     Willia~ 26/0~      6.8     3016
## 4 Willi~ 83 Pow~     3 h     1.17e6 S      Raine   26/0~      6.8     3016
## 5 Willi~ 96 Ver~     4 h     2.50e6 PI     Sweeney 26/0~      6.8     3016
## 6 Yarra~ 6 Agne~     4 h     1.28e6 SP     Village 26/0~      6.3     3013
## # ... with 11 more variables: Bedroom2 <dbl>, Bathroom <dbl>, Car <dbl>,
## #   Landsize <dbl>, BuildingArea <dbl>, YearBuilt <dbl>, CouncilArea <chr>,
## #   Lattitude <dbl>, Longtitude <dbl>, Regionname <chr>, Propertycount <dbl>

Explorar los conjuntos de datos

summary(datos)
##     Suburb            Address              Rooms            Type          
##  Length:13580       Length:13580       Min.   : 1.000   Length:13580      
##  Class :character   Class :character   1st Qu.: 2.000   Class :character  
##  Mode  :character   Mode  :character   Median : 3.000   Mode  :character  
##                                        Mean   : 2.938                     
##                                        3rd Qu.: 3.000                     
##                                        Max.   :10.000                     
##                                                                           
##      Price            Method            SellerG              Date          
##  Min.   :  85000   Length:13580       Length:13580       Length:13580      
##  1st Qu.: 650000   Class :character   Class :character   Class :character  
##  Median : 903000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1075684                                                           
##  3rd Qu.:1330000                                                           
##  Max.   :9000000                                                           
##                                                                            
##     Distance        Postcode       Bedroom2         Bathroom    
##  Min.   : 0.00   Min.   :3000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 6.10   1st Qu.:3044   1st Qu.: 2.000   1st Qu.:1.000  
##  Median : 9.20   Median :3084   Median : 3.000   Median :1.000  
##  Mean   :10.14   Mean   :3105   Mean   : 2.915   Mean   :1.534  
##  3rd Qu.:13.00   3rd Qu.:3148   3rd Qu.: 3.000   3rd Qu.:2.000  
##  Max.   :48.10   Max.   :3977   Max.   :20.000   Max.   :8.000  
##                                                                 
##       Car           Landsize         BuildingArea     YearBuilt   
##  Min.   : 0.00   Min.   :     0.0   Min.   :    0   Min.   :1196  
##  1st Qu.: 1.00   1st Qu.:   177.0   1st Qu.:   93   1st Qu.:1940  
##  Median : 2.00   Median :   440.0   Median :  126   Median :1970  
##  Mean   : 1.61   Mean   :   558.4   Mean   :  152   Mean   :1965  
##  3rd Qu.: 2.00   3rd Qu.:   651.0   3rd Qu.:  174   3rd Qu.:1999  
##  Max.   :10.00   Max.   :433014.0   Max.   :44515   Max.   :2018  
##  NA's   :62                         NA's   :6450    NA's   :5375  
##  CouncilArea          Lattitude        Longtitude     Regionname       
##  Length:13580       Min.   :-38.18   Min.   :144.4   Length:13580      
##  Class :character   1st Qu.:-37.86   1st Qu.:144.9   Class :character  
##  Mode  :character   Median :-37.80   Median :145.0   Mode  :character  
##                     Mean   :-37.81   Mean   :145.0                     
##                     3rd Qu.:-37.76   3rd Qu.:145.1                     
##                     Max.   :-37.41   Max.   :145.5                     
##                                                                        
##  Propertycount  
##  Min.   :  249  
##  1st Qu.: 4380  
##  Median : 6555  
##  Mean   : 7454  
##  3rd Qu.:10331  
##  Max.   :21650  
## 
str(datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of  21 variables:
##  $ Suburb       : chr  "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
##  $ Address      : chr  "85 Turner St" "25 Bloomburg St" "5 Charles St" "40 Federation La" ...
##  $ Rooms        : num  2 2 3 3 4 2 3 2 1 2 ...
##  $ Type         : chr  "h" "h" "h" "h" ...
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Method       : chr  "S" "S" "SP" "PI" ...
##  $ SellerG      : chr  "Biggin" "Biggin" "Biggin" "Biggin" ...
##  $ Date         : chr  "3/12/2016" "4/02/2016" "4/03/2017" "4/03/2017" ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Postcode     : num  3067 3067 3067 3067 3067 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ CouncilArea  : chr  "Yarra" "Yarra" "Yarra" "Yarra" ...
##  $ Lattitude    : num  -37.8 -37.8 -37.8 -37.8 -37.8 ...
##  $ Longtitude   : num  145 145 145 145 145 ...
##  $ Regionname   : chr  "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Suburb = col_character(),
##   ..   Address = col_character(),
##   ..   Rooms = col_double(),
##   ..   Type = col_character(),
##   ..   Price = col_double(),
##   ..   Method = col_character(),
##   ..   SellerG = col_character(),
##   ..   Date = col_character(),
##   ..   Distance = col_double(),
##   ..   Postcode = col_double(),
##   ..   Bedroom2 = col_double(),
##   ..   Bathroom = col_double(),
##   ..   Car = col_double(),
##   ..   Landsize = col_double(),
##   ..   BuildingArea = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   CouncilArea = col_character(),
##   ..   Lattitude = col_double(),
##   ..   Longtitude = col_double(),
##   ..   Regionname = col_character(),
##   ..   Propertycount = col_double()
##   .. )

Elegimos sólo las variables numéricas

# Keep the response (Price) plus nine numeric predictors. Other numeric
# columns of `datos` (Postcode, Lattitude, Longtitude) are left out.
datos.Num <- datos %>%
  select(
    Price, Rooms, Distance, Bedroom2, Bathroom,
    Car, Landsize, BuildingArea, YearBuilt, Propertycount
  )
# Preview the first rows of the reduced table.
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202           NA        NA
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94           NA        NA
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181           NA        NA
## # ... with 1 more variable: Propertycount <dbl>
str(datos.Num)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13580 obs. of  10 variables:
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Rooms        : num  2 2 3 3 4 2 3 2 1 2 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Suburb = col_character(),
##   ..   Address = col_character(),
##   ..   Rooms = col_double(),
##   ..   Type = col_character(),
##   ..   Price = col_double(),
##   ..   Method = col_character(),
##   ..   SellerG = col_character(),
##   ..   Date = col_character(),
##   ..   Distance = col_double(),
##   ..   Postcode = col_double(),
##   ..   Bedroom2 = col_double(),
##   ..   Bathroom = col_double(),
##   ..   Car = col_double(),
##   ..   Landsize = col_double(),
##   ..   BuildingArea = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   CouncilArea = col_character(),
##   ..   Lattitude = col_double(),
##   ..   Longtitude = col_double(),
##   ..   Regionname = col_character(),
##   ..   Propertycount = col_double()
##   .. )

Depurar y Limpiar los datos

# Medians of the three columns that contain NAs (BuildingArea, YearBuilt and
# Car in the summary above); na.rm = TRUE ignores the missing entries.
# summary(datos.Num$<column>)[3] is an alternative way to obtain each median.
mediana.BA <- median(datos.Num[["BuildingArea"]], na.rm = TRUE)
mediana.YB <- median(datos.Num[["YearBuilt"]], na.rm = TRUE)
mediana.C <- median(datos.Num[["Car"]], na.rm = TRUE)

Actualizar con mutate() los NA por las medianas

Las variables que tienen NAs

head(datos.Num, 10) #Los primeros 10, se observan NAs
## # A tibble: 10 x 10
##     Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##     <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
##  1 1.48e6     2      2.5        2        1     1      202           NA        NA
##  2 1.03e6     2      2.5        2        1     0      156           79      1900
##  3 1.46e6     3      2.5        3        2     0      134          150      1900
##  4 8.50e5     3      2.5        3        2     1       94           NA        NA
##  5 1.60e6     4      2.5        3        1     2      120          142      2014
##  6 9.41e5     2      2.5        2        1     0      181           NA        NA
##  7 1.88e6     3      2.5        4        2     0      245          210      1910
##  8 1.64e6     2      2.5        2        1     2      256          107      1890
##  9 3.00e5     1      2.5        1        1     1        0           NA        NA
## 10 1.10e6     2      2.5        3        1     2      220           75      1900
## # ... with 1 more variable: Propertycount <dbl>
# Replace each NA in BuildingArea, YearBuilt and Car with that column's
# median (mediana.BA, mediana.YB, mediana.C); non-missing values are kept.
# NOTE(review): the medians are taken from the full table, before the
# train/validation split is made.
datos.Num <- datos.Num %>%
  mutate(
    BuildingArea = ifelse(is.na(BuildingArea), mediana.BA, BuildingArea),
    YearBuilt = ifelse(is.na(YearBuilt), mediana.YB, YearBuilt),
    Car = ifelse(is.na(Car), mediana.C, Car)
  )

# Same ten rows as before the imputation, now without NAs.
head(datos.Num, 10)
## # A tibble: 10 x 10
##     Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##     <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
##  1 1.48e6     2      2.5        2        1     1      202          126      1970
##  2 1.03e6     2      2.5        2        1     0      156           79      1900
##  3 1.46e6     3      2.5        3        2     0      134          150      1900
##  4 8.50e5     3      2.5        3        2     1       94          126      1970
##  5 1.60e6     4      2.5        3        1     2      120          142      2014
##  6 9.41e5     2      2.5        2        1     0      181          126      1970
##  7 1.88e6     3      2.5        4        2     0      245          210      1910
##  8 1.64e6     2      2.5        2        1     2      256          107      1890
##  9 3.00e5     1      2.5        1        1     1        0          126      1970
## 10 1.10e6     2      2.5        3        1     2      220           75      1900
## # ... with 1 more variable: Propertycount <dbl>

Crear conjuntos de entrenamiento y conjuntos de validación

# Fix the random seed so the split below is reproducible.
set.seed(2020)
# Draw 70% of the rows for training, partitioning on Price. With
# list = FALSE the row indices come back as a one-column matrix.
entrena <- createDataPartition(
  y = datos.Num[["Price"]],
  p = 0.7,
  list = FALSE
)
# Preview the first selected row indices.
head(entrena)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
nrow(entrena)
## [1] 9508
#Los registros que no estén en entrena serán los de validación
head(datos.Num[-entrena,])
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181          126      1970
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
nrow(datos.Num[-entrena,])
## [1] 4072
#Ver los primeros seis datos con sólo variables numéricas
head(datos.Num)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202          126      1970
## 2 1.03e6     2      2.5        2        1     0      156           79      1900
## 3 1.46e6     3      2.5        3        2     0      134          150      1900
## 4 8.50e5     3      2.5        3        2     1       94          126      1970
## 5 1.60e6     4      2.5        3        1     2      120          142      2014
## 6 9.41e5     2      2.5        2        1     0      181          126      1970
## # ... with 1 more variable: Propertycount <dbl>
# Training set: the rows of datos.Num whose indices are in `entrena`;
# head() then previews its first rows.
datos.Entrena <- datos.Num[entrena,]
head(datos.Entrena)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.48e6     2      2.5        2        1     1      202          126      1970
## 2 1.46e6     3      2.5        3        2     0      134          150      1900
## 3 8.50e5     3      2.5        3        2     1       94          126      1970
## 4 1.60e6     4      2.5        3        1     2      120          142      2014
## 5 1.88e6     3      2.5        4        2     0      245          210      1910
## 6 3.00e5     1      2.5        1        1     1        0          126      1970
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Entrena)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1078063   Mean   : 2.937   Mean   :10.13   Mean   : 2.911  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :47.40   Max.   :10.000  
##     Bathroom          Car            Landsize         BuildingArea   
##  Min.   :0.000   Min.   : 0.000   Min.   :     0.0   Min.   :   0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:   178.0   1st Qu.: 123.0  
##  Median :1.000   Median : 2.000   Median :   443.5   Median : 126.0  
##  Mean   :1.529   Mean   : 1.613   Mean   :   579.6   Mean   : 136.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:   650.0   3rd Qu.: 129.0  
##  Max.   :8.000   Max.   :10.000   Max.   :433014.0   Max.   :6791.0  
##    YearBuilt    Propertycount  
##  Min.   :1830   Min.   :  389  
##  1st Qu.:1960   1st Qu.: 4386  
##  Median :1970   Median : 6567  
##  Mean   :1967   Mean   : 7453  
##  3rd Qu.:1975   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650
# Validation set: every row of datos.Num whose index is NOT in `entrena`
# (negative indexing drops the training rows); head() previews its first rows.
datos.Valida <- datos.Num[-entrena,]
head(datos.Valida)
## # A tibble: 6 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## 2 9.41e5     2      2.5        2        1     0      181          126      1970
## 3 1.64e6     2      2.5        2        1     2      256          107      1890
## 4 1.35e6     3      2.5        3        2     2      214          190      2005
## 5 7.50e5     2      2.5        2        2     1        0           94      2009
## 6 8.90e5     2      2.5        2        1     1      150           73      1985
## # ... with 1 more variable: Propertycount <dbl>
summary(datos.Valida)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:  120.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  146.2  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  130.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650

Construir el MODELO árbol

# Seed the RNG before fitting; presumably so the cross-validated columns of
# the CP table (xerror, xstd) are reproducible between runs.
set.seed(2020)

# Fit a tree for Price on the training set, using all remaining columns as
# predictors, then print its text representation.
arbol <- rpart(Price ~ ., data = datos.Entrena)
print(arbol)
## n= 9508 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 9508 3.905176e+15 1078063.0  
##    2) Rooms< 3.5 7161 1.741856e+15  931007.8  
##      4) Rooms< 2.5 3023 3.911722e+14  724213.1  
##        8) Landsize< 85.5 1214 6.222679e+13  559018.9 *
##        9) Landsize>=85.5 1809 2.735839e+14  835073.1 *
##      5) Rooms>=2.5 4138 1.126966e+15 1082081.0  
##       10) Distance>=11.9 1576 2.382967e+14  832810.4 *
##       11) Distance< 11.9 2562 7.305050e+14 1235418.0  
##         22) BuildingArea< 156.5 2230 5.283540e+14 1181265.0 *
##         23) BuildingArea>=156.5 332 1.516845e+14 1599162.0 *
##    3) Rooms>=3.5 2347 1.535971e+15 1526746.0  
##      6) Distance>=11.45 1075 2.619142e+14 1136763.0 *
##      7) Distance< 11.45 1272 9.723911e+14 1856331.0  
##       14) BuildingArea< 246.5 1025 5.968351e+14 1695057.0  
##         28) Landsize< 708.5 803 3.644950e+14 1566809.0 *
##         29) Landsize>=708.5 222 1.713594e+14 2158948.0 *
##       15) BuildingArea>=246.5 247 2.382655e+14 2525584.0 *

Visualizar el árbol de regresión

# Draw the full (unpruned) tree. nn = TRUE labels each node with its node
# number, which allows matching the plot against the printed tree.
prp(
  arbol,
  type = 2,
  nn = TRUE,
  fallen.leaves = TRUE,
  faclen = 4,
  varlen = 8,
  shadow.col = "gray"
)

Ver la tabla de parámetros de complejidad (CP) del modelo

arbol$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.16064529      0 1.0000000 1.0001908 0.03335304
## 2 0.07724771      1 0.8393547 0.8397732 0.02940088
## 3 0.05728747      2 0.7621070 0.7627294 0.02731604
## 4 0.04050130      3 0.7048195 0.7055469 0.02633946
## 5 0.03515605      4 0.6643182 0.6669834 0.02651936
## 6 0.01561538      5 0.6291622 0.6395277 0.02549443
## 7 0.01417645      6 0.6135468 0.6211866 0.02488627
## 8 0.01292296      7 0.5993704 0.6079174 0.02482012
## 9 0.01000000      8 0.5864474 0.5990287 0.02442405

Podar el árbol prune()

plotcp(arbol)

#### El punto siete (fila 7 de la tabla de CP) servirá para obtener un buen modelo

Podar el árbol

# Prune at the complexity parameter stored in row 7 of the CP table
# (nsplit = 6). The value is read from arbol$cptable rather than typed in:
# the printed 0.01417645 is rounded, while prune() compares cp against the
# unrounded per-node complexities, so a hand-copied literal can fall on the
# wrong side of the threshold and keep or drop one split too many.
arbol.Recortado <- prune(arbol, cp = arbol$cptable[7, "CP"])

# Draw the pruned tree with the same plot settings used for the full tree.
prp(
  arbol.Recortado,
  type = 2,
  nn = TRUE,
  fallen.leaves = TRUE,
  faclen = 4,
  varlen = 8,
  shadow.col = "gray"
)

Predicciones con el conjunto de datos de validación

summary(datos.Valida)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:  120.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  146.2  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  130.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650
# Predict prices for the validation set with the unpruned tree. The argument
# must be spelled `newdata`: the previous `newdate` matched no parameter of
# predict(), was silently absorbed by `...`, and the call then returned the
# fitted values of the training rows instead of validation predictions.
prediccion.price <- predict(arbol, newdata = datos.Valida)
# First validation house; its predicted price is printed right after.
datos.Valida[1,]
## # A tibble: 1 x 10
##    Price Rooms Distance Bedroom2 Bathroom   Car Landsize BuildingArea YearBuilt
##    <dbl> <dbl>    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>     <dbl>
## 1 1.03e6     2      2.5        2        1     0      156           79      1900
## # ... with 1 more variable: Propertycount <dbl>
prediccion.price[1]
##        1 
## 835073.1

¿Una predicción con nuevos datos?

# One hypothetical house to score with the fitted tree. The columns are
# passed as named arguments, so no stray global variables called Price,
# Rooms, ... are left in the workspace, and the column names need no
# separate colnames() call. Price is only a placeholder value (0).
nuevo.Dato <- data.frame(
  Price = 0,
  Rooms = 3,
  Distance = 7,
  Bedroom2 = 3,
  Bathroom = 2,
  Car = 3,
  Landsize = 400,
  BuildingArea = 120,
  YearBuilt = 1930,
  Propertycount = 5000
)

nuevo.Dato
##   Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1     0     3        7        3        2   3      400          120      1930
##   Propertycount
## 1          5000
# Predict the price of the hypothetical house in nuevo.Dato with the
# unpruned tree. The result gets its own name so it does not overwrite the
# validation predictions held in prediccion.price (the earlier name
# `prediccion.rpice` was a typo and the value was never displayed).
prediccion.nuevo <- predict(arbol, newdata = nuevo.Dato)
# Show the predicted price for the new house. Following the splits printed
# for `arbol` (Rooms between 2.5 and 3.5, Distance < 11.9,
# BuildingArea < 156.5) this house should land in node 22, yval ~ 1181265.
prediccion.nuevo