Árboles de regresión para predecir el precio de casas en Melbourne

Las librerías

# Regression trees for predicting house prices

# install.packages(c("rpart", "rpart.plot", "caret"))  # package names must be passed as ONE character vector
library(rpart)      # Regression / classification trees
library(rpart.plot) # Plotting rpart trees
library(caret)      # Splitting data sets into partitions
library(dplyr)      # select(), filter(), mutate(), arrange(), ...
library(readr)      # Reading data files
library(ggplot2)    # Nicer-looking plots
library(reshape)    # Renaming columns

Los datos

# Load the Melbourne housing data and preview the first rows.
# NOTE(review): the absolute path below is specific to one machine; point it
# at your own copy of the CSV before running.
# stringsAsFactors = TRUE is set explicitly: the default became FALSE in
# R 4.0.0, while the summary()/str() output shown in this document lists the
# text columns as factors.
datos <- read.csv(
  "C:/Users/JD/Documents/Analisis inteligente de datos/Datos/melb_data1.csv",
  stringsAsFactors = TRUE
)
head(datos)
##       Suburb          Address Rooms Type   Price Method SellerG      Date
## 1 Abbotsford     85 Turner St     2    h 1480000      S  Biggin 3/12/2016
## 2 Abbotsford  25 Bloomburg St     2    h 1035000      S  Biggin 4/02/2016
## 3 Abbotsford     5 Charles St     3    h 1465000     SP  Biggin 4/03/2017
## 4 Abbotsford 40 Federation La     3    h  850000     PI  Biggin 4/03/2017
## 5 Abbotsford      55a Park St     4    h 1600000     VB  Nelson 4/06/2016
## 6 Abbotsford   129 Charles St     2    h  941000      S  Jellis 7/05/2016
##   Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1      2.5     3067        2        1   1      202           NA        NA
## 2      2.5     3067        2        1   0      156           79      1900
## 3      2.5     3067        3        2   0      134          150      1900
## 4      2.5     3067        3        2   1       94           NA        NA
## 5      2.5     3067        3        1   2      120          142      2014
## 6      2.5     3067        2        1   0      181           NA        NA
##   CouncilArea Lattitude Longtitude            Regionname Propertycount
## 1       Yarra  -37.7996   144.9984 Northern Metropolitan          4019
## 2       Yarra  -37.8079   144.9934 Northern Metropolitan          4019
## 3       Yarra  -37.8093   144.9944 Northern Metropolitan          4019
## 4       Yarra  -37.7969   144.9969 Northern Metropolitan          4019
## 5       Yarra  -37.8072   144.9941 Northern Metropolitan          4019
## 6       Yarra  -37.8041   144.9953 Northern Metropolitan          4019
# Show the last six rows of the raw data
tail(datos)
##              Suburb       Address Rooms Type   Price Method  SellerG       Date
## 13575   Westmeadows    9 Black St     3    h  582000      S      Red 26/08/2017
## 13576 Wheelers Hill  12 Strada Cr     4    h 1245000      S    Barry 26/08/2017
## 13577  Williamstown 77 Merrett Dr     3    h 1031000     SP Williams 26/08/2017
## 13578  Williamstown   83 Power St     3    h 1170000      S    Raine 26/08/2017
## 13579  Williamstown  96 Verdon St     4    h 2500000     PI  Sweeney 26/08/2017
## 13580    Yarraville    6 Agnes St     4    h 1285000     SP  Village 26/08/2017
##       Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 13575     16.5     3049        3        2   2      256           NA        NA
## 13576     16.7     3150        4        2   2      652           NA      1981
## 13577      6.8     3016        3        2   2      333          133      1995
## 13578      6.8     3016        3        2   4      436           NA      1997
## 13579      6.8     3016        4        1   5      866          157      1920
## 13580      6.3     3013        4        1   1      362          112      1920
##       CouncilArea Lattitude Longtitude                 Regionname Propertycount
## 13575             -37.67917   144.8939      Northern Metropolitan          2474
## 13576             -37.90562   145.1676 South-Eastern Metropolitan          7392
## 13577             -37.85927   144.8790       Western Metropolitan          6380
## 13578             -37.85274   144.8874       Western Metropolitan          6380
## 13579             -37.85908   144.8930       Western Metropolitan          6380
## 13580             -37.81188   144.8845       Western Metropolitan          6543

Explorar los conjuntos de datos

# Per-column summary of the raw data (quartiles, level counts, NA counts)
summary(datos)
##             Suburb                  Address          Rooms        Type    
##  Reservoir     :  359   1/1 Clarendon St:    3   Min.   : 1.000   h:9449  
##  Richmond      :  260   13 Robinson St  :    3   1st Qu.: 2.000   t:1114  
##  Bentleigh East:  249   14 Arthur St    :    3   Median : 3.000   u:3017  
##  Preston       :  239   2 Bruce St      :    3   Mean   : 2.938           
##  Brunswick     :  222   28 Blair St     :    3   3rd Qu.: 3.000           
##  Essendon      :  220   36 Aberfeldie St:    3   Max.   :10.000           
##  (Other)       :12031   (Other)         :13562                            
##      Price         Method             SellerG             Date      
##  Min.   :  85000   PI:1564   Nelson       :1565   27/05/2017:  473  
##  1st Qu.: 650000   S :9022   Jellis       :1316   3/06/2017 :  395  
##  Median : 903000   SA:  92   hockingstuart:1167   12/08/2017:  387  
##  Mean   :1075684   SP:1703   Barry        :1011   17/06/2017:  374  
##  3rd Qu.:1330000   VB:1199   Ray          : 701   27/11/2016:  362  
##  Max.   :9000000             Marshall     : 659   29/07/2017:  341  
##                              (Other)      :7161   (Other)   :11248  
##     Distance        Postcode       Bedroom2         Bathroom    
##  Min.   : 0.00   Min.   :3000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 6.10   1st Qu.:3044   1st Qu.: 2.000   1st Qu.:1.000  
##  Median : 9.20   Median :3084   Median : 3.000   Median :1.000  
##  Mean   :10.14   Mean   :3105   Mean   : 2.915   Mean   :1.534  
##  3rd Qu.:13.00   3rd Qu.:3148   3rd Qu.: 3.000   3rd Qu.:2.000  
##  Max.   :48.10   Max.   :3977   Max.   :20.000   Max.   :8.000  
##                                                                 
##       Car           Landsize         BuildingArea     YearBuilt   
##  Min.   : 0.00   Min.   :     0.0   Min.   :    0   Min.   :1196  
##  1st Qu.: 1.00   1st Qu.:   177.0   1st Qu.:   93   1st Qu.:1940  
##  Median : 2.00   Median :   440.0   Median :  126   Median :1970  
##  Mean   : 1.61   Mean   :   558.4   Mean   :  152   Mean   :1965  
##  3rd Qu.: 2.00   3rd Qu.:   651.0   3rd Qu.:  174   3rd Qu.:1999  
##  Max.   :10.00   Max.   :433014.0   Max.   :44515   Max.   :2018  
##  NA's   :62                         NA's   :6450    NA's   :5375  
##         CouncilArea     Lattitude        Longtitude   
##               :1369   Min.   :-38.18   Min.   :144.4  
##  Moreland     :1163   1st Qu.:-37.86   1st Qu.:144.9  
##  Boroondara   :1160   Median :-37.80   Median :145.0  
##  Moonee Valley: 997   Mean   :-37.81   Mean   :145.0  
##  Darebin      : 934   3rd Qu.:-37.76   3rd Qu.:145.1  
##  Glen Eira    : 848   Max.   :-37.41   Max.   :145.5  
##  (Other)      :7109                                   
##                       Regionname   Propertycount  
##  Southern Metropolitan     :4695   Min.   :  249  
##  Northern Metropolitan     :3890   1st Qu.: 4380  
##  Western Metropolitan      :2948   Median : 6555  
##  Eastern Metropolitan      :1471   Mean   : 7454  
##  South-Eastern Metropolitan: 450   3rd Qu.:10331  
##  Eastern Victoria          :  53   Max.   :21650  
##  (Other)                   :  73
# Column types and a preview of each column's values
str(datos)
## 'data.frame':    13580 obs. of  21 variables:
##  $ Suburb       : Factor w/ 314 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Address      : Factor w/ 13378 levels "1 Adelle Ct",..: 12795 5944 9815 9005 10590 2196 2143 13336 11083 1091 ...
##  $ Rooms        : int  2 2 3 3 4 2 3 2 1 2 ...
##  $ Type         : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 3 1 ...
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Method       : Factor w/ 5 levels "PI","S","SA",..: 2 2 4 1 5 2 2 2 2 2 ...
##  $ SellerG      : Factor w/ 268 levels "@Realty","Abercromby's",..: 24 24 24 24 165 114 165 165 24 24 ...
##  $ Date         : Factor w/ 58 levels "1/07/2017","10/09/2016",..: 46 48 49 49 50 53 53 57 57 57 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Postcode     : num  3067 3067 3067 3067 3067 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ CouncilArea  : Factor w/ 34 levels "","Banyule","Bayside",..: 33 33 33 33 33 33 33 33 33 33 ...
##  $ Lattitude    : num  -37.8 -37.8 -37.8 -37.8 -37.8 ...
##  $ Longtitude   : num  145 145 145 145 145 ...
##  $ Regionname   : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...

Elegimos sólo las variables numéricas

# Keep the response (Price) and nine numeric predictors; the other columns
# of datos are dropped.
datos.Num <- datos %>%
  select(
    Price, Rooms, Distance, Bedroom2, Bathroom,
    Car, Landsize, BuildingArea, YearBuilt, Propertycount
  )
head(datos.Num)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202           NA        NA
## 2 1035000     2      2.5        2        1   0      156           79      1900
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94           NA        NA
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 6  941000     2      2.5        2        1   0      181           NA        NA
##   Propertycount
## 1          4019
## 2          4019
## 3          4019
## 4          4019
## 5          4019
## 6          4019
# Confirm the structure of the numeric subset
str(datos.Num)
## 'data.frame':    13580 obs. of  10 variables:
##  $ Price        : num  1480000 1035000 1465000 850000 1600000 ...
##  $ Rooms        : int  2 2 3 3 4 2 3 2 1 2 ...
##  $ Distance     : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ Bedroom2     : num  2 2 3 3 3 2 4 2 1 3 ...
##  $ Bathroom     : num  1 1 2 2 1 1 2 1 1 1 ...
##  $ Car          : num  1 0 0 1 2 0 0 2 1 2 ...
##  $ Landsize     : num  202 156 134 94 120 181 245 256 0 220 ...
##  $ BuildingArea : num  NA 79 150 NA 142 NA 210 107 NA 75 ...
##  $ YearBuilt    : num  NA 1900 1900 NA 2014 ...
##  $ Propertycount: num  4019 4019 4019 4019 4019 ...

Depurar, limpiar los datos

# Medians of the three columns that contain NAs (BuildingArea, YearBuilt, Car).
# The original author notes summary(<column>)[3] as an alternative way to
# obtain each value.
# NOTE(review): these medians are taken over the whole data set, before the
# train/validation split further down, so validation rows influence the
# imputed values — confirm this is acceptable.
mediana.BA <- median(datos.Num[["BuildingArea"]], na.rm = TRUE)
mediana.YB <- median(datos.Num[["YearBuilt"]], na.rm = TRUE)
mediana.C <- median(datos.Num[["Car"]], na.rm = TRUE)

### Actualizar con mutate() los NA por las medianas - Las variables que tienen NAs

head(datos.Num, 10) # First 10 rows; NAs are visible
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1  1480000     2      2.5        2        1   1      202           NA        NA
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 3  1465000     3      2.5        3        2   0      134          150      1900
## 4   850000     3      2.5        3        2   1       94           NA        NA
## 5  1600000     4      2.5        3        1   2      120          142      2014
## 6   941000     2      2.5        2        1   0      181           NA        NA
## 7  1876000     3      2.5        4        2   0      245          210      1910
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 9   300000     1      2.5        1        1   1        0           NA        NA
## 10 1097000     2      2.5        3        1   2      220           75      1900
##    Propertycount
## 1           4019
## 2           4019
## 3           4019
## 4           4019
## 5           4019
## 6           4019
## 7           4019
## 8           4019
## 9           4019
## 10          4019
# Impute missing values: every NA in BuildingArea, YearBuilt and Car is
# replaced by that column's median (mediana.BA, mediana.YB, mediana.C).
datos.Num <- datos.Num %>%
  mutate(
    BuildingArea = replace(BuildingArea, is.na(BuildingArea), mediana.BA),
    YearBuilt = replace(YearBuilt, is.na(YearBuilt), mediana.YB),
    Car = replace(Car, is.na(Car), mediana.C)
  )

head(datos.Num, 10) # First 10 rows; the NAs are gone
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1  1480000     2      2.5        2        1   1      202          126      1970
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 3  1465000     3      2.5        3        2   0      134          150      1900
## 4   850000     3      2.5        3        2   1       94          126      1970
## 5  1600000     4      2.5        3        1   2      120          142      2014
## 6   941000     2      2.5        2        1   0      181          126      1970
## 7  1876000     3      2.5        4        2   0      245          210      1910
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 9   300000     1      2.5        1        1   1        0          126      1970
## 10 1097000     2      2.5        3        1   2      220           75      1900
##    Propertycount
## 1           4019
## 2           4019
## 3           4019
## 4           4019
## 5           4019
## 6           4019
## 7           4019
## 8           4019
## 9           4019
## 10          4019

Crear conjuntos de entrenamiento y conjuntos de validación

set.seed(2020) # Seed, so the random partition is reproducible
# Row indices for a 70% training sample drawn on Price; with list = FALSE
# the indices come back as a one-column matrix (see the output below).
entrena <- createDataPartition(y = datos.Num$Price, p = 0.7, list = FALSE)
head(entrena)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
nrow(entrena)
## [1] 9508
# Rows whose index is not in entrena will form the validation set
head(datos.Num[-entrena, ])
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 6   941000     2      2.5        2        1   0      181          126      1970
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 12 1350000     3      2.5        3        2   2      214          190      2005
## 13  750000     2      2.5        2        2   1        0           94      2009
## 20  890000     2      2.5        2        1   1      150           73      1985
##    Propertycount
## 2           4019
## 6           4019
## 8           4019
## 12          4019
## 13          4019
## 20          4019
# Number of rows left for validation
nrow(datos.Num[-entrena,])
## [1] 4072
# Show the first six rows of the numeric-only data
head(datos.Num)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202          126      1970
## 2 1035000     2      2.5        2        1   0      156           79      1900
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94          126      1970
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 6  941000     2      2.5        2        1   0      181          126      1970
##   Propertycount
## 1          4019
## 2          4019
## 3          4019
## 4          4019
## 5          4019
## 6          4019
# Build the training set from the sampled row indices, then preview it
datos.Entrena <- datos.Num[entrena,]
head(datos.Entrena)
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1 1480000     2      2.5        2        1   1      202          126      1970
## 3 1465000     3      2.5        3        2   0      134          150      1900
## 4  850000     3      2.5        3        2   1       94          126      1970
## 5 1600000     4      2.5        3        1   2      120          142      2014
## 7 1876000     3      2.5        4        2   0      245          210      1910
## 9  300000     1      2.5        1        1   1        0          126      1970
##   Propertycount
## 1          4019
## 3          4019
## 4          4019
## 5          4019
## 7          4019
## 9          4019
# Summary statistics of the training set
summary(datos.Entrena)
##      Price             Rooms           Distance        Bedroom2     
##  Min.   :  85000   Min.   : 1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 650000   1st Qu.: 2.000   1st Qu.: 6.10   1st Qu.: 2.000  
##  Median : 903000   Median : 3.000   Median : 9.20   Median : 3.000  
##  Mean   :1078063   Mean   : 2.937   Mean   :10.13   Mean   : 2.911  
##  3rd Qu.:1330000   3rd Qu.: 3.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :9000000   Max.   :10.000   Max.   :47.40   Max.   :10.000  
##     Bathroom          Car            Landsize         BuildingArea   
##  Min.   :0.000   Min.   : 0.000   Min.   :     0.0   Min.   :   0.0  
##  1st Qu.:1.000   1st Qu.: 1.000   1st Qu.:   178.0   1st Qu.: 123.0  
##  Median :1.000   Median : 2.000   Median :   443.5   Median : 126.0  
##  Mean   :1.529   Mean   : 1.613   Mean   :   579.6   Mean   : 136.8  
##  3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:   650.0   3rd Qu.: 129.0  
##  Max.   :8.000   Max.   :10.000   Max.   :433014.0   Max.   :6791.0  
##    YearBuilt    Propertycount  
##  Min.   :1830   Min.   :  389  
##  1st Qu.:1960   1st Qu.: 4386  
##  Median :1970   Median : 6567  
##  Mean   :1967   Mean   : 7453  
##  3rd Qu.:1975   3rd Qu.:10331  
##  Max.   :2018   Max.   :21650
# Build the validation set from the remaining rows, then preview it
datos.Valida <- datos.Num[-entrena,]
head(datos.Valida)
##      Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2  1035000     2      2.5        2        1   0      156           79      1900
## 6   941000     2      2.5        2        1   0      181          126      1970
## 8  1636000     2      2.5        2        1   2      256          107      1890
## 12 1350000     3      2.5        3        2   2      214          190      2005
## 13  750000     2      2.5        2        2   1        0           94      2009
## 20  890000     2      2.5        2        1   1      150           73      1985
##    Propertycount
## 2           4019
## 6           4019
## 8           4019
## 12          4019
## 13          4019
## 20          4019
# Summary statistics of the validation set
summary(datos.Valida)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:  120.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  146.2  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  130.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650

Construir el MODELO de árbol

# Fit a regression tree of Price on all remaining columns of the training set.
# NOTE(review): the seed presumably fixes rpart's internal cross-validation
# (the xerror/xstd columns of the CP table) — confirm.
set.seed(2020)

arbol <- rpart(
  formula = Price ~ .,
  data = datos.Entrena
)
print(arbol)
## n= 9508 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 9508 3.905176e+15 1078063.0  
##    2) Rooms< 3.5 7161 1.741856e+15  931007.8  
##      4) Rooms< 2.5 3023 3.911722e+14  724213.1  
##        8) Landsize< 85.5 1214 6.222679e+13  559018.9 *
##        9) Landsize>=85.5 1809 2.735839e+14  835073.1 *
##      5) Rooms>=2.5 4138 1.126966e+15 1082081.0  
##       10) Distance>=11.9 1576 2.382967e+14  832810.4 *
##       11) Distance< 11.9 2562 7.305050e+14 1235418.0  
##         22) BuildingArea< 156.5 2230 5.283540e+14 1181265.0 *
##         23) BuildingArea>=156.5 332 1.516845e+14 1599162.0 *
##    3) Rooms>=3.5 2347 1.535971e+15 1526746.0  
##      6) Distance>=11.45 1075 2.619142e+14 1136763.0 *
##      7) Distance< 11.45 1272 9.723911e+14 1856331.0  
##       14) BuildingArea< 246.5 1025 5.968351e+14 1695057.0  
##         28) Landsize< 708.5 803 3.644950e+14 1566809.0 *
##         29) Landsize>=708.5 222 1.713594e+14 2158948.0 *
##       15) BuildingArea>=246.5 247 2.382655e+14 2525584.0 *

Visualizar el árbol de regresión

# Draw the fitted tree
prp(
  arbol,
  type = 2,
  nn = TRUE,
  fallen.leaves = TRUE,
  faclen = 4,
  varlen = 8,
  shadow.col = "gray"
)

### Ver la tabla de complejidad del modelo (cptable). La tabla resume, de forma comprensible, los árboles obtenidos con diferentes números de divisiones: CP es el factor de complejidad del árbol; nsplit, el número de divisiones; rel error, el error relativo; xerror, el error estimado por validación cruzada; y xstd, la desviación estándar de ese error.

# Complexity-parameter table stored in the fitted tree
arbol$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.16064529      0 1.0000000 1.0001908 0.03335304
## 2 0.07724771      1 0.8393547 0.8397732 0.02940088
## 3 0.05728747      2 0.7621070 0.7627294 0.02731604
## 4 0.04050130      3 0.7048195 0.7055469 0.02633946
## 5 0.03515605      4 0.6643182 0.6669834 0.02651936
## 6 0.01561538      5 0.6291622 0.6395277 0.02549443
## 7 0.01417645      6 0.6135468 0.6211866 0.02488627
## 8 0.01292296      7 0.5993704 0.6079174 0.02482012
## 9 0.01000000      8 0.5864474 0.5990287 0.02442405

Podar el árbol prune()

# Plot the complexity-parameter results of the fitted tree
plotcp(arbol)

### El árbol de tamaño siete (6 divisiones, fila 7 de la tabla CP) servirá como un buen modelo

Podar el árbol

# Prune the tree back to row 7 of the CP table (the 6-split tree).
# The threshold is read from the fitted object rather than retyped from the
# printed table: prune() compares each node's complexity against cp, and a
# value rounded for printing (0.01417645) can fall on either side of the real
# one, silently keeping or dropping one split.
arbol.Recortado <- prune(arbol, cp = arbol$cptable[7, "CP"])

# Draw the pruned tree with the same settings as the full tree
prp(arbol.Recortado, type = 2, nn = TRUE,
    fallen.leaves = TRUE, faclen = 4,
    varlen = 8,  shadow.col = "gray")

### Predicciones con el conjunto de datos de validación

# Summary statistics of the validation set (same call as earlier in the document)
summary(datos.Valida)
##      Price             Rooms          Distance        Bedroom2     
##  Min.   : 170000   Min.   :1.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 649500   1st Qu.:2.000   1st Qu.: 6.20   1st Qu.: 2.000  
##  Median : 902500   Median :3.000   Median : 9.20   Median : 3.000  
##  Mean   :1070130   Mean   :2.941   Mean   :10.15   Mean   : 2.923  
##  3rd Qu.:1330000   3rd Qu.:4.000   3rd Qu.:13.00   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :8.000   Max.   :48.10   Max.   :20.000  
##     Bathroom          Car           Landsize        BuildingArea    
##  Min.   :0.000   Min.   : 0.00   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:1.000   1st Qu.: 1.00   1st Qu.:  173.0   1st Qu.:  120.0  
##  Median :1.000   Median : 2.00   Median :  435.0   Median :  126.0  
##  Mean   :1.547   Mean   : 1.61   Mean   :  508.9   Mean   :  146.2  
##  3rd Qu.:2.000   3rd Qu.: 2.00   3rd Qu.:  653.0   3rd Qu.:  130.0  
##  Max.   :8.000   Max.   :10.00   Max.   :44500.0   Max.   :44515.0  
##    YearBuilt    Propertycount  
##  Min.   :1196   Min.   :  249  
##  1st Qu.:1960   1st Qu.: 4217  
##  Median :1970   Median : 6543  
##  Mean   :1967   Mean   : 7457  
##  3rd Qu.:1972   3rd Qu.:10331  
##  Max.   :2017   Max.   :21650
# Predict a price for every row of the validation set.
# NOTE(review): this uses the full tree `arbol`, not the pruned
# `arbol.Recortado` — confirm which model is intended.
prediccion.price <- predict(object = arbol, newdata = datos.Valida)
# First validation house, followed by its predicted price
datos.Valida[1, ]
##     Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 2 1035000     2      2.5        2        1   0      156           79      1900
##   Propertycount
## 2          4019
prediccion.price[1]
##        2 
## 835073.1

Una predicción con nuevos datos ???

# A single hypothetical house to predict.
# The data frame is built directly with named columns, so no throw-away
# global variables are created and no separate colnames() call is needed.
# Price is a placeholder (0): it keeps the same columns as the training data.
nuevo.Dato <- data.frame(
  Price = 0,
  Rooms = 3,
  Distance = 7,
  Bedroom2 = 3,
  Bathroom = 2,
  Car = 3,
  Landsize = 400,
  BuildingArea = 120,
  YearBuilt = 1930,
  Propertycount = 5000
)

nuevo.Dato
##   Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1     0     3        7        3        2   3      400          120      1930
##   Propertycount
## 1          5000
# Predict the price of the hypothetical house with the fitted tree.
# NOTE(review): this overwrites prediccion.price, which previously held the
# validation-set predictions, and uses the unpruned tree `arbol` — confirm
# both are intended.
prediccion.price <- predict(arbol, newdata = nuevo.Dato)
# The new house being predicted (previously this displayed datos.Valida[1,],
# a validation row unrelated to this prediction), followed by its prediction
nuevo.Dato[1, ]
##   Price Rooms Distance Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt
## 1     0     3        7        3        2   3      400          120      1930
##   Propertycount
## 1          5000
prediccion.price[1]
##       1 
## 1181265