Importar la base de datos

# install.packages("rpart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)
# Cargar la base de datos
library(readr)
HousePriceData <- read_csv("~/Conexión de interfaces/Conexión de interfaces/HousePriceData.csv")
## Rows: 905 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Parking, City_Category
## dbl (8): Observation, Dist_Taxi, Dist_Market, Dist_Hospital, Carpet, Builtup...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Entender la base de datos

summary(HousePriceData)
##   Observation      Dist_Taxi      Dist_Market    Dist_Hospital  
##  Min.   :  1.0   Min.   :  146   Min.   : 1666   Min.   : 3227  
##  1st Qu.:237.0   1st Qu.: 6477   1st Qu.: 9367   1st Qu.:11302  
##  Median :469.0   Median : 8228   Median :11149   Median :13189  
##  Mean   :468.4   Mean   : 8235   Mean   :11022   Mean   :13091  
##  3rd Qu.:700.0   3rd Qu.: 9939   3rd Qu.:12675   3rd Qu.:14855  
##  Max.   :932.0   Max.   :20662   Max.   :20945   Max.   :23294  
##                                                                 
##      Carpet         Builtup        Parking          City_Category     
##  Min.   :  775   Min.   :  932   Length:905         Length:905        
##  1st Qu.: 1317   1st Qu.: 1579   Class :character   Class :character  
##  Median : 1478   Median : 1774   Mode  :character   Mode  :character  
##  Mean   : 1511   Mean   : 1794                                        
##  3rd Qu.: 1654   3rd Qu.: 1985                                        
##  Max.   :24300   Max.   :12730                                        
##  NA's   :7                                                            
##     Rainfall       House_Price       
##  Min.   :-110.0   Min.   :  1492000  
##  1st Qu.: 600.0   1st Qu.:  4623000  
##  Median : 780.0   Median :  5860000  
##  Mean   : 786.9   Mean   :  6083992  
##  3rd Qu.: 970.0   3rd Qu.:  7200000  
##  Max.   :1560.0   Max.   :150000000  
## 
str(HousePriceData)
## spc_tbl_ [905 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Observation  : num [1:905] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Dist_Taxi    : num [1:905] 9796 8294 11001 8301 10510 ...
##  $ Dist_Market  : num [1:905] 5250 8186 14399 11188 12629 ...
##  $ Dist_Hospital: num [1:905] 10703 12694 16991 12289 13921 ...
##  $ Carpet       : num [1:905] 1659 1461 1340 1451 1770 ...
##  $ Builtup      : num [1:905] 1961 1752 1609 1748 2111 ...
##  $ Parking      : chr [1:905] "Open" "Not Provided" "Not Provided" "Covered" ...
##  $ City_Category: chr [1:905] "CAT B" "CAT B" "CAT A" "CAT B" ...
##  $ Rainfall     : num [1:905] 530 210 720 620 450 760 1030 1020 680 1130 ...
##  $ House_Price  : num [1:905] 6649000 3982000 5401000 5373000 4662000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Observation = col_double(),
##   ..   Dist_Taxi = col_double(),
##   ..   Dist_Market = col_double(),
##   ..   Dist_Hospital = col_double(),
##   ..   Carpet = col_double(),
##   ..   Builtup = col_double(),
##   ..   Parking = col_character(),
##   ..   City_Category = col_character(),
##   ..   Rainfall = col_double(),
##   ..   House_Price = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(HousePriceData)
## # A tibble: 6 × 10
##   Observation Dist_Taxi Dist_Market Dist_Hospital Carpet Builtup Parking     
##         <dbl>     <dbl>       <dbl>         <dbl>  <dbl>   <dbl> <chr>       
## 1           1      9796        5250         10703   1659    1961 Open        
## 2           2      8294        8186         12694   1461    1752 Not Provided
## 3           3     11001       14399         16991   1340    1609 Not Provided
## 4           4      8301       11188         12289   1451    1748 Covered     
## 5           5     10510       12629         13921   1770    2111 Not Provided
## 6           6      6665        5142          9972   1442    1733 Open        
## # ℹ 3 more variables: City_Category <chr>, Rainfall <dbl>, House_Price <dbl>

Crear árbol de decisión

HousePriceData <- HousePriceData[-348, ]
HousePriceData$Parking <- as.factor(HousePriceData$Parking)
HousePriceData$City_Category <- as.factor(HousePriceData$City_Category)
str(HousePriceData)
## tibble [904 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Observation  : num [1:904] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Dist_Taxi    : num [1:904] 9796 8294 11001 8301 10510 ...
##  $ Dist_Market  : num [1:904] 5250 8186 14399 11188 12629 ...
##  $ Dist_Hospital: num [1:904] 10703 12694 16991 12289 13921 ...
##  $ Carpet       : num [1:904] 1659 1461 1340 1451 1770 ...
##  $ Builtup      : num [1:904] 1961 1752 1609 1748 2111 ...
##  $ Parking      : Factor w/ 4 levels "Covered","No Parking",..: 4 3 3 1 3 4 2 4 3 4 ...
##  $ City_Category: Factor w/ 3 levels "CAT A","CAT B",..: 2 2 1 2 2 2 1 3 2 3 ...
##  $ Rainfall     : num [1:904] 530 210 720 620 450 760 1030 1020 680 1130 ...
##  $ House_Price  : num [1:904] 6649000 3982000 5401000 5373000 4662000 ...
arbol_house <- rpart(House_Price~., data=HousePriceData)
options(scipen = 999)
rpart.plot(arbol_house)

Conclusiones

-El 35% de las viviendas que no están en City_Category = CAT B o CAT tienden a tener el precio más alto, con un valor promedio de 7.4e+6.

-Del 65% de las viviendas que sí están en City_Category = CAT B o CAT C, el precio promedio baja a 5.1e+6, pero se divide de la siguiente manera:

  • El 26% de ese total, cuando la vivienda está en City_Category = CAT C, cae en el precio más bajo del árbol con 4.5e+6.

  • El 39% restante de ese 65% corresponde a City_Category = CAT B, con un precio promedio de 5.5e+6, y en ese punto ya se vuelve importante la distancia al taxi; ya que si Dist_Taxi < 11e+3, el precio promedio es de 5.3e+6, representando al 33% de ese 39%; mientras que el 6% restante que cuenta con un Dist_Taxi > 11e+3 presentan un precio promedio de 6.5e+6

LS0tCnRpdGxlOiAiw4FyYm9sIGRlIERlY2lzacOzbiAtIEhvdXNlUHJpY2luZyIKYXV0aG9yOiAiSmVzw7pzIEdlcmFyZG8gU29sYW5vIETDrWF6IEEwMDIyODE1NSIKZGF0ZTogIjIwMjYtMDItMTkiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFCiAgICB0aGVtZTogY29zbW8KLS0tCgo8Y2VudGVyPgohW10oaHR0cHM6Ly9zdGF0aWMud2lraWEubm9jb29raWUubmV0L2NvcmFsaW5lL2ltYWdlcy81LzVjL0NvcmFsaW5lLkRWRFJpcC5YdmlELUFSUk9XLmF2aV8wMDU2ODI3NjYuanBnL3JldmlzaW9uL2xhdGVzdD9jYj0yMDEyMDgyMTEwMzQ1NSkKPC9jZW50ZXI+IAoKIyA8c3BhbiBzdHlsZT0gImNvbG9yOmJsdWUiPiBJbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4KYGBge3J9CiMgaW5zdGFsbC5wYWNrYWdlcygicnBhcnQiKQpsaWJyYXJ5KHJwYXJ0KQojIGluc3RhbGwucGFja2FnZXMoInJwYXJ0LnBsb3QiKQpsaWJyYXJ5KHJwYXJ0LnBsb3QpCiMgQ2FyZ2FyIGxhIGJhc2UgZGUgZGF0b3MKbGlicmFyeShyZWFkcikKSG91c2VQcmljZURhdGEgPC0gcmVhZF9jc3YoIn4vQ29uZXhpw7NuIGRlIGludGVyZmFjZXMvQ29uZXhpw7NuIGRlIGludGVyZmFjZXMvSG91c2VQcmljZURhdGEuY3N2IikKYGBgCgojIDxzcGFuIHN0eWxlPSAiY29sb3I6Ymx1ZSI+IEVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPgpgYGB7cn0Kc3VtbWFyeShIb3VzZVByaWNlRGF0YSkKc3RyKEhvdXNlUHJpY2VEYXRhKQpoZWFkKEhvdXNlUHJpY2VEYXRhKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWUiPiBDcmVhciDDoXJib2wgZGUgZGVjaXNpw7NuIDwvc3Bhbj4KYGBge3J9CkhvdXNlUHJpY2VEYXRhIDwtIEhvdXNlUHJpY2VEYXRhWy0zNDgsIF0KSG91c2VQcmljZURhdGEkUGFya2luZyA8LSBhcy5mYWN0b3IoSG91c2VQcmljZURhdGEkUGFya2luZykKSG91c2VQcmljZURhdGEkQ2l0eV9DYXRlZ29yeSA8LSBhcy5mYWN0b3IoSG91c2VQcmljZURhdGEkQ2l0eV9DYXRlZ29yeSkKc3RyKEhvdXNlUHJpY2VEYXRhKQphcmJvbF9ob3VzZSA8LSBycGFydChIb3VzZV9QcmljZX4uLCBkYXRhPUhvdXNlUHJpY2VEYXRhKQpvcHRpb25zKHNjaXBlbiA9IDk5OSkKcnBhcnQucGxvdChhcmJvbF9ob3VzZSkKYGBgCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj4gQ29uY2x1c2lvbmVzIDwvc3Bhbj4KCi1FbCAzNSUgZGUgbGFzIHZpdmllbmRhcyBxdWUgbm8gZXN0w6FuIGVuIENpdHlfQ2F0ZWdvcnkgPSBDQVQgQiBvIENBVCAgdGllbmRlbiBhIHRlbmVyIGVsIHByZWNpbyBtw6FzIGFsdG8sIGNvbiB1biB2YWxvciBwcm9tZWRpbyBkZSA3LjRlKzYuCgotRGVsIDY1JSBkZSBsYXMgdml2aWVuZGFzIHF1ZSBzw60gZXN0w6FuIGVuIENpdHlfQ2F0ZWdvcnkgPSBDQVQgQiBvIENBVCBDLCBlbCBwcmVjaW8gcHJvbWVkaW8gYmFqYSBhIDUuMWUrNiwgcGVybyBzZSBkaXZpZGUgZGUgbGEgc2lndWllbnRlIG1hbmVyYToKCiAgKiBFbCAyNiUgZGUgZXNlIHRvdGFsLCBjdWFuZG8gbGEgdml2aWVuZGEgZXN0w6EgZW4gQ2l0eV9DYXRlZ29yeSA9IENBVCBDLCBjYWUgZW4gZWwgcHJlY2lvIG3DoXMgYmFqbyBkZWwgw6FyYm9sIGNvbiA0LjVlKzYuCiAgCiAgKiBFbCAzOSUgcmVzdGFudGUgZGUgZXNlIDY1JSBjb3JyZXNwb25kZSBhIENpdHlfQ2F0ZWdvcnkgPSBDQVQgQiwgY29uIHVuIHByZWNpbyBwcm9tZWRpbyBkZSA1LjVlKzYsIHkgZW4gZXNlIHB1bnRvIHlhIHNlICAgICAgIHZ1ZWx2ZSBpbXBvcnRhbnRlIGxhIGRpc3RhbmNpYSBhbCB0YXhpOyB5YSBxdWUgc2kgRGlzdF9UYXhpIDwgIDExZSszLCBlbCBwcmVjaW8gcHJvbWVkaW8gZXMgZGUgNS4zZSs2LCByZXByZXNlbnRhbmRvIGFsICAgICAgMzMlIGRlIGVzZSAzOSU7IG1pZW50cmFzIHF1ZSBlbCA2JSByZXN0YW50ZSBxdWUgY3VlbnRhIGNvbiB1biBEaXN0X1RheGkgPiAgMTFlKzMgcHJlc2VudGFuIHVuIHByZWNpbyBwcm9tZWRpbyBkZSA2LjVlKzYKCg==