Instalar paquetes y llamar librerías

# install.packages("rpart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)

Importar la base de datos

# Load the house-price dataset from the local CSV export into a data frame.
# `header` and `sep` are spelled out with read.csv()'s default values.
house <- read.csv(
  file = "C:/Users/joseo/Downloads/HousePriceData.csv",
  header = TRUE,
  sep = ","
)

Entender la base de datos

# Per-column summary statistics of the raw data (printed output follows).
summary(object = house)
##   Observation      Dist_Taxi      Dist_Market    Dist_Hospital  
##  Min.   :  1.0   Min.   :  146   Min.   : 1666   Min.   : 3227  
##  1st Qu.:237.0   1st Qu.: 6477   1st Qu.: 9367   1st Qu.:11302  
##  Median :469.0   Median : 8228   Median :11149   Median :13189  
##  Mean   :468.4   Mean   : 8235   Mean   :11022   Mean   :13091  
##  3rd Qu.:700.0   3rd Qu.: 9939   3rd Qu.:12675   3rd Qu.:14855  
##  Max.   :932.0   Max.   :20662   Max.   :20945   Max.   :23294  
##                                                                 
##      Carpet         Builtup        Parking          City_Category     
##  Min.   :  775   Min.   :  932   Length:905         Length:905        
##  1st Qu.: 1317   1st Qu.: 1579   Class :character   Class :character  
##  Median : 1478   Median : 1774   Mode  :character   Mode  :character  
##  Mean   : 1511   Mean   : 1794                                        
##  3rd Qu.: 1654   3rd Qu.: 1985                                        
##  Max.   :24300   Max.   :12730                                        
##  NA's   :7                                                            
##     Rainfall       House_Price       
##  Min.   :-110.0   Min.   :  1492000  
##  1st Qu.: 600.0   1st Qu.:  4623000  
##  Median : 780.0   Median :  5860000  
##  Mean   : 786.9   Mean   :  6083992  
##  3rd Qu.: 970.0   3rd Qu.:  7200000  
##  Max.   :1560.0   Max.   :150000000  
## 
# Compact listing of each column's type and first values.
str(object = house)
## 'data.frame':    905 obs. of  10 variables:
##  $ Observation  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Dist_Taxi    : int  9796 8294 11001 8301 10510 6665 13153 5882 7495 8233 ...
##  $ Dist_Market  : int  5250 8186 14399 11188 12629 5142 11869 9948 11589 7067 ...
##  $ Dist_Hospital: int  10703 12694 16991 12289 13921 9972 17811 13315 13370 11400 ...
##  $ Carpet       : int  1659 1461 1340 1451 1770 1442 1542 1261 1090 1030 ...
##  $ Builtup      : int  1961 1752 1609 1748 2111 1733 1858 1507 1321 1235 ...
##  $ Parking      : chr  "Open" "Not Provided" "Not Provided" "Covered" ...
##  $ City_Category: chr  "CAT B" "CAT B" "CAT A" "CAT B" ...
##  $ Rainfall     : int  530 210 720 620 450 760 1030 1020 680 1130 ...
##  $ House_Price  : int  6649000 3982000 5401000 5373000 4662000 4526000 7224000 3772000 4631000 4415000 ...
# Preview the first six rows (six is head()'s default row count).
head(x = house, n = 6L)
##   Observation Dist_Taxi Dist_Market Dist_Hospital Carpet Builtup      Parking
## 1           1      9796        5250         10703   1659    1961         Open
## 2           2      8294        8186         12694   1461    1752 Not Provided
## 3           3     11001       14399         16991   1340    1609 Not Provided
## 4           4      8301       11188         12289   1451    1748      Covered
## 5           5     10510       12629         13921   1770    2111 Not Provided
## 6           6      6665        5142          9972   1442    1733         Open
##   City_Category Rainfall House_Price
## 1         CAT B      530     6649000
## 2         CAT B      210     3982000
## 3         CAT A      720     5401000
## 4         CAT B      620     5373000
## 5         CAT B      450     4662000
## 6         CAT B      760     4526000

Crear árbol de decisión

# Drop the record at row position 348 before modelling.
# NOTE(review): presumably this is the extreme record behind the 150,000,000
# maximum House_Price shown by summary() -- confirm; a positional index will
# silently point at a different record if the CSV is reordered or extended.
house <- house[-348, ]
# Convert both text columns to factors in a single pass so they are handled
# as categorical variables.
house[c("Parking", "City_Category")] <- lapply(
  house[c("Parking", "City_Category")],
  as.factor
)
# Re-inspect the structure after the clean-up.
str(house)
## 'data.frame':    904 obs. of  10 variables:
##  $ Observation  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Dist_Taxi    : int  9796 8294 11001 8301 10510 6665 13153 5882 7495 8233 ...
##  $ Dist_Market  : int  5250 8186 14399 11188 12629 5142 11869 9948 11589 7067 ...
##  $ Dist_Hospital: int  10703 12694 16991 12289 13921 9972 17811 13315 13370 11400 ...
##  $ Carpet       : int  1659 1461 1340 1451 1770 1442 1542 1261 1090 1030 ...
##  $ Builtup      : int  1961 1752 1609 1748 2111 1733 1858 1507 1321 1235 ...
##  $ Parking      : Factor w/ 4 levels "Covered","No Parking",..: 4 3 3 1 3 4 2 4 3 4 ...
##  $ City_Category: Factor w/ 3 levels "CAT A","CAT B",..: 2 2 1 2 2 2 1 3 2 3 ...
##  $ Rainfall     : int  530 210 720 620 450 760 1030 1020 680 1130 ...
##  $ House_Price  : int  6649000 3982000 5401000 5373000 4662000 4526000 7224000 3772000 4631000 4415000 ...
# Fit a regression tree for House_Price using the remaining columns as
# predictors. `Observation` is excluded from the formula: it appears to be a
# row identifier (1, 2, 3, ... in the str() output), so letting the tree
# split on it would only encode row order, not a property of the house.
arbol_house <- rpart(House_Price ~ . - Observation, data = house)
# Strongly penalise scientific notation so prices are printed in full
# (e.g. 5900000 rather than 5.9e+06). This changes a session-wide option.
options(scipen = 999)
# Draw the fitted tree.
rpart.plot(arbol_house)

Conclusiones

En general, el precio promedio de las casas es de 5.9 millones (nodo raíz del árbol, que contiene el 100% de las observaciones).

Las ramas más relevantes del árbol son:

  • 7.4 millones (35%): Si la casa no está en CAT B ni en CAT C, es decir, está en CAT A (la única categoría restante). → Segmento con los precios más altos.

  • 4.5 millones (26%): Si la casa está en CAT C. → Segmento con los precios más bajos.

  • 6.5 millones (6%): Si está en CAT B y la distancia al taxi es ≥ 11,000. → Precio alto dentro de CAT B.

  • 5.3 millones (33%): Si está en CAT B y la distancia al taxi es < 11,000. → Precio intermedio.

Lo más relevante

  • La categoría de ciudad es la variable más importante.

  • CAT A (implícita en el árbol: ni CAT B ni CAT C) → precios más altos.

  • CAT C → precios más bajos.

  • En CAT B, la Dist_Taxi ajusta el precio.

LS0tDQp0aXRsZTogIkhvdXNlIFByaWNpbmcgMiINCmF1dGhvcjogIkpvc2UgTWlndWVsIE9ydGl6Ig0KZGF0ZTogIjE4LTItMjAyNiINCm91dHB1dDogDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KICAgIHRoZW1lOiBjb3Ntbw0KLS0tDQoNCjxjZW50ZXI+DQohW10oaHR0cHM6Ly9zdWJzdGFja2Nkbi5jb20vaW1hZ2UvZmV0Y2gvJHNfIWh3UXMhLGZfYXV0byxxX2F1dG86Z29vZCxmbF9wcm9ncmVzc2l2ZTpzdGVlcC9odHRwcyUzQSUyRiUyRmJ1Y2tldGVlci1lMDViYmM4NC1iYWEzLTQzN2UtOTUxOC1hZGIzMmJlNzc5ODQuczMuYW1hem9uYXdzLmNvbSUyRnB1YmxpYyUyRmltYWdlcyUyRjJmZTNlZTg4LTNkOTMtNDQ2Mi1iOGI5LThlY2Y1MzI0N2MxOV81MDB4MjU5LmdpZikNCg0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpDQpgYGANCg0KIyBbSW5zdGFsYXIgcGFxdWV0ZXMgeSBsbGFtYXIgbGlicmVyw61hc117c3R5bGU9ImNvbG9yOmJsdWUifQ0KDQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJycGFydCIpDQpsaWJyYXJ5KHJwYXJ0KQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJycGFydC5wbG90IikNCmxpYnJhcnkocnBhcnQucGxvdCkNCmBgYA0KDQojIFtJbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zXXtzdHlsZT0iY29sb3I6Ymx1ZSJ9DQoNCmBgYHtyfQ0KaG91c2UgPC0gcmVhZC5jc3YoIkM6L1VzZXJzL2pvc2VvL0Rvd25sb2Fkcy9Ib3VzZVByaWNlRGF0YS5jc3YiKQ0KYGBgDQoNCiMgW0VudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3Nde3N0eWxlPSJjb2xvcjpibHVlIn0NCg0KYGBge3J9DQpzdW1tYXJ5KGhvdXNlKQ0Kc3RyKGhvdXNlKQ0KaGVhZChob3VzZSkNCmBgYA0KDQojIFtDcmVhciDDoXJib2wgZGUgZGVjaXNpw7NuXXtzdHlsZT0iY29sb3I6Ymx1ZSJ9DQoNCmBgYHtyfQ0KaG91c2UgPC0gaG91c2VbLTM0OCwgXQ0KaG91c2UkUGFya2luZyA8LSBhcy5mYWN0b3IoaG91c2UkUGFya2luZykNCmhvdXNlJENpdHlfQ2F0ZWdvcnkgPC0gYXMuZmFjdG9yKGhvdXNlJENpdHlfQ2F0ZWdvcnkpDQpzdHIoaG91c2UpDQphcmJvbF9ob3VzZSA8LSBycGFydCAoSG91c2VfUHJpY2V+LiwgZGF0YT1ob3VzZSkNCm9wdGlvbnMoc2NpcGVuID0gOTk5KQ0KcnBhcnQucGxvdChhcmJvbF9ob3VzZSkNCg0KYGBgDQoNCiMgW0NvbmNsdXNpb25lc117c3R5bGU9ImNvbG9yOmJsdWUifQ0KDQpFbiBnZW5lcmFsLCBlbCBwcmVjaW8gcHJvbWVkaW8gZGUgbGFzIGNhc2FzIGVzIDUuOSBtaWxsb25lcyAoMTAwJSkuDQoNCkxhcyByYW1hcyBtw6FzIHJlbGV2YW50ZXMgZGVsIMOhcmJvbCBzb246DQoNCi0gICA3LjQgbWlsbG9uZXMgKDM1JSk6IFNpIGxhIGNhc2Egbm8gZXN0w6EgZW4gQ0FUIEIgbmkgQ0FUIEMgKHByb2JhYmxlbWVu
dGUgQ0FUIEEpLiDihpIgU2VnbWVudG8gY29uIGxvcyBwcmVjaW9zIG3DoXMgYWx0b3MuDQoNCi0gICA0LjUgbWlsbG9uZXMgKDI2JSk6IFNpIGxhIGNhc2EgZXN0w6EgZW4gQ0FUIEMuIOKGkiBTZWdtZW50byBjb24gbG9zIHByZWNpb3MgbcOhcyBiYWpvcy4NCg0KLSAgIDYuNSBtaWxsb25lcyAoNiUpOiBTaSBlc3TDoSBlbiBDQVQgQiB5IGxhIGRpc3RhbmNpYSBhbCB0YXhpIGVzIOKJpSAxMSwwMDAuIOKGkiBQcmVjaW8gYWx0byBkZW50cm8gZGUgQ0FUIEIuDQoNCi0gICA1LjMgbWlsbG9uZXMgKDMzJSk6IFNpIGVzdMOhIGVuIENBVCBCIHkgbGEgZGlzdGFuY2lhIGFsIHRheGkgZXMgXDwgMTEsMDAwLiDihpIgUHJlY2lvIGludGVybWVkaW8uDQoNCkxvIG3DoXMgcmVsZXZhbnRlDQoNCi0gICBMYSBjYXRlZ29yw61hIGRlIGNpdWRhZCBlcyBsYSB2YXJpYWJsZSBtw6FzIGltcG9ydGFudGUuDQoNCi0gICBDQVQgQSAoaW1wbMOtY2l0bykg4oaSIHByZWNpb3MgbcOhcyBhbHRvcy4NCg0KLSAgIENBVCBDIOKGkiBwcmVjaW9zIG3DoXMgYmFqb3MuDQoNCi0gICBFbiBDQVQgQiwgbGEgRGlzdF9UYXhpIGFqdXN0YSBlbCBwcmVjaW8uDQo=