#install.packages("rpart")
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)
library(readr)
# Load the house-price data set (CSV in the working directory) into a tibble
# and print per-column summary statistics as a first sanity check.
houses <- read_csv(file = "HousePriceData.csv")
summary(object = houses)
## Observation Dist_Taxi Dist_Market Dist_Hospital
## Min. : 1.0 Min. : 146 Min. : 1666 Min. : 3227
## 1st Qu.:237.0 1st Qu.: 6477 1st Qu.: 9367 1st Qu.:11302
## Median :469.0 Median : 8228 Median :11149 Median :13189
## Mean :468.4 Mean : 8235 Mean :11022 Mean :13091
## 3rd Qu.:700.0 3rd Qu.: 9939 3rd Qu.:12675 3rd Qu.:14855
## Max. :932.0 Max. :20662 Max. :20945 Max. :23294
##
## Carpet Builtup Parking City_Category
## Min. : 775 Min. : 932 Length:905 Length:905
## 1st Qu.: 1317 1st Qu.: 1579 Class :character Class :character
## Median : 1478 Median : 1774 Mode :character Mode :character
## Mean : 1511 Mean : 1794
## 3rd Qu.: 1654 3rd Qu.: 1985
## Max. :24300 Max. :12730
## NA's :7
## Rainfall House_Price
## Min. :-110.0 Min. : 1492000
## 1st Qu.: 600.0 1st Qu.: 4623000
## Median : 780.0 Median : 5860000
## Mean : 786.9 Mean : 6083992
## 3rd Qu.: 970.0 3rd Qu.: 7200000
## Max. :1560.0 Max. :150000000
##
# Preview the first six rows (the default for head(), spelled out here).
head(houses, n = 6L)
## # A tibble: 6 × 10
## Observation Dist_Taxi Dist_Market Dist_Hospital Carpet Builtup Parking
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 9796 5250 10703 1659 1961 Open
## 2 2 8294 8186 12694 1461 1752 Not Provided
## 3 3 11001 14399 16991 1340 1609 Not Provided
## 4 4 8301 11188 12289 1451 1748 Covered
## 5 5 10510 12629 13921 1770 2111 Not Provided
## 6 6 6665 5142 9972 1442 1733 Open
## # ℹ 3 more variables: City_Category <chr>, Rainfall <dbl>, House_Price <dbl>
# Show the column types and the first few values of each column as read in.
str(object = houses)
## spc_tbl_ [905 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Observation : num [1:905] 1 2 3 4 5 6 7 8 9 10 ...
## $ Dist_Taxi : num [1:905] 9796 8294 11001 8301 10510 ...
## $ Dist_Market : num [1:905] 5250 8186 14399 11188 12629 ...
## $ Dist_Hospital: num [1:905] 10703 12694 16991 12289 13921 ...
## $ Carpet : num [1:905] 1659 1461 1340 1451 1770 ...
## $ Builtup : num [1:905] 1961 1752 1609 1748 2111 ...
## $ Parking : chr [1:905] "Open" "Not Provided" "Not Provided" "Covered" ...
## $ City_Category: chr [1:905] "CAT B" "CAT B" "CAT A" "CAT B" ...
## $ Rainfall : num [1:905] 530 210 720 620 450 760 1030 1020 680 1130 ...
## $ House_Price : num [1:905] 6649000 3982000 5401000 5373000 4662000 ...
## - attr(*, "spec")=
## .. cols(
## .. Observation = col_double(),
## .. Dist_Taxi = col_double(),
## .. Dist_Market = col_double(),
## .. Dist_Hospital = col_double(),
## .. Carpet = col_double(),
## .. Builtup = col_double(),
## .. Parking = col_character(),
## .. City_Category = col_character(),
## .. Rainfall = col_double(),
## .. House_Price = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Drop row 348 before modelling.
# NOTE(review): presumably this row is one of the extreme values visible in
# the summary (e.g. the House_Price maximum of 150000000) -- confirm against
# the data; the index is hard-coded and depends on the row order of the CSV.
houses <- houses[-348, ]

# Encode both categorical columns as factors so the model fitted later treats
# them as categorical predictors. Each column is converted from its OWN
# values: the previous code assigned the City_Category factor to Parking,
# which destroyed the parking information and left City_Category as character.
houses$Parking <- as.factor(houses$Parking)
houses$City_Category <- as.factor(houses$City_Category)

# Inspect the structure to confirm both columns are now factors. Any str()
# output recorded before this fix (Parking showing "CAT ..." levels) is stale.
str(houses)
## tibble [904 × 10] (S3: tbl_df/tbl/data.frame)
## $ Observation : num [1:904] 1 2 3 4 5 6 7 8 9 10 ...
## $ Dist_Taxi : num [1:904] 9796 8294 11001 8301 10510 ...
## $ Dist_Market : num [1:904] 5250 8186 14399 11188 12629 ...
## $ Dist_Hospital: num [1:904] 10703 12694 16991 12289 13921 ...
## $ Carpet : num [1:904] 1659 1461 1340 1451 1770 ...
## $ Builtup : num [1:904] 1961 1752 1609 1748 2111 ...
## $ Parking : Factor w/ 3 levels "CAT A","CAT B",..: 2 2 1 2 2 2 1 3 2 3 ...
## $ City_Category: chr [1:904] "CAT B" "CAT B" "CAT A" "CAT B" ...
## $ Rainfall : num [1:904] 530 210 720 620 450 760 1030 1020 680 1130 ...
## $ House_Price : num [1:904] 6649000 3982000 5401000 5373000 4662000 ...
# Fit a regression tree for House_Price on every other column except
# Observation. Observation is a row identifier (1, 2, 3, ...), not a property
# of the house, so it is excluded to keep the tree from splitting on row order.
arbol_houses <- rpart(House_Price ~ . - Observation, data = houses)

# Discourage scientific notation so large prices are printed in full.
options(scipen = 999)

# Draw the fitted tree.
rpart.plot(arbol_houses)
El modelo de árbol de regresión segmenta las observaciones en grupos según su precio promedio, con base en las variables Parking y Dist_Taxi.