Pemodelan Regresi (Regression Tree)

Library

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# For decision tree model
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# For data visualization
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
# Contains the data
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded

Data

dt <- read.csv("C:/Users/user/Downloads/Pakistan house price dataset.csv")
head(dt)
##   property_id location_id
## 1      237062        3325
## 2      346905        3236
## 3      386513         764
## 4      656161         340
## 5      841645        3226
## 6      850762        3390
##                                                                                                                                page_url
## 1                 https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html
## 2                                    https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html
## 3                                          https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html
## 4                   https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html
## 5                    https://www.zameen.com/Property/dha_valley_dha_homes_islamabad_dha_valley_8_marla_home_for_sale-841645-3226-1.html
## 6 https://www.zameen.com/Property/ghauri_town_ghauri_town_phase_1_house_is_available_for_sale_in_ghauri_town_phase_1-850762-3390-1.html
##   property_type    price    location      city     province_name latitude
## 1          Flat 10000000        G-10 Islamabad Islamabad Capital 33.67989
## 2          Flat  6900000        E-11 Islamabad Islamabad Capital 33.70099
## 3         House 16500000        G-15 Islamabad Islamabad Capital 33.63149
## 4         House 43500000   Bani Gala Islamabad Islamabad Capital 33.70757
## 5         House  7000000 DHA Defence Islamabad Islamabad Capital 33.49259
## 6         House 34500000 Ghauri Town Islamabad Islamabad Capital 33.62395
##   longitude baths      area  purpose bedrooms date_added        agency
## 1  73.01264     2   4 Marla For Sale        2   2/4/2019              
## 2  72.97149     3 5.6 Marla For Sale        3   5/4/2019              
## 3  72.92656     6   8 Marla For Sale        5  7/17/2019              
## 4  73.15120     4   2 Kanal For Sale        4   4/5/2019              
## 5  73.30134     3   8 Marla For Sale        3  7/10/2019 Easy Property
## 6  73.12659     8 1.6 Kanal For Sale        8   4/5/2019              
##                                          agent Area.Type Area.Size
## 1                                                  Marla       4.0
## 2                                                  Marla       5.6
## 3                                                  Marla       8.0
## 4                                                  Kanal       2.0
## 5 Muhammad Junaid Ceo Muhammad Shahid Director     Marla       8.0
## 6                                                  Kanal       1.6
##   Area.Category
## 1     0-5 Marla
## 2    5-10 Marla
## 3    5-10 Marla
## 4     1-5 Kanal
## 5    5-10 Marla
## 6     1-5 Kanal
str(dt)
## 'data.frame':    168446 obs. of  20 variables:
##  $ property_id  : int  237062 346905 386513 656161 841645 850762 937975 1258636 1402466 1418706 ...
##  $ location_id  : int  3325 3236 764 340 3226 3390 445 3241 376 3282 ...
##  $ page_url     : chr  "https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html" "https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html" "https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html" "https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html" ...
##  $ property_type: chr  "Flat" "Flat" "House" "House" ...
##  $ price        : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr  "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr  "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr  "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num  33.7 33.7 33.6 33.7 33.5 ...
##  $ longitude    : num  73 73 72.9 73.2 73.3 ...
##  $ baths        : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ area         : chr  "4 Marla" "5.6 Marla" "8 Marla" "2 Kanal" ...
##  $ purpose      : chr  "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : int  2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : chr  "2/4/2019" "5/4/2019" "7/17/2019" "4/5/2019" ...
##  $ agency       : chr  "" "" "" "" ...
##  $ agent        : chr  "" "" "" "" ...
##  $ Area.Type    : chr  "Marla" "Marla" "Marla" "Kanal" ...
##  $ Area.Size    : num  4 5.6 8 2 8 1.6 1 6.2 1 1 ...
##  $ Area.Category: chr  "0-5 Marla" "5-10 Marla" "5-10 Marla" "1-5 Kanal" ...
dt <- dt[,-c(1,2,3,6,7,8,9,10,12,13,15,16,17,18,20)]
head(dt)
##   property_type    price baths bedrooms Area.Size
## 1          Flat 10000000     2        2       4.0
## 2          Flat  6900000     3        3       5.6
## 3         House 16500000     6        5       8.0
## 4         House 43500000     4        4       2.0
## 5         House  7000000     3        3       8.0
## 6         House 34500000     8        8       1.6
str(dt)
## 'data.frame':    168446 obs. of  5 variables:
##  $ property_type: chr  "Flat" "Flat" "House" "House" ...
##  $ price        : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ baths        : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ bedrooms     : int  2 3 5 4 3 8 8 2 7 5 ...
##  $ Area.Size    : num  4 5.6 8 2 8 1.6 1 6.2 1 1 ...

Eksplorasi Data

boxplot(dt$price,
        ylab = "Harga Rumah di Pakistan",
        main = "Boxplot Harga Rumah di Pakistan", col="light blue")
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow

boxplot(dt$baths,
        ylab = "Harga Rumah di Pakistan",
        main = "Boxplot Harga Rumah di Pakistan", col="light blue")

boxplot(dt$bedrooms,
        ylab = "Harga Rumah di Pakistan",
        main = "Boxplot Harga Rumah di Pakistan", col="light blue")

boxplot(dt$Area.Size,
        ylab = "Harga Rumah di Pakistan",
        main = "Boxplot Harga Rumah di Pakistan", col="light blue")

data_cor <- dt[,-c(1)]
corr <- cor(data_cor)
corrplot(corr, method = 'number', type = 'lower', tl.col = 'blue', tl.srt = 90)

sum(is.na(dt))
## [1] 0
dt <- dt[!duplicated(dt),]

Preprocessing

train <- createDataPartition((dt$price), p=0.7, list=FALSE)
#train = sample(1:nrow(data), 200)
data.train=dt[train,]
data.test=dt[-train,]

Model Awal

fit.tree = rpart(price ~ property_type + Area.Size + bedrooms + baths, data = data.train, method = 'anova')
fit.tree
## n= 41765 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 41765 8.518713e+19 20864400  
##    2) bedrooms< 4.5 30078 2.545222e+19 12423690  
##      4) property_type=Flat,Lower Portion,Room,Upper Portion 16677 2.073453e+18  6781891 *
##      5) property_type=Farm House,House,Penthouse 13401 2.218735e+19 19444680 *
##    3) bedrooms>=4.5 11687 5.207689e+19 42587660  
##      6) Area.Size>=4.05 6434 1.768096e+19 28166480 *
##      7) Area.Size< 4.05 5253 3.141893e+19 60251070  
##       14) Area.Size< 1.25 2600 5.190260e+18 46552860 *
##       15) Area.Size>=1.25 2653 2.526269e+19 73675620 *

Visualisasi Pohon model awal

rpart.plot(fit.tree)

Variable Importance

fit.tree$variable.importance
##      bedrooms         baths     Area.Size property_type 
##  8.047735e+18  4.774375e+18  4.357507e+18  1.202194e+18
fit.tree$variable.importance %>% 
   data.frame() %>%
   rownames_to_column(var = "Feature") %>%
   rename(Overall = '.') %>%
   ggplot(aes(x = fct_reorder(Feature, Overall), y = Overall)) +
   geom_pointrange(aes(ymin = 0, ymax = Overall), color = "cadetblue", size = .3) +
   theme_minimal() +
   coord_flip() +
   labs(x = "", y = "", title = "Variable Importance with Simple Classication")

Prediksi

pred.tree_train = predict(fit.tree, data.train)
pred.tree_test = predict(fit.tree, data.test)

Akurasi dan Evaluasi

mse_train <- mse(pred.tree_train, data.train$price)
rmse_train <- rmse(pred.tree_train, data.train$price)
mape_train <- mape(pred.tree_train, data.train$price)
mae_train <- mae(pred.tree_train, data.train$price)

mse_test <- mse(pred.tree_test, data.test$price)
rmse_test <- rmse(pred.tree_test, data.test$price)
mape_test <- mape(pred.tree_test, data.test$price)
mae_test <- mae(pred.tree_test, data.test$price)
df_eval <- data.frame("MSE" = c(mse_train, mse_test),
          "RMSE" = c(rmse_train,rmse_test),
          "MAPE" = c(mape_train,mape_test),
          "MAE" = c(mae_train, mae_test))
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
##                     MSE     RMSE MAPE      MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test  2.225276e+15 47172833 1.03 19833194

Optimisasi

printcp(fit.tree)
## 
## Regression tree:
## rpart(formula = price ~ property_type + Area.Size + bedrooms + 
##     baths, data = data.train, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Area.Size     bedrooms      property_type
## 
## Root node error: 8.5187e+19/41765 = 2.0397e+15
## 
## n= 41765 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.089896      0   1.00000 1.00003 0.065314
## 2 0.034947      1   0.91010 0.91021 0.063419
## 3 0.013986      2   0.87516 0.87549 0.063819
## 4 0.011340      3   0.86117 0.86151 0.063575
## 5 0.010000      4   0.84983 0.85275 0.063295

Pemodelan kedua

fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
## [1] 0.01
plotcp(fit.tree, upper = "splits")

bestcp <-fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(fit.tree)

pred.prune_train = predict(pruned.tree, data.train)
pred.prune_test = predict(pruned.tree, data.test)
mse_prune_train <- mse(pred.prune_train, data.train$price)
rmse_prune_train <- rmse(pred.prune_train, data.train$price)
mape_prune_train <- mape(pred.prune_train, data.train$price)
mae_prune_train <- mae(pred.prune_train, data.train$price)

mse_prune_test <- mse(pred.prune_test, data.test$price)
rmse_prune_test <- rmse(pred.prune_test, data.test$price)
mape_prune_test <- mape(pred.prune_test, data.test$price)
mae_prune_test <- mae(pred.prune_test, data.test$price)
df_prune_eval <- data.frame("MSE" = c(mse_prune_train, mse_prune_test),
          "RMSE" = c(rmse_prune_train,rmse_prune_test),
          "MAPE" = c(mape_prune_train,mape_prune_test),
          "MAE" = c(mae_prune_train, mae_prune_test))
rownames(df_prune_eval) <- c("Data Train", "Data Test")
round(df_prune_eval, 2)
##                     MSE     RMSE MAPE      MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test  2.225276e+15 47172833 1.03 19833194

Hyperparameter tuning

tuneGrid <- expand.grid(cp = seq(0.01, 0.5, by = 0.01))
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
tune <- train(price ~ property_type + Area.Size + bedrooms + baths, data = data.train, method = "rpart", trControl = ctrl, tuneGrid = tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
tune
## CART 
## 
## 41765 samples
##     4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 37587, 37589, 37589, 37589, 37588, 37589, ... 
## Resampling results across tuning parameters:
## 
##   cp    RMSE      Rsquared    MAE     
##   0.01  41492736  0.14997454  19100208
##   0.02  41967021  0.13013859  19505662
##   0.03  41967021  0.13013859  19505662
##   0.04  42823410  0.09312775  20332540
##   0.05  42823410  0.09312775  20332540
##   0.06  42823410  0.09312775  20332540
##   0.07  42823410  0.09312775  20332540
##   0.08  42823410  0.09312775  20332540
##   0.09  44399219  0.07202789  21987290
##   0.10  44927593         NaN  22645922
##   0.11  44927593         NaN  22645922
##   0.12  44927593         NaN  22645922
##   0.13  44927593         NaN  22645922
##   0.14  44927593         NaN  22645922
##   0.15  44927593         NaN  22645922
##   0.16  44927593         NaN  22645922
##   0.17  44927593         NaN  22645922
##   0.18  44927593         NaN  22645922
##   0.19  44927593         NaN  22645922
##   0.20  44927593         NaN  22645922
##   0.21  44927593         NaN  22645922
##   0.22  44927593         NaN  22645922
##   0.23  44927593         NaN  22645922
##   0.24  44927593         NaN  22645922
##   0.25  44927593         NaN  22645922
##   0.26  44927593         NaN  22645922
##   0.27  44927593         NaN  22645922
##   0.28  44927593         NaN  22645922
##   0.29  44927593         NaN  22645922
##   0.30  44927593         NaN  22645922
##   0.31  44927593         NaN  22645922
##   0.32  44927593         NaN  22645922
##   0.33  44927593         NaN  22645922
##   0.34  44927593         NaN  22645922
##   0.35  44927593         NaN  22645922
##   0.36  44927593         NaN  22645922
##   0.37  44927593         NaN  22645922
##   0.38  44927593         NaN  22645922
##   0.39  44927593         NaN  22645922
##   0.40  44927593         NaN  22645922
##   0.41  44927593         NaN  22645922
##   0.42  44927593         NaN  22645922
##   0.43  44927593         NaN  22645922
##   0.44  44927593         NaN  22645922
##   0.45  44927593         NaN  22645922
##   0.46  44927593         NaN  22645922
##   0.47  44927593         NaN  22645922
##   0.48  44927593         NaN  22645922
##   0.49  44927593         NaN  22645922
##   0.50  44927593         NaN  22645922
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
plot(tune)

bestcp <- tune$bestTune$cp
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(pruned.tree)

pred.tune_train = predict(pruned.tree, data.train)
pred.tune_test = predict(pruned.tree, data.test)
mse_tune_train <- mse(pred.tune_train, data.train$price)
rmse_tune_train <- rmse(pred.tune_train, data.train$price)
mape_tune_train <- mape(pred.tune_train, data.train$price)
mae_tune_train <- mae(pred.tune_train, data.train$price)

Overfitting

pred.tune_train = predict(pruned.tree, data.train)
pred.tune_test = predict(pruned.tree, data.test)
mse_tune_train <- mse(pred.tune_train, data.train$price)
rmse_tune_train <- rmse(pred.tune_train, data.train$price)
mape_tune_train <- mape(pred.tune_train, data.train$price)
mae_tune_train <- mae(pred.tune_train, data.train$price)
mse_tune_test <- mse(pred.tune_test, data.test$price)
rmse_tune_test <- rmse(pred.tune_test, data.test$price)
mape_tune_test <- mape(pred.tune_test, data.test$price)
mae_tune_test <- mae(pred.tune_test, data.test$price)
df_tune_eval <- data.frame("MSE" = c(mse_tune_train, mse_tune_test),
          "RMSE" = c(rmse_tune_train,rmse_tune_test),
          "MAPE" = c(mape_tune_train,mape_tune_test),
          "MAE" = c(mae_tune_train, mae_tune_test))
rownames(df_tune_eval) <- c("Data Train", "Data Test")
round(df_tune_eval, 2)
##                     MSE     RMSE MAPE      MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test  2.225276e+15 47172833 1.03 19833194