Pemodelan Regresi (Regression Tree)
Library
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# For decision tree model
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# For data visualization
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
# Contains the data
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
Data
dt <- read.csv("C:/Users/user/Downloads/Pakistan house price dataset.csv")
head(dt)
## property_id location_id
## 1 237062 3325
## 2 346905 3236
## 3 386513 764
## 4 656161 340
## 5 841645 3226
## 6 850762 3390
## page_url
## 1 https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html
## 2 https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html
## 3 https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html
## 4 https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html
## 5 https://www.zameen.com/Property/dha_valley_dha_homes_islamabad_dha_valley_8_marla_home_for_sale-841645-3226-1.html
## 6 https://www.zameen.com/Property/ghauri_town_ghauri_town_phase_1_house_is_available_for_sale_in_ghauri_town_phase_1-850762-3390-1.html
## property_type price location city province_name latitude
## 1 Flat 10000000 G-10 Islamabad Islamabad Capital 33.67989
## 2 Flat 6900000 E-11 Islamabad Islamabad Capital 33.70099
## 3 House 16500000 G-15 Islamabad Islamabad Capital 33.63149
## 4 House 43500000 Bani Gala Islamabad Islamabad Capital 33.70757
## 5 House 7000000 DHA Defence Islamabad Islamabad Capital 33.49259
## 6 House 34500000 Ghauri Town Islamabad Islamabad Capital 33.62395
## longitude baths area purpose bedrooms date_added agency
## 1 73.01264 2 4 Marla For Sale 2 2/4/2019
## 2 72.97149 3 5.6 Marla For Sale 3 5/4/2019
## 3 72.92656 6 8 Marla For Sale 5 7/17/2019
## 4 73.15120 4 2 Kanal For Sale 4 4/5/2019
## 5 73.30134 3 8 Marla For Sale 3 7/10/2019 Easy Property
## 6 73.12659 8 1.6 Kanal For Sale 8 4/5/2019
## agent Area.Type Area.Size
## 1 Marla 4.0
## 2 Marla 5.6
## 3 Marla 8.0
## 4 Kanal 2.0
## 5 Muhammad Junaid Ceo Muhammad Shahid Director Marla 8.0
## 6 Kanal 1.6
## Area.Category
## 1 0-5 Marla
## 2 5-10 Marla
## 3 5-10 Marla
## 4 1-5 Kanal
## 5 5-10 Marla
## 6 1-5 Kanal
str(dt)
## 'data.frame': 168446 obs. of 20 variables:
## $ property_id : int 237062 346905 386513 656161 841645 850762 937975 1258636 1402466 1418706 ...
## $ location_id : int 3325 3236 764 340 3226 3390 445 3241 376 3282 ...
## $ page_url : chr "https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html" "https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html" "https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html" "https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html" ...
## $ property_type: chr "Flat" "Flat" "House" "House" ...
## $ price : int 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : num 33.7 33.7 33.6 33.7 33.5 ...
## $ longitude : num 73 73 72.9 73.2 73.3 ...
## $ baths : int 2 3 6 4 3 8 8 2 7 5 ...
## $ area : chr "4 Marla" "5.6 Marla" "8 Marla" "2 Kanal" ...
## $ purpose : chr "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : int 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : chr "2/4/2019" "5/4/2019" "7/17/2019" "4/5/2019" ...
## $ agency : chr "" "" "" "" ...
## $ agent : chr "" "" "" "" ...
## $ Area.Type : chr "Marla" "Marla" "Marla" "Kanal" ...
## $ Area.Size : num 4 5.6 8 2 8 1.6 1 6.2 1 1 ...
## $ Area.Category: chr "0-5 Marla" "5-10 Marla" "5-10 Marla" "1-5 Kanal" ...
dt <- dt[,-c(1,2,3,6,7,8,9,10,12,13,15,16,17,18,20)]
head(dt)
## property_type price baths bedrooms Area.Size
## 1 Flat 10000000 2 2 4.0
## 2 Flat 6900000 3 3 5.6
## 3 House 16500000 6 5 8.0
## 4 House 43500000 4 4 2.0
## 5 House 7000000 3 3 8.0
## 6 House 34500000 8 8 1.6
str(dt)
## 'data.frame': 168446 obs. of 5 variables:
## $ property_type: chr "Flat" "Flat" "House" "House" ...
## $ price : int 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ baths : int 2 3 6 4 3 8 8 2 7 5 ...
## $ bedrooms : int 2 3 5 4 3 8 8 2 7 5 ...
## $ Area.Size : num 4 5.6 8 2 8 1.6 1 6.2 1 1 ...
Eksplorasi Data
boxplot(dt$price,
ylab = "Harga Rumah di Pakistan",
main = "Boxplot Harga Rumah di Pakistan", col="light blue")
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow

boxplot(dt$baths,
ylab = "Harga Rumah di Pakistan",
main = "Boxplot Harga Rumah di Pakistan", col="light blue")

boxplot(dt$bedrooms,
ylab = "Harga Rumah di Pakistan",
main = "Boxplot Harga Rumah di Pakistan", col="light blue")

boxplot(dt$Area.Size,
ylab = "Harga Rumah di Pakistan",
main = "Boxplot Harga Rumah di Pakistan", col="light blue")

data_cor <- dt[,-c(1)]
corr <- cor(data_cor)
corrplot(corr, method = 'number', type = 'lower', tl.col = 'blue', tl.srt = 90)

sum(is.na(dt))
## [1] 0
dt <- dt[!duplicated(dt),]
Preprocessing
train <- createDataPartition((dt$price), p=0.7, list=FALSE)
#train = sample(1:nrow(data), 200)
data.train=dt[train,]
data.test=dt[-train,]
Model Awal
fit.tree = rpart(price ~ property_type + Area.Size + bedrooms + baths, data = data.train, method = 'anova')
fit.tree
## n= 41765
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 41765 8.518713e+19 20864400
## 2) bedrooms< 4.5 30078 2.545222e+19 12423690
## 4) property_type=Flat,Lower Portion,Room,Upper Portion 16677 2.073453e+18 6781891 *
## 5) property_type=Farm House,House,Penthouse 13401 2.218735e+19 19444680 *
## 3) bedrooms>=4.5 11687 5.207689e+19 42587660
## 6) Area.Size>=4.05 6434 1.768096e+19 28166480 *
## 7) Area.Size< 4.05 5253 3.141893e+19 60251070
## 14) Area.Size< 1.25 2600 5.190260e+18 46552860 *
## 15) Area.Size>=1.25 2653 2.526269e+19 73675620 *
Visualisasi Pohon model awal
rpart.plot(fit.tree)

Variable Importance
fit.tree$variable.importance
## bedrooms baths Area.Size property_type
## 8.047735e+18 4.774375e+18 4.357507e+18 1.202194e+18
fit.tree$variable.importance %>%
data.frame() %>%
rownames_to_column(var = "Feature") %>%
rename(Overall = '.') %>%
ggplot(aes(x = fct_reorder(Feature, Overall), y = Overall)) +
geom_pointrange(aes(ymin = 0, ymax = Overall), color = "cadetblue", size = .3) +
theme_minimal() +
coord_flip() +
labs(x = "", y = "", title = "Variable Importance with Simple Classication")

Prediksi
pred.tree_train = predict(fit.tree, data.train)
pred.tree_test = predict(fit.tree, data.test)
Akurasi dan Evaluasi
mse_train <- mse(pred.tree_train, data.train$price)
rmse_train <- rmse(pred.tree_train, data.train$price)
mape_train <- mape(pred.tree_train, data.train$price)
mae_train <- mae(pred.tree_train, data.train$price)
mse_test <- mse(pred.tree_test, data.test$price)
rmse_test <- rmse(pred.tree_test, data.test$price)
mape_test <- mape(pred.tree_test, data.test$price)
mae_test <- mae(pred.tree_test, data.test$price)
df_eval <- data.frame("MSE" = c(mse_train, mse_test),
"RMSE" = c(rmse_train,rmse_test),
"MAPE" = c(mape_train,mape_test),
"MAE" = c(mae_train, mae_test))
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test 2.225276e+15 47172833 1.03 19833194
Optimisasi
printcp(fit.tree)
##
## Regression tree:
## rpart(formula = price ~ property_type + Area.Size + bedrooms +
## baths, data = data.train, method = "anova")
##
## Variables actually used in tree construction:
## [1] Area.Size bedrooms property_type
##
## Root node error: 8.5187e+19/41765 = 2.0397e+15
##
## n= 41765
##
## CP nsplit rel error xerror xstd
## 1 0.089896 0 1.00000 1.00003 0.065314
## 2 0.034947 1 0.91010 0.91021 0.063419
## 3 0.013986 2 0.87516 0.87549 0.063819
## 4 0.011340 3 0.86117 0.86151 0.063575
## 5 0.010000 4 0.84983 0.85275 0.063295
Pemodelan kedua
fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
## [1] 0.01
plotcp(fit.tree, upper = "splits")

bestcp <-fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(fit.tree)

pred.prune_train = predict(pruned.tree, data.train)
pred.prune_test = predict(pruned.tree, data.test)
mse_prune_train <- mse(pred.prune_train, data.train$price)
rmse_prune_train <- rmse(pred.prune_train, data.train$price)
mape_prune_train <- mape(pred.prune_train, data.train$price)
mae_prune_train <- mae(pred.prune_train, data.train$price)
mse_prune_test <- mse(pred.prune_test, data.test$price)
rmse_prune_test <- rmse(pred.prune_test, data.test$price)
mape_prune_test <- mape(pred.prune_test, data.test$price)
mae_prune_test <- mae(pred.prune_test, data.test$price)
df_prune_eval <- data.frame("MSE" = c(mse_prune_train, mse_prune_test),
"RMSE" = c(rmse_prune_train,rmse_prune_test),
"MAPE" = c(mape_prune_train,mape_prune_test),
"MAE" = c(mae_prune_train, mae_prune_test))
rownames(df_prune_eval) <- c("Data Train", "Data Test")
round(df_prune_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test 2.225276e+15 47172833 1.03 19833194
Hyperparameter tuning
tuneGrid <- expand.grid(cp = seq(0.01, 0.5, by = 0.01))
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
tune <- train(price ~ property_type + Area.Size + bedrooms + baths, data = data.train, method = "rpart", trControl = ctrl, tuneGrid = tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
tune
## CART
##
## 41765 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 37587, 37589, 37589, 37589, 37588, 37589, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01 41492736 0.14997454 19100208
## 0.02 41967021 0.13013859 19505662
## 0.03 41967021 0.13013859 19505662
## 0.04 42823410 0.09312775 20332540
## 0.05 42823410 0.09312775 20332540
## 0.06 42823410 0.09312775 20332540
## 0.07 42823410 0.09312775 20332540
## 0.08 42823410 0.09312775 20332540
## 0.09 44399219 0.07202789 21987290
## 0.10 44927593 NaN 22645922
## 0.11 44927593 NaN 22645922
## 0.12 44927593 NaN 22645922
## 0.13 44927593 NaN 22645922
## 0.14 44927593 NaN 22645922
## 0.15 44927593 NaN 22645922
## 0.16 44927593 NaN 22645922
## 0.17 44927593 NaN 22645922
## 0.18 44927593 NaN 22645922
## 0.19 44927593 NaN 22645922
## 0.20 44927593 NaN 22645922
## 0.21 44927593 NaN 22645922
## 0.22 44927593 NaN 22645922
## 0.23 44927593 NaN 22645922
## 0.24 44927593 NaN 22645922
## 0.25 44927593 NaN 22645922
## 0.26 44927593 NaN 22645922
## 0.27 44927593 NaN 22645922
## 0.28 44927593 NaN 22645922
## 0.29 44927593 NaN 22645922
## 0.30 44927593 NaN 22645922
## 0.31 44927593 NaN 22645922
## 0.32 44927593 NaN 22645922
## 0.33 44927593 NaN 22645922
## 0.34 44927593 NaN 22645922
## 0.35 44927593 NaN 22645922
## 0.36 44927593 NaN 22645922
## 0.37 44927593 NaN 22645922
## 0.38 44927593 NaN 22645922
## 0.39 44927593 NaN 22645922
## 0.40 44927593 NaN 22645922
## 0.41 44927593 NaN 22645922
## 0.42 44927593 NaN 22645922
## 0.43 44927593 NaN 22645922
## 0.44 44927593 NaN 22645922
## 0.45 44927593 NaN 22645922
## 0.46 44927593 NaN 22645922
## 0.47 44927593 NaN 22645922
## 0.48 44927593 NaN 22645922
## 0.49 44927593 NaN 22645922
## 0.50 44927593 NaN 22645922
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
plot(tune)

bestcp <- tune$bestTune$cp
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(pruned.tree)

pred.tune_train = predict(pruned.tree, data.train)
pred.tune_test = predict(pruned.tree, data.test)
mse_tune_train <- mse(pred.tune_train, data.train$price)
rmse_tune_train <- rmse(pred.tune_train, data.train$price)
mape_tune_train <- mape(pred.tune_train, data.train$price)
mae_tune_train <- mae(pred.tune_train, data.train$price)
Overfitting
pred.tune_train = predict(pruned.tree, data.train)
pred.tune_test = predict(pruned.tree, data.test)
mse_tune_train <- mse(pred.tune_train, data.train$price)
rmse_tune_train <- rmse(pred.tune_train, data.train$price)
mape_tune_train <- mape(pred.tune_train, data.train$price)
mae_tune_train <- mae(pred.tune_train, data.train$price)
mse_tune_test <- mse(pred.tune_test, data.test$price)
rmse_tune_test <- rmse(pred.tune_test, data.test$price)
mape_tune_test <- mape(pred.tune_test, data.test$price)
mae_tune_test <- mae(pred.tune_test, data.test$price)
df_tune_eval <- data.frame("MSE" = c(mse_tune_train, mse_tune_test),
"RMSE" = c(rmse_tune_train,rmse_tune_test),
"MAPE" = c(mape_tune_train,mape_tune_test),
"MAE" = c(mae_tune_train, mae_tune_test))
rownames(df_tune_eval) <- c("Data Train", "Data Test")
round(df_tune_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.733382e+15 41633907 0.97 19051739
## Data Test 2.225276e+15 47172833 1.03 19833194