LATIHAN TPM PRAK 4

Library

library(dplyr)
library(tidyverse)
# For decision tree model
library(rpart)
library(caret)
# For data visualization
library(rpart.plot)
library(ROCR)
# Contains the data
library(ISLR)
library(Metrics)

Input Data

# Load the house price listings from the local CSV export.
data <- read.csv(
  file = "C:\\Users\\Ghonniyu\\Documents\\Semester 6\\TPM\\Pakistan house price dataset.csv"
)

# Inspect the column names and types of the loaded data frame.
str(object = data)
## 'data.frame':    168446 obs. of  20 variables:
##  $ property_id  : int  237062 346905 386513 656161 841645 850762 937975 1258636 1402466 1418706 ...
##  $ location_id  : int  3325 3236 764 340 3226 3390 445 3241 376 3282 ...
##  $ page_url     : chr  "https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html" "https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html" "https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html" "https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html" ...
##  $ property_type: chr  "Flat" "Flat" "House" "House" ...
##  $ price        : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr  "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr  "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr  "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num  33.7 33.7 33.6 33.7 33.5 ...
##  $ longitude    : num  73 73 72.9 73.2 73.3 ...
##  $ baths        : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ area         : chr  "4 Marla" "5.6 Marla" "8 Marla" "2 Kanal" ...
##  $ purpose      : chr  "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : int  2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : chr  "2/4/2019" "5/4/2019" "7/17/2019" "4/5/2019" ...
##  $ agency       : chr  "" "" "" "" ...
##  $ agent        : chr  "" "" "" "" ...
##  $ Area.Type    : chr  "Marla" "Marla" "Marla" "Kanal" ...
##  $ Area.Size    : num  4 5.6 8 2 8 1.6 1 6.2 1 1 ...
##  $ Area.Category: chr  "0-5 Marla" "5-10 Marla" "5-10 Marla" "1-5 Kanal" ...

Penentuan Peubah

# Keep only the response (price) and the five columns used as predictors.
data <- data[c(
  "price", "purpose", "Area.Type", "Area.Size", "baths", "bedrooms"
)]

# Confirm the structure of the reduced data frame.
str(object = data)
## 'data.frame':    168446 obs. of  6 variables:
##  $ price    : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ purpose  : chr  "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ Area.Type: chr  "Marla" "Marla" "Marla" "Kanal" ...
##  $ Area.Size: num  4 5.6 8 2 8 1.6 1 6.2 1 1 ...
##  $ baths    : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ bedrooms : int  2 3 5 4 3 8 8 2 7 5 ...
# Preview the first rows.
head(x = data)
##      price  purpose Area.Type Area.Size baths bedrooms
## 1 10000000 For Sale     Marla       4.0     2        2
## 2  6900000 For Sale     Marla       5.6     3        3
## 3 16500000 For Sale     Marla       8.0     6        5
## 4 43500000 For Sale     Kanal       2.0     4        4
## 5  7000000 For Sale     Marla       8.0     3        3
## 6 34500000 For Sale     Kanal       1.6     8        8

Preprocessing

# Fix the RNG state so the train/test split is reproducible.
set.seed(12)

# Row indices for the training set: 70% of the rows, partitioned on price.
train <- createDataPartition(y = data$price, p = 0.7, list = FALSE)

# Selected rows form the training set; all remaining rows form the test set.
price.train <- data[train, ]
price.test <- data[-train, ]

Pembuatan Model Awal

# Fit the initial regression tree (method = "anova"): price on every other
# column of the training set.
fit.tree <- rpart(formula = price ~ ., data = price.train, method = "anova")

# Show the fitted splits and per-node summaries.
print(fit.tree)
## n= 117913 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 117913 1.468941e+20  17761760.00  
##    2) Area.Type=Marla 96597 1.768520e+19  11220710.00  
##      4) purpose=For Rent 24958 2.013903e+14     49009.18 *
##      5) purpose=For Sale 71639 1.348488e+19  15112770.00  
##       10) Area.Size< 8.55 52435 2.980604e+18  10357750.00 *
##       11) Area.Size>=8.55 19204 6.081618e+18  28095960.00 *
##    3) Area.Type=Kanal 21316 1.063468e+20  47403630.00  
##      6) purpose=For Rent 8427 5.904662e+14    225828.10 *
##      7) purpose=For Sale 12889 7.532675e+19  78249110.00  
##       14) Area.Size< 1.25 9549 9.913909e+18  56597470.00 *
##       15) Area.Size>=1.25 3340 4.813807e+19 140150700.00  
##         30) bedrooms< 5.5 1725 2.201988e+19 116193100.00 *
##         31) bedrooms>=5.5 1615 2.407056e+19 165740200.00  
##           62) Area.Size< 3.15 1412 9.012291e+18 145359400.00 *
##           63) Area.Size>=3.15 203 1.039215e+19 307502500.00 *

Visualisasi Pohon Regresi Awal

# Draw the initial (unpruned) tree.
rpart.plot(
  fit.tree,
  type = 2,
  extra = 101,
  under = TRUE,
  cex = 0.9
)

Variable Importance

# Variable importance scores stored on the fitted tree.
fit.tree$variable.importance
##    Area.Size      purpose    Area.Type     bedrooms        baths 
## 4.458127e+19 3.521963e+19 2.286204e+19 1.317745e+19 9.860935e+18

# Plot the same scores as a horizontal point-range chart ordered by importance.
# enframe() turns the named vector straight into Feature/Overall columns,
# replacing the data.frame() + rownames_to_column() + rename('.') detour.
# The title is corrected: the model is a regression tree (method = "anova"),
# not a classification, and the original contained a typo.
fit.tree$variable.importance %>%
  enframe(name = "Feature", value = "Overall") %>%
  ggplot(aes(x = fct_reorder(Feature, Overall), y = Overall)) +
  geom_pointrange(aes(ymin = 0, ymax = Overall), color = "cadetblue", size = .3) +
  theme_minimal() +
  coord_flip() +
  labs(x = "", y = "", title = "Variable Importance with Simple Regression Tree")

Prediksi Regresi

# Predicted prices from the initial tree for the training and test rows.
pred.tree_train <- predict(fit.tree, newdata = price.train)
pred.tree_test <- predict(fit.tree, newdata = price.test)

Akurasi dan Evaluasi

# Evaluate the initial tree against the observed response (price).
# The original compared the price predictions with Area.Size, which is a
# predictor, so every reported error was just the magnitude of the predictions.
# Metrics functions take (actual, predicted) in that order; the order matters
# for MAPE, which divides by the actual value.
# NOTE(review): MAPE becomes Inf if any observed price is 0 -- check the data.
mse_train <- mse(price.train$price, pred.tree_train)
rmse_train <- rmse(price.train$price, pred.tree_train)
mape_train <- mape(price.train$price, pred.tree_train)
mae_train <- mae(price.train$price, pred.tree_train)

mse_test <- mse(price.test$price, pred.tree_test)
rmse_test <- rmse(price.test$price, pred.tree_test)
mape_test <- mape(price.test$price, pred.tree_test)
mae_test <- mae(price.test$price, pred.tree_test)

# One row per data set, one column per metric.
df_eval <- data.frame(
  "MSE" = c(mse_train, mse_test),
  "RMSE" = c(rmse_train, rmse_test),
  "MAPE" = c(mape_train, mape_test),
  "MAE" = c(mae_train, mae_test)
)
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
# NOTE: the output below was recorded before the fix (predictions compared
# against Area.Size) and no longer matches this code; re-run to refresh it.
##                     MSE     RMSE MAPE      MAE
## Data Train 1.049011e+15 32388438    1 17761754
## Data Test  1.062596e+15 32597488    1 17822957

COBA DENGAN PEUBAH LAIN

# Evaluate the initial tree against the observed response (price).
# The original compared the price predictions with baths. A model fitted to
# predict price can only be scored against price; scoring it against another
# column just reports the size of the predictions (hence MAPE = 1 and an MAE
# equal to the mean predicted price in the recorded output).
# Metrics functions take (actual, predicted) in that order; the order matters
# for MAPE, which divides by the actual value.
# NOTE(review): MAPE becomes Inf if any observed price is 0 -- check the data.
mse_train <- mse(price.train$price, pred.tree_train)
rmse_train <- rmse(price.train$price, pred.tree_train)
mape_train <- mape(price.train$price, pred.tree_train)
mae_train <- mae(price.train$price, pred.tree_train)

mse_test <- mse(price.test$price, pred.tree_test)
rmse_test <- rmse(price.test$price, pred.tree_test)
mape_test <- mape(price.test$price, pred.tree_test)
mae_test <- mae(price.test$price, pred.tree_test)

# One row per data set, one column per metric.
df_eval <- data.frame(
  "MSE" = c(mse_train, mse_test),
  "RMSE" = c(rmse_train, rmse_test),
  "MAPE" = c(mape_train, mape_test),
  "MAE" = c(mae_train, mae_test)
)
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
# NOTE: the output below was recorded before the fix (predictions compared
# against baths) and no longer matches this code; re-run to refresh it.
##                     MSE     RMSE MAPE      MAE
## Data Train 1.049011e+15 32388439    1 17761757
## Data Test  1.062596e+15 32597489    1 17822960

Optimisasi

# Print the complexity-parameter table of the initial tree
# (CP, number of splits, relative and cross-validated error).
printcp(x = fit.tree)
## 
## Regression tree:
## rpart(formula = price ~ ., data = price.train, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Area.Size Area.Type bedrooms  purpose  
## 
## Root node error: 1.4689e+20/117913 = 1.2458e+15
## 
## n= 117913 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.183403      0   1.00000 1.00004 0.039411
## 2 0.117600      2   0.63319 0.63338 0.033777
## 3 0.029350      3   0.51559 0.51612 0.029469
## 4 0.022852      5   0.45689 0.45742 0.029451
## 5 0.010000      7   0.41119 0.41399 0.026344

Pemodelan Kedua

# CP of the cptable row with the smallest cross-validated error (xerror),
# not simply the smallest CP. Computed once, then printed.
bestcp <- fit.tree$cptable[which.min(fit.tree$cptable[, "xerror"]), "CP"]
bestcp
## [1] 0.01

# Cross-validated error per CP, with the upper axis labelled by split count.
plotcp(fit.tree, upper = "splits")

# Prune the initial tree at the selected CP.
pruned.tree <- prune(tree = fit.tree, cp = bestcp)

# Plot the initial tree, then the pruned tree, for comparison.
rpart.plot(fit.tree)

rpart.plot(pruned.tree)

# Predict with the pruned tree and evaluate against the observed response
# (price). The original compared the price predictions with baths, which is a
# predictor, so the reported errors only reflected the size of the predictions.
# Metrics functions take (actual, predicted) in that order; the order matters
# for MAPE, which divides by the actual value.
# NOTE(review): MAPE becomes Inf if any observed price is 0 -- check the data.
pred.prune_train <- predict(pruned.tree, price.train)
pred.prune_test <- predict(pruned.tree, price.test)

mse_prune_train <- mse(price.train$price, pred.prune_train)
rmse_prune_train <- rmse(price.train$price, pred.prune_train)
mape_prune_train <- mape(price.train$price, pred.prune_train)
mae_prune_train <- mae(price.train$price, pred.prune_train)

mse_prune_test <- mse(price.test$price, pred.prune_test)
rmse_prune_test <- rmse(price.test$price, pred.prune_test)
mape_prune_test <- mape(price.test$price, pred.prune_test)
mae_prune_test <- mae(price.test$price, pred.prune_test)

# One row per data set, one column per metric.
df_prune_eval <- data.frame(
  "MSE" = c(mse_prune_train, mse_prune_test),
  "RMSE" = c(rmse_prune_train, rmse_prune_test),
  "MAPE" = c(mape_prune_train, mape_prune_test),
  "MAE" = c(mae_prune_train, mae_prune_test)
)
rownames(df_prune_eval) <- c("Data Train", "Data Test")
round(df_prune_eval, 2)
# NOTE: the output below was recorded before the fix (predictions compared
# against baths) and no longer matches this code; re-run to refresh it.
##                     MSE     RMSE MAPE      MAE
## Data Train 1.049011e+15 32388439    1 17761757
## Data Test  1.062596e+15 32597489    1 17822960

Model berpotensi untuk dikembangkan.

Feature Extraction; Feature Engineering; Feature Selection (Variable Importance atau RFE); penggunaan hyperparameter yang lebih banyak; penggunaan model yang lebih advanced (RF, XGBoost, dsb).

BONUS

# Fit one regression tree per listing purpose, so that rental prices and sale
# prices are modelled separately. Both trees use the full data set, filtered
# by purpose, with the same four predictors.
tree_rent <- rpart(price ~ baths + bedrooms + Area.Type + Area.Size,
                   data = subset(data, purpose == "For Rent"),
                   method = "anova")

tree_sale <- rpart(price ~ baths + bedrooms + Area.Type + Area.Size,
                   data = subset(data, purpose == "For Sale"),
                   method = "anova")

# Plot both trees side by side. par() returns the previous settings, which are
# restored afterwards so later plots are not forced into the 1x2 layout.
old_par <- par(mfrow = c(1, 2))
rpart.plot(tree_rent, main = "For Rent")
rpart.plot(tree_sale, main = "For Sale")
par(old_par)