LATIHAN TPM PRAK 4
Library
Input Data
# Load the Pakistan house price dataset from its CSV file into `data`.
data <- read.csv(
  file = "C:\\Users\\Ghonniyu\\Documents\\Semester 6\\TPM\\Pakistan house price dataset.csv"
)

# Print a compact overview of the columns and their types.
str(object = data)
## 'data.frame': 168446 obs. of 20 variables:
## $ property_id : int 237062 346905 386513 656161 841645 850762 937975 1258636 1402466 1418706 ...
## $ location_id : int 3325 3236 764 340 3226 3390 445 3241 376 3282 ...
## $ page_url : chr "https://www.zameen.com/Property/g_10_g_10_2_ground_floor_corner_apartment_with_green_lawn_for_sale-237062-3325-1.html" "https://www.zameen.com/Property/e_11_2_services_society_flat_available_for_sale-346905-3236-1.html" "https://www.zameen.com/Property/islamabad_g_15_house_is_available_for_sale-386513-764-1.html" "https://www.zameen.com/Property/islamabad_bani_gala_a_rare_minimalist_concept_in_a_quiet_location-656161-340-1.html" ...
## $ property_type: chr "Flat" "Flat" "House" "House" ...
## $ price : int 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : num 33.7 33.7 33.6 33.7 33.5 ...
## $ longitude : num 73 73 72.9 73.2 73.3 ...
## $ baths : int 2 3 6 4 3 8 8 2 7 5 ...
## $ area : chr "4 Marla" "5.6 Marla" "8 Marla" "2 Kanal" ...
## $ purpose : chr "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : int 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : chr "2/4/2019" "5/4/2019" "7/17/2019" "4/5/2019" ...
## $ agency : chr "" "" "" "" ...
## $ agent : chr "" "" "" "" ...
## $ Area.Type : chr "Marla" "Marla" "Marla" "Kanal" ...
## $ Area.Size : num 4 5.6 8 2 8 1.6 1 6.2 1 1 ...
## $ Area.Category: chr "0-5 Marla" "5-10 Marla" "5-10 Marla" "1-5 Kanal" ...
Penentuan Peubah
## 'data.frame': 168446 obs. of 6 variables:
## $ price : int 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ purpose : chr "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ Area.Type: chr "Marla" "Marla" "Marla" "Kanal" ...
## $ Area.Size: num 4 5.6 8 2 8 1.6 1 6.2 1 1 ...
## $ baths : int 2 3 6 4 3 8 8 2 7 5 ...
## $ bedrooms : int 2 3 5 4 3 8 8 2 7 5 ...
## price purpose Area.Type Area.Size baths bedrooms
## 1 10000000 For Sale Marla 4.0 2 2
## 2 6900000 For Sale Marla 5.6 3 3
## 3 16500000 For Sale Marla 8.0 6 5
## 4 43500000 For Sale Kanal 2.0 4 4
## 5 7000000 For Sale Marla 8.0 3 3
## 6 34500000 For Sale Kanal 1.6 8 8
Preprocessing
Pembuatan Model Awal
## n= 117913
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 117913 1.468941e+20 17761760.00
## 2) Area.Type=Marla 96597 1.768520e+19 11220710.00
## 4) purpose=For Rent 24958 2.013903e+14 49009.18 *
## 5) purpose=For Sale 71639 1.348488e+19 15112770.00
## 10) Area.Size< 8.55 52435 2.980604e+18 10357750.00 *
## 11) Area.Size>=8.55 19204 6.081618e+18 28095960.00 *
## 3) Area.Type=Kanal 21316 1.063468e+20 47403630.00
## 6) purpose=For Rent 8427 5.904662e+14 225828.10 *
## 7) purpose=For Sale 12889 7.532675e+19 78249110.00
## 14) Area.Size< 1.25 9549 9.913909e+18 56597470.00 *
## 15) Area.Size>=1.25 3340 4.813807e+19 140150700.00
## 30) bedrooms< 5.5 1725 2.201988e+19 116193100.00 *
## 31) bedrooms>=5.5 1615 2.407056e+19 165740200.00
## 62) Area.Size< 3.15 1412 9.012291e+18 145359400.00 *
## 63) Area.Size>=3.15 203 1.039215e+19 307502500.00 *
Visualisasi Pohon Klasifikasi Awal
Variable Importance
## Area.Size purpose Area.Type bedrooms baths
## 4.458127e+19 3.521963e+19 2.286204e+19 1.317745e+19 9.860935e+18
# Plot the variable-importance scores stored in the fitted tree as a
# horizontal point-range chart, one row per feature, ordered by score.
# `data.frame()` on the piped named vector yields a single column called ".",
# which is renamed to `Overall` after the row names become `Feature`.
# The title says "Regression Tree" because the model is fitted with
# method = "anova" (the previous title read "Simple Classication").
fit.tree$variable.importance %>%
  data.frame() %>%
  rownames_to_column(var = "Feature") %>%
  rename(Overall = ".") %>%
  ggplot(aes(x = fct_reorder(Feature, Overall), y = Overall)) +
  geom_pointrange(
    aes(ymin = 0, ymax = Overall),
    color = "cadetblue",
    size = 0.3
  ) +
  theme_minimal() +
  coord_flip() +
  labs(
    x = "",
    y = "",
    title = "Variable Importance with Simple Regression Tree"
  )
Prediksi Klasifikasi
Akurasi dan Evaluasi
# Evaluate the initial tree on the training and test sets.
#
# The tree predicts `price`, so every metric compares the observed `price`
# column with the predictions. (The previous version compared the predictions
# with `Area.Size`, which made MAPE come out as 1 and MAE equal to the mean
# predicted price.)
#
# Argument order is (actual, predicted). This matters for MAPE, which divides
# by the first argument.
# NOTE(review): assumes the metric functions come from the Metrics package,
# whose signatures are f(actual, predicted) - confirm against the library chunk.
mse_train <- mse(price.train$price, pred.tree_train)
rmse_train <- rmse(price.train$price, pred.tree_train)
mape_train <- mape(price.train$price, pred.tree_train)
mae_train <- mae(price.train$price, pred.tree_train)

mse_test <- mse(price.test$price, pred.tree_test)
rmse_test <- rmse(price.test$price, pred.tree_test)
mape_test <- mape(price.test$price, pred.tree_test)
mae_test <- mae(price.test$price, pred.tree_test)

# One row per data split, one column per metric.
df_eval <- data.frame(
  MSE = c(mse_train, mse_test),
  RMSE = c(rmse_train, rmse_test),
  MAPE = c(mape_train, mape_test),
  MAE = c(mae_train, mae_test)
)
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.049011e+15 32388438 1 17761754
## Data Test 1.062596e+15 32597488 1 17822957
COBA DENGAN PEUBAH LAIN
# Re-run the evaluation of the initial tree on the training and test sets.
#
# The predictions are prices, so they can only be scored against the observed
# `price` column. Scoring them against a predictor such as `baths` (as this
# block did before) measures nothing about the model: the errors are dominated
# by the size of the predicted prices whichever column is used.
# To try other variables, change the model formula and refit instead.
#
# Argument order is (actual, predicted). This matters for MAPE, which divides
# by the first argument.
# NOTE(review): assumes the metric functions come from the Metrics package,
# whose signatures are f(actual, predicted) - confirm against the library chunk.
mse_train <- mse(price.train$price, pred.tree_train)
rmse_train <- rmse(price.train$price, pred.tree_train)
mape_train <- mape(price.train$price, pred.tree_train)
mae_train <- mae(price.train$price, pred.tree_train)

mse_test <- mse(price.test$price, pred.tree_test)
rmse_test <- rmse(price.test$price, pred.tree_test)
mape_test <- mape(price.test$price, pred.tree_test)
mae_test <- mae(price.test$price, pred.tree_test)

# One row per data split, one column per metric.
df_eval <- data.frame(
  MSE = c(mse_train, mse_test),
  RMSE = c(rmse_train, rmse_test),
  MAPE = c(mape_train, mape_test),
  MAE = c(mae_train, mae_test)
)
rownames(df_eval) <- c("Data Train", "Data Test")
round(df_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.049011e+15 32388439 1 17761757
## Data Test 1.062596e+15 32597489 1 17822960
Optimisasi
##
## Regression tree:
## rpart(formula = price ~ ., data = price.train, method = "anova")
##
## Variables actually used in tree construction:
## [1] Area.Size Area.Type bedrooms purpose
##
## Root node error: 1.4689e+20/117913 = 1.2458e+15
##
## n= 117913
##
## CP nsplit rel error xerror xstd
## 1 0.183403 0 1.00000 1.00004 0.039411
## 2 0.117600 2 0.63319 0.63338 0.033777
## 3 0.029350 3 0.51559 0.51612 0.029469
## 4 0.022852 5 0.45689 0.45742 0.029451
## 5 0.010000 7 0.41119 0.41399 0.026344
Pemodelan Kedua
# Show the complexity parameter (CP) of the cp-table row that has the
# smallest cross-validated error (xerror).
local({
  cp_table <- fit.tree$cptable
  best_row <- which.min(cp_table[, "xerror"])
  cp_table[best_row, "CP"]
})
## [1] 0.01
# Choose the CP value with the smallest cross-validated error and prune the
# initial tree at that complexity.
bestcp <- fit.tree$cptable[which.min(fit.tree$cptable[, "xerror"]), "CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)

# Plot the pruned tree (this previously re-plotted the unpruned `fit.tree`).
rpart.plot(pruned.tree)

# Predictions of the pruned tree for the training and test sets.
pred.prune_train <- predict(pruned.tree, newdata = price.train)
pred.prune_test <- predict(pruned.tree, newdata = price.test)
# Evaluate the pruned tree on the training and test sets.
#
# The tree predicts `price`, so every metric compares the observed `price`
# column with the predictions. (The previous version compared the predictions
# with `baths`, which is a predictor, not the response.)
#
# Argument order is (actual, predicted). This matters for MAPE, which divides
# by the first argument.
# NOTE(review): assumes the metric functions come from the Metrics package,
# whose signatures are f(actual, predicted) - confirm against the library chunk.
mse_prune_train <- mse(price.train$price, pred.prune_train)
rmse_prune_train <- rmse(price.train$price, pred.prune_train)
mape_prune_train <- mape(price.train$price, pred.prune_train)
mae_prune_train <- mae(price.train$price, pred.prune_train)

mse_prune_test <- mse(price.test$price, pred.prune_test)
rmse_prune_test <- rmse(price.test$price, pred.prune_test)
mape_prune_test <- mape(price.test$price, pred.prune_test)
mae_prune_test <- mae(price.test$price, pred.prune_test)

# One row per data split, one column per metric.
df_prune_eval <- data.frame(
  MSE = c(mse_prune_train, mse_prune_test),
  RMSE = c(rmse_prune_train, rmse_prune_test),
  MAPE = c(mape_prune_train, mape_prune_test),
  MAE = c(mae_prune_train, mae_prune_test)
)
rownames(df_prune_eval) <- c("Data Train", "Data Test")
round(df_prune_eval, 2)
## MSE RMSE MAPE MAE
## Data Train 1.049011e+15 32388439 1 17761757
## Data Test 1.062596e+15 32597489 1 17822960
Model berpotensi untuk dikembangkan.
Feature Extraction, Feature Engineering, Feature Selection (Variable Importance atau RFE), penggunaan hyperparameter yang lebih banyak, dan penggunaan model yang lebih advance (RF, XGBoost, dsb).
BONUS
# Fit one regression tree per listing purpose, using the same predictors
# (baths, bedrooms, area type and area size) for both.
# NOTE(review): both trees are fitted on the full `data`, not on the training
# split used for the earlier models - confirm this is intended.
tree_rent <- rpart(
  price ~ baths + bedrooms + Area.Type + Area.Size,
  data = subset(data, purpose == "For Rent"),
  method = "anova"
)
tree_sale <- rpart(
  price ~ baths + bedrooms + Area.Type + Area.Size,
  data = subset(data, purpose == "For Sale"),
  method = "anova"
)

# Draw both trees side by side. par() returns the previous settings, which
# are restored afterwards so later plots are not left in a 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
rpart.plot(tree_rent, main = "For Rent")
rpart.plot(tree_sale, main = "For Sale")
par(old_par)