Analisis ini bertujuan membandingkan performa tiga metode pemodelan: - Generalized Random Forest (GRF) - Random Forest (RF) - Generalized Linear Mixed Model (GLMM)
Metode evaluasi menggunakan 10-fold cross-validation dengan 100 kali pengulangan. Setelah model terbaik diperoleh, dilakukan deteksi pencilan menggunakan standardized residual. Data kemudian ditransformasi menggunakan winsorization berbasis IQR, dan pemodelan diulang hingga diperoleh model terbaik setelah transformasi.
# Load library
library(grf)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(lme4)
## Loading required package: Matrix
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
# Load dataset
data <- read.csv("D:/Download/rawfixs.csv")
# Cek data
str(data)
## 'data.frame': 25686 obs. of 29 variables:
## $ Nomor.urut.RT : int 16702 16720 16717 16711 16699 16714 16708 16696 16693 16705 ...
## $ No.Urut.Nks : int 17685 17685 17685 17685 17685 17685 17685 17685 17685 17685 ...
## $ No.Urut.Ruta : int 124503 261097 296866 84174 95901 36919 212233 232632 311648 30008 ...
## $ Provinsi : int 32 32 32 32 32 32 32 32 32 32 ...
## $ Kabupaten.Kota : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Hubungan.Dengan.Kepala.Rumah.Tangga..Krt.: int 1 1 1 1 1 1 1 1 1 1 ...
## $ REGION : int 2 2 2 2 2 2 2 2 2 2 ...
## $ GENDER : int 1 1 1 1 1 1 1 1 1 1 ...
## $ AGE : int 49 40 58 69 57 31 33 32 37 30 ...
## $ EDU : int 1 1 4 1 1 1 1 2 1 1 ...
## $ SAVING : num 100 0 66.7 0 0 ...
## $ ILLITERATE : num 100 100 100 100 100 ...
## $ FOOD : int 0 0 0 0 0 0 0 1 0 1 ...
## $ PLACE : int 1 1 1 1 1 1 0 1 1 1 ...
## $ HOUSE : int 96 60 63 42 108 98 42 56 36 96 ...
## $ OTPLACE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ROOF : int 3 3 3 3 3 3 3 3 3 3 ...
## $ WALL : int 4 4 4 2 4 4 4 4 3 4 ...
## $ FLOOR : int 5 5 5 2 5 5 5 5 2 5 ...
## $ DEFECATION : int 1 1 1 1 1 1 1 1 1 1 ...
## $ WATER : int 3 3 3 3 3 4 3 3 3 3 ...
## $ ELECTRICITY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CREDIT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LAND : int 1 1 1 1 0 0 0 0 0 0 ...
## $ INCOME : int 0 0 0 0 0 0 0 0 1 0 ...
## $ KKS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MICRO : int 1 0 0 0 1 0 0 0 0 0 ...
## $ EXPENDITURE : num 30.55 7.71 39.7 7.36 19.61 ...
summary(data)
## Nomor.urut.RT No.Urut.Nks No.Urut.Ruta Provinsi Kabupaten.Kota
## Min. : 4279 Min. : 7 Min. : 7 Min. :32 Min. : 1.00
## 1st Qu.:114517 1st Qu.: 8100 1st Qu.: 82890 1st Qu.:32 1st Qu.: 6.00
## Median :191808 Median :16908 Median :160399 Median :32 Median :13.00
## Mean :181524 Mean :16604 Mean :162151 Mean :32 Mean :28.28
## 3rd Qu.:259275 3rd Qu.:24591 3rd Qu.:243245 3rd Qu.:32 3rd Qu.:72.00
## Max. :338803 Max. :32971 Max. :326034 Max. :32 Max. :79.00
## Hubungan.Dengan.Kepala.Rumah.Tangga..Krt. REGION GENDER
## Min. :1 Min. :1.000 Min. :1.00
## 1st Qu.:1 1st Qu.:1.000 1st Qu.:1.00
## Median :1 Median :1.000 Median :1.00
## Mean :1 Mean :1.345 Mean :1.15
## 3rd Qu.:1 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :1 Max. :2.000 Max. :2.00
## AGE EDU SAVING ILLITERATE
## Min. :15.00 Min. :1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:38.00 1st Qu.:1.000 1st Qu.: 0.00 1st Qu.: 75.00
## Median :48.00 Median :2.000 Median : 28.57 Median :100.00
## Mean :48.57 Mean :1.974 Mean : 33.63 Mean : 89.46
## 3rd Qu.:58.00 3rd Qu.:3.000 3rd Qu.: 50.00 3rd Qu.:100.00
## Max. :97.00 Max. :4.000 Max. :100.00 Max. :100.00
## FOOD PLACE HOUSE OTPLACE
## Min. :0.00 Min. :0.0000 Min. : 36.0 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:1.0000 1st Qu.: 40.0 1st Qu.:0.0000
## Median :0.00 Median :1.0000 Median : 60.0 Median :0.0000
## Mean :0.23 Mean :0.8051 Mean : 70.6 Mean :0.0934
## 3rd Qu.:0.00 3rd Qu.:1.0000 3rd Qu.: 84.0 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000 Max. :650.0 Max. :1.0000
## ROOF WALL FLOOR DEFECATION
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:3.00 1st Qu.:4.000 1st Qu.:5.000 1st Qu.:1.0000
## Median :3.00 Median :4.000 Median :5.000 Median :1.0000
## Mean :2.87 Mean :3.733 Mean :4.413 Mean :0.9666
## 3rd Qu.:3.00 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:1.0000
## Max. :4.00 Max. :4.000 Max. :5.000 Max. :1.0000
## WATER ELECTRICITY CREDIT LAND
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3.000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :3.498 Mean :0.9042 Mean :0.2488 Mean :0.7134
## 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## INCOME KKS AID MICRO
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.0793 Mean :0.1186 Mean :0.2361 Mean :0.07981
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## EXPENDITURE
## Min. : 1.596
## 1st Qu.: 6.615
## Median : 10.393
## Mean : 14.744
## 3rd Qu.: 17.295
## Max. :401.382
# Konversi variabel yang diperlukan
# Misalnya, pastikan variabel kategorikal adalah faktor
data$REGION <- as.factor(data$REGION)
# Fungsi evaluasi performa
evaluate_grf <- function(model, test_data, response, predictors) {
preds <- predict(model, newdata = test_data[predictors])$prediction
mse <- mean((test_data[[response]] - preds)^2)
return(mse)
}
# Deteksi pencilan dengan standardized residual
detect_outliers_grf <- function(model, data, response, predictors) {
residuals <- data[[response]] - predict(model, newdata = data[predictors])$prediction
standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
return(abs(standardized_residuals) > 3)
}
# Deteksi pencilan dengan standardized residual RF
detect_outliers_rf <- function(model, data, response, predictors) {
residuals <- data[[response]] - predict(model, newdata = data[predictors])
standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
return(abs(standardized_residuals) > 3)
}
# Winsorization berbasis IQR
winsorize_iqr <- function(x) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
# GRF Model
# KAB BOGOR
data1<-data %>% filter(Kabupaten.Kota == 1)
data1<-data1[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data1), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse1_m2), mean(mse1_m5), mean(mse1_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[[response]] - predict(grf_model, newdata = test_data1[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k500<-mse_values
# Dataframe dengan label k untuk sumbu X
graf2_data1 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse1_k50), mean(mse1_k100), mean(mse1_k150), mean(mse1_k250), mean(mse1_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#KAB BEKASI
#########################################
# KAB BEKASI
data2<-data %>% filter(Kabupaten.Kota == 16)
data2<-data2[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data2), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse2_m2), mean(mse2_m5), mean(mse2_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[[response]] - predict(grf_model, newdata = test_data2[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k500<-mse_values
graf2_data2 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse2_k50), mean(mse2_k100), mean(mse2_k150), mean(mse2_k250), mean(mse2_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data2, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#KOTA BANDUNG
#########################################
data3<-data %>% filter(Kabupaten.Kota == 73)
data3<-data3[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data3), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse3_m2), mean(mse3_m5), mean(mse3_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3[[response]] - predict(grf_model, newdata = test_data3[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_k500<-mse_values
# Buat dataframe dengan label
graf2_data3 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse3_k50), mean(mse3_k100), mean(mse3_k150), mean(mse3_k250), mean(mse3_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data3, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#KAB KUNINGAN
#########################################
data4<-data %>% filter(Kabupaten.Kota == 8)
data4<-data4[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data4), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse4_m2), mean(mse4_m5), mean(mse4_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4[[response]] - predict(grf_model, newdata = test_data4[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_k500<-mse_values
# Buat dataframe dengan label
graf2_data4 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse4_k50), mean(mse4_k100), mean(mse4_k150), mean(mse4_k250), mean(mse4_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data4, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#KOTA SUKABUMI
#########################################
data5<-data %>% filter(Kabupaten.Kota == 72)
data5<-data5[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data5), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse5_m2), mean(mse5_m5), mean(mse5_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5[[response]] - predict(grf_model, newdata = test_data5[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_k500<-mse_values
# Buat dataframe dengan label
graf2_data5 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse5_k50), mean(mse5_k100), mean(mse5_k150), mean(mse5_k250), mean(mse5_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data5, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#KOTA BANJAR
#########################################
data6<-data %>% filter(Kabupaten.Kota == 79)
data6<-data6[,c(7:29)]
response <- "EXPENDITURE"
predictors <- setdiff(names(data6), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse6_m2), mean(mse6_m5), mean(mse6_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6[,c(23)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6[[response]] - predict(grf_model, newdata = test_data6[predictors])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_k500<-mse_values
# Buat dataframe dengan label
graf2_data6 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse6_k50), mean(mse6_k100), mean(mse6_k150), mean(mse6_k250), mean(mse6_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data6, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#GRAFIK GABUNGAN GRF
#Grafik Gabungan
# Contoh data MSE untuk 6 kabupaten/kota
graf3 <- data.frame(
m = rep(c("m=2", "m=5", "m=9"), times = 6), # Label m untuk sumbu X
MSE = c(mean(mse1_m2), mean(mse1_m5), mean(mse1_m9), # Kabupaten/kota 1
mean(mse2_m2), mean(mse2_m5), mean(mse2_m9), # Kabupaten/kota 2
mean(mse3_m2), mean(mse3_m5), mean(mse3_m9), # Kabupaten/kota 3
mean(mse4_m2), mean(mse4_m5), mean(mse4_m9), # Kabupaten/kota 4
mean(mse5_m2), mean(mse5_m5), mean(mse5_m9), # Kabupaten/kota 5
mean(mse6_m2), mean(mse6_m5), mean(mse6_m9)), # Kabupaten/kota 6
Kabupaten = rep(c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung",
"Kab. Kuningan", "Kota Sukabumi", "Kota Banjar"),
each = 3) # Nama kabupaten/kota
)
library(ggplot2)
library(dplyr)
# Tambahkan kategori PDRB
graf3 <- graf3 %>%
mutate(PDRB = ifelse(Kabupaten %in% c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung"),
"PDRB Tinggi",
"PDRB Rendah"))
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
ggplot(graf3, aes(x = m, y = MSE, color = Kabupaten, group = Kabupaten, shape = PDRB)) +
geom_point(size = 3) +
geom_line(linewidth = 1) +
scale_color_manual(
values = c("Kab. Bogor" = "#1E90FF", # Biru terang
"Kab. Bekasi" = "#87CEEB", # Biru muda
"Kota Bandung" = "#ADD8E6", # Biru sangat muda
"Kab. Kuningan" = "#800000", # Maroon
"Kota Sukabumi" = "#A52A2A", # Merah maroon lebih muda
"Kota Banjar" = "#CD5C5C") # Merah muda
) +
scale_shape_manual(
values = c("PDRB Tinggi" = 16, # Bulat
"PDRB Rendah" = 17) # Segitiga
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Banyak peubah penjelas",
y = "MSE (Mean Squared Error)",
color = "Kabupaten/Kota",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
#Grafik Gabungan
# Contoh data MSE untuk 6 kabupaten/kota
graf4 <- data.frame(
m = rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 6), # Label m untuk sumbu X
MSE = c(mean(mse1_k50), mean(mse1_k100), mean(mse1_k150),mean(mse1_k250), mean(mse1_k500), # Kabupaten/kota 1
mean(mse2_k50), mean(mse2_k100), mean(mse2_k150),mean(mse2_k250), mean(mse2_k500), # Kabupaten/kota 2
mean(mse3_k50), mean(mse3_k100), mean(mse3_k150),mean(mse3_k250), mean(mse3_k500), # Kabupaten/kota 3
mean(mse4_k50), mean(mse4_k100), mean(mse4_k150),mean(mse4_k250), mean(mse4_k500), # Kabupaten/kota 4
mean(mse5_k50), mean(mse5_k100), mean(mse5_k150),mean(mse5_k250), mean(mse5_k500), # Kabupaten/kota 5
mean(mse6_k50), mean(mse6_k100), mean(mse6_k150),mean(mse6_k250), mean(mse6_k500)), # Kabupaten/kota 6
Kabupaten = rep(c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung",
"Kab. Kuningan", "Kota Sukabumi", "Kota Banjar"),
each = 5) # Nama kabupaten/kota
)
library(ggplot2)
graf4 <- graf4 %>%
mutate(m = factor(m, levels = c("k=50", "k=100", "k=150", "k=250", "k=500")),
PDRB = ifelse(Kabupaten %in% c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung"),
"PDRB Tinggi",
"PDRB Rendah"))
# Grafik untuk jumlah pohon dengan sumbu x terurut
ggplot(graf4, aes(x = m, y = MSE, color = Kabupaten, group = Kabupaten, shape = PDRB)) +
geom_point(size = 3) +
geom_line(linewidth = 1) +
scale_color_manual(
values = c("Kab. Bogor" = "#1E90FF", # Biru terang
"Kab. Bekasi" = "#87CEEB", # Biru muda
"Kota Bandung" = "#ADD8E6", # Biru sangat muda
"Kab. Kuningan" = "#800000", # Maroon
"Kota Sukabumi" = "#A52A2A", # Merah maroon lebih muda
"Kota Banjar" = "#CD5C5C") # Merah muda
) +
scale_shape_manual(
values = c("PDRB Tinggi" = 16, # Bulat
"PDRB Rendah" = 17) # Segitiga
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Jumlah pohon",
y = "MSE (Mean Squared Error)",
color = "Kabupaten/Kota",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
## Deteksi Generalized Random Forest (GRF)
# Deteksi pencilan dan transformasi BOGOR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1$EXPENDITURE, 0.25)
Q3 <- quantile(data1$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data1[,c(2:22)],Y = data1[,c(23)],num.trees = 500, mtry = 9), data1, response, predictors)
data1outlier<-sum(outliers)
data1$EXPENDITURE_TRANSFORMED <- data1$EXPENDITURE
data1$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data1$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BEKASI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2$EXPENDITURE, 0.25)
Q3 <- quantile(data2$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data2[,c(2:22)],Y = data2[,c(23)],num.trees = 500, mtry = 9), data2, response, predictors)
data2outlier<-sum(outliers)
data2$EXPENDITURE_TRANSFORMED <- data2$EXPENDITURE
data2$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data2$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BANDUNG
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data3$EXPENDITURE, 0.25)
Q3 <- quantile(data3$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data3[,c(2:22)],Y = data3[,c(23)],num.trees = 500, mtry = 9), data3, response, predictors)
data3outlier<-sum(outliers)
data3$EXPENDITURE_TRANSFORMED <- data3$EXPENDITURE
data3$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data3$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi KUNINGAN
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data4$EXPENDITURE, 0.25)
Q3 <- quantile(data4$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data4[,c(2:22)],Y = data4[,c(23)],num.trees = 500, mtry = 9), data4, response, predictors)
data4outlier<-sum(outliers)
data4$EXPENDITURE_TRANSFORMED <- data4$EXPENDITURE
data4$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data4$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi SUKABUMI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data5$EXPENDITURE, 0.25)
Q3 <- quantile(data5$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data5[,c(2:22)],Y = data5[,c(23)],num.trees = 500, mtry = 9), data5, response, predictors)
data5outlier<-sum(outliers)
data5$EXPENDITURE_TRANSFORMED <- data5$EXPENDITURE
data5$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data5$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BANJAR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data6$EXPENDITURE, 0.25)
Q3 <- quantile(data6$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data6[,c(2:22)],Y = data6[,c(23)],num.trees = 500, mtry = 9), data6, response, predictors)
data6outlier<-sum(outliers)
data6$EXPENDITURE_TRANSFORMED <- data6$EXPENDITURE
data6$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data6$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
#Pemodelan GRF setelah ditransformasi
# Pemodelan ulang setelah transformasi
# BOGOR
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(2:22)],
Y = train_data1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data1[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_best<-mse_values
##################################
#BEKASI
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(2:22)],
Y = train_data2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data2[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_best<-mse_values
###########################
#BANDUNG
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data3[,c(2:22)],
Y = train_data3$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data3$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data3[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse3_best<-mse_values
###########################
#KUNINGAN
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data4[,c(2:22)],
Y = train_data4$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data4$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data4[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse4_best<-mse_values
###########################
#SUKABUMI
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data5[,c(2:22)],
Y = train_data5$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data5$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data5[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse5_best<-mse_values
###########################
#BANJAR
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data6[,c(2:22)],
Y = train_data6$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data6$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data6[,c(2:22)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse6_best<-mse_values
# RF Model
# KAB BOGOR
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf1_m2), mean(mserf1_m5), mean(mserf1_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[[response]] - predict(rf_model, newdata = test_data1[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k500<-mserf_values
# Pastikan kolom m diatur sebagai faktor dengan level terurut
graf2_data1 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level
MSERF = c(mean(mserf1_k50), mean(mserf1_k100), mean(mserf1_k150),
mean(mserf1_k250), mean(mserf1_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#KAB BEKASI
#########################################
# KAB BEKASI
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf2_m2), mean(mserf2_m5), mean(mserf2_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[[response]] - predict(rf_model, newdata = test_data2[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k500<-mserf_values
# Buat dataframe dengan label
graf2_data2 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf2_k50), mean(mserf2_k100), mean(mserf2_k150),
mean(mserf2_k250), mean(mserf2_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data2, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#KOTA BANDUNG
#########################################
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf3_m2), mean(mserf3_m5), mean(mserf3_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3[[response]] - predict(rf_model, newdata = test_data3[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_k500<-mserf_values
# Buat dataframe dengan label
# Atur kolom 'm' sebagai faktor dengan level terurut
graf2_data3 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf3_k50), mean(mserf3_k100), mean(mserf3_k150),
mean(mserf3_k250), mean(mserf3_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data3, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#KAB KUNINGAN
#########################################
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf4_m2), mean(mserf4_m5), mean(mserf4_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4[[response]] - predict(rf_model, newdata = test_data4[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_k500<-mserf_values
# Buat dataframe dengan label
graf2_data4 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf4_k50), mean(mserf4_k100), mean(mserf4_k150),
mean(mserf4_k250), mean(mserf4_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data4, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#KOTA SUKABUMI
#########################################
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf5_m2), mean(mserf5_m5), mean(mserf5_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5[[response]] - predict(rf_model, newdata = test_data5[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_k500<-mserf_values
# Buat dataframe dengan label
graf2_data5 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf5_k50), mean(mserf5_k100), mean(mserf5_k150),
mean(mserf5_k250), mean(mserf5_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data5, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#KOTA BANJAR
#########################################
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf6_m2), mean(mserf6_m5), mean(mserf6_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6[,c(23)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6[[response]] - predict(rf_model, newdata = test_data6[predictors]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_k500<-mserf_values
# Buat dataframe dengan label
# Atur kolom 'm' sebagai faktor dengan level terurut
graf2_data6 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf6_k50), mean(mserf6_k100), mean(mserf6_k150),
mean(mserf6_k250), mean(mserf6_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data6, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#GRAFIK GABUNGAN RF
#Grafik Gabungan
# Contoh data MSERF untuk 6 kabupaten/kota
graf3 <- data.frame(
m = rep(c("m=2", "m=5", "m=9"), times = 6), # Label m untuk sumbu X
MSERF = c(mean(mserf1_m2), mean(mserf1_m5), mean(mserf1_m9), # Kabupaten/kota 1
mean(mserf2_m2), mean(mserf2_m5), mean(mserf2_m9), # Kabupaten/kota 2
mean(mserf3_m2), mean(mserf3_m5), mean(mserf3_m9), # Kabupaten/kota 3
mean(mserf4_m2), mean(mserf4_m5), mean(mserf4_m9), # Kabupaten/kota 4
mean(mserf5_m2), mean(mserf5_m5), mean(mserf5_m9), # Kabupaten/kota 5
mean(mserf6_m2), mean(mserf6_m5), mean(mserf6_m9)), # Kabupaten/kota 6
Kabupaten = rep(c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung",
"Kab. Kuningan", "Kota Sukabumi", "Kota Banjar"),
each = 3) # Nama kabupaten/kota
)
library(ggplot2)
library(dplyr)
# Tambahkan kategori PDRB
graf3 <- graf3 %>%
mutate(PDRB = ifelse(Kabupaten %in% c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung"),
"PDRB Tinggi",
"PDRB Rendah"))
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
ggplot(graf3, aes(x = m, y = MSERF, color = Kabupaten, group = Kabupaten, shape = PDRB)) +
geom_point(size = 3) +
geom_line(linewidth = 1) +
scale_color_manual(
values = c("Kab. Bogor" = "#1E90FF", # Biru terang
"Kab. Bekasi" = "#87CEEB", # Biru muda
"Kota Bandung" = "#ADD8E6", # Biru sangat muda
"Kab. Kuningan" = "#800000", # Maroon
"Kota Sukabumi" = "#A52A2A", # Merah maroon lebih muda
"Kota Banjar" = "#CD5C5C") # Merah muda
) +
scale_shape_manual(
values = c("PDRB Tinggi" = 16, # Bulat
"PDRB Rendah" = 17) # Segitiga
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Banyak peubah penjelas",
y = "MSE (Mean Squared Error)",
color = "Kabupaten/Kota",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
#Grafik Gabungan
# Contoh data MSERF untuk 6 kabupaten/kota
graf4 <- data.frame(
m = rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 6), # Label m untuk sumbu X
MSERF = c(mean(mserf1_k50), mean(mserf1_k100), mean(mserf1_k150),mean(mserf1_k250), mean(mserf1_k500), # Kabupaten/kota 1
mean(mserf2_k50), mean(mserf2_k100), mean(mserf2_k150),mean(mserf2_k250), mean(mserf2_k500), # Kabupaten/kota 2
mean(mserf3_k50), mean(mserf3_k100), mean(mserf3_k150),mean(mserf3_k250), mean(mserf3_k500), # Kabupaten/kota 3
mean(mserf4_k50), mean(mserf4_k100), mean(mserf4_k150),mean(mserf4_k250), mean(mserf4_k500), # Kabupaten/kota 4
mean(mserf5_k50), mean(mserf5_k100), mean(mserf5_k150),mean(mserf5_k250), mean(mserf5_k500), # Kabupaten/kota 5
mean(mserf6_k50), mean(mserf6_k100), mean(mserf6_k150),mean(mserf6_k250), mean(mserf6_k500)), # Kabupaten/kota 6
Kabupaten = rep(c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung",
"Kab. Kuningan", "Kota Sukabumi", "Kota Banjar"),
each = 5) # Nama kabupaten/kota
)
library(ggplot2)
graf4 <- graf4 %>%
mutate(PDRB = ifelse(Kabupaten %in% c("Kab. Bogor", "Kab. Bekasi", "Kota Bandung"),
"PDRB Tinggi",
"PDRB Rendah"))
# Grafik untuk jumlah pohon
ggplot(graf4, aes(x = m, y = MSERF, color = Kabupaten, group = Kabupaten, shape = PDRB)) +
geom_point(size = 3) +
geom_line(linewidth = 1) +
scale_color_manual(
values = c("Kab. Bogor" = "#1E90FF", # Biru terang
"Kab. Bekasi" = "#87CEEB", # Biru muda
"Kota Bandung" = "#ADD8E6", # Biru sangat muda
"Kab. Kuningan" = "#800000", # Maroon
"Kota Sukabumi" = "#A52A2A", # Merah maroon lebih muda
"Kota Banjar" = "#CD5C5C") # Merah muda
) +
scale_shape_manual(
values = c("PDRB Tinggi" = 16, # Bulat
"PDRB Rendah" = 17) # Segitiga
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Jumlah pohon",
y = "MSE (Mean Squared Error)",
color = "Kabupaten/Kota",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
## Deteksi Random Forest (RF)
# Deteksi pencilan dan transformasi BOGOR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1$EXPENDITURE, 0.25)
Q3 <- quantile(data1$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data1[,c(2:22)],y = data1[,c(23)],ntree = 500, mtry = 5), data1, response, predictors)
data1rfoutlier<-sum(outliers)
data1$EXPENDITURE_TRANSFORMED_RF <- data1$EXPENDITURE
data1$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data1$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BEKASI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2$EXPENDITURE, 0.25)
Q3 <- quantile(data2$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data2[,c(2:22)],y = data2[,c(23)],ntree = 500, mtry = 5), data2, response, predictors)
data2rfoutlier<-sum(outliers)
data2$EXPENDITURE_TRANSFORMED_RF <- data2$EXPENDITURE
data2$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data2$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BANDUNG
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data3$EXPENDITURE, 0.25)
Q3 <- quantile(data3$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data3[,c(2:22)],y = data3[,c(23)],ntree = 500, mtry = 5), data3, response, predictors)
data3rfoutlier<-sum(outliers)
data3$EXPENDITURE_TRANSFORMED_RF <- data3$EXPENDITURE
data3$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data3$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi KUNINGAN
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data4$EXPENDITURE, 0.25)
Q3 <- quantile(data4$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data4[,c(2:22)],y = data4[,c(23)],ntree = 500, mtry = 5), data4, response, predictors)
data4rfoutlier<-sum(outliers)
data4$EXPENDITURE_TRANSFORMED_RF <- data4$EXPENDITURE
data4$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data4$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi SUKABUMI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data5$EXPENDITURE, 0.25)
Q3 <- quantile(data5$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data5[,c(2:22)],y = data5[,c(23)],ntree = 500, mtry = 5), data5, response, predictors)
data5rfoutlier<-sum(outliers)
data5$EXPENDITURE_TRANSFORMED_RF <- data5$EXPENDITURE
data5$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data5$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
# Deteksi pencilan dan transformasi BANJAR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data6$EXPENDITURE, 0.25)
Q3 <- quantile(data6$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data6[,c(2:22)],y = data6[,c(23)],ntree = 500, mtry = 5), data6, response, predictors)
data6rfoutlier<-sum(outliers)
data6$EXPENDITURE_TRANSFORMED_RF <- data6$EXPENDITURE
data6$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data6$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
#Pemodelan setelah ditransformasi RF
# Pemodelan ulang setelah transformasi
# BOGOR
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(2:22)],
y = train_data1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data1[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_best<-mserf_values
##################################
#BEKASI
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(2:22)],
y = train_data2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data2[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_best<-mserf_values
###########################
#BANDUNG
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data3$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data3), folds[[fold_idx]])
train_data3 <- data3[train_idx, ]
test_data3 <- data3[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data3[,c(2:22)],
y = train_data3$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data3$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data3[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf3_best<-mserf_values
###########################
#KUNINGAN
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data4$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data4), folds[[fold_idx]])
train_data4 <- data4[train_idx, ]
test_data4 <- data4[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data4[,c(2:22)],
y = train_data4$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data4$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data4[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf4_best<-mserf_values
###########################
#SUKABUMI
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data5$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data5), folds[[fold_idx]])
train_data5 <- data5[train_idx, ]
test_data5 <- data5[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data5[,c(2:22)],
y = train_data5$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data5$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data5[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf5_best<-mserf_values
###########################
#BANJAR
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data6$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data6), folds[[fold_idx]])
train_data6 <- data6[train_idx, ]
test_data6 <- data6[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data6[,c(2:22)],
y = train_data6$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data6$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data6[,c(2:22)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf6_best<-mserf_values
#PDRB TINGGI
datapdrbtinggi<- data %>% filter(Kabupaten.Kota %in% c(1, 16, 73))
# GLMM Model
set.seed(123)
glmms <- list()
rmse_glmm <- c()
# Gunakan kolom 7 hingga 29
data1_subset <- datapdrbtinggi[, c(7:29)]
# Model awal (Full Model)
model1 <- lmer(EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD + PLACE + HOUSE + OTPLACE +
ROOF + WALL + FLOOR + DEFECATION + WATER + ELECTRICITY + CREDIT + LAND + INCOME +
KKS + AID + MICRO + (1|REGION), data = data1_subset)
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION +
## WATER + ELECTRICITY + CREDIT + LAND + INCOME + KKS + AID +
## MICRO + (1 | REGION)
## Data: data1_subset
##
## REML criterion at convergence: 27952.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8335 -0.5400 -0.1342 0.3070 17.6012
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 3.219 1.794
## Residual 162.697 12.755
## Number of obs: 3525, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -15.275527 3.337000 -4.578
## GENDER 3.572532 0.651713 5.482
## AGE -0.031085 0.019139 -1.624
## EDU 1.505705 0.258626 5.822
## SAVING 0.161656 0.007922 20.407
## ILLITERATE 0.083813 0.012684 6.608
## FOOD -2.012526 0.536135 -3.754
## PLACE -1.228492 0.673387 -1.824
## HOUSE 0.067009 0.004668 14.356
## OTPLACE 3.302450 0.690325 4.784
## ROOF 0.054974 0.545962 0.101
## WALL -0.370684 0.580234 -0.639
## FLOOR 0.148272 0.256074 0.579
## DEFECATION -1.785663 1.114247 -1.603
## WATER 3.308000 0.320265 10.329
## ELECTRICITY -0.216089 0.713837 -0.303
## CREDIT 0.424801 0.544739 0.780
## LAND 1.669997 0.606626 2.753
## INCOME 4.916813 0.944850 5.204
## KKS -1.426719 0.944553 -1.510
## AID -2.566913 0.679201 -3.779
## MICRO 0.420176 0.792918 0.530
##
## Correlation matrix not shown by default, as p = 22 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
############################
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loaded glmnet 4.1-8
X <- model.matrix(~ . - EXPENDITURE - REGION, data1_subset)
y <- data1_subset$EXPENDITURE
lasso_model <- cv.glmnet(X, y, alpha = 1, family = "gaussian")
# Ekstrak koefisien sebagai matriks
lasso_coefs <- as.matrix(coef(lasso_model, s = "lambda.min"))
# Ambil nama variabel dengan koefisien tidak nol (selain intercept)
selected_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0][-1] # Hilangkan intercept
# Pastikan ada variabel yang dipilih
if (length(selected_vars) == 0) {
stop("Tidak ada variabel yang dipilih oleh Lasso.")
}
# Buat formula dinamis
formula <- as.formula(
paste("EXPENDITURE ~", paste(selected_vars, collapse = " + "), "+ (1 | REGION)")
)
# Model GLMM dengan variabel terpilih
model2 <- lmer(formula, data = data1_subset)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + WALL + DEFECATION + WATER + CREDIT +
## LAND + INCOME + KKS + AID + MICRO + (1 | REGION)
## Data: data1_subset
##
## REML criterion at convergence: 27954.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8370 -0.5384 -0.1338 0.3075 17.6079
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 3.147 1.774
## Residual 162.578 12.751
## Number of obs: 3525, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -15.167415 3.001223 -5.054
## GENDER 3.569155 0.651145 5.481
## AGE -0.031567 0.019068 -1.656
## EDU 1.508437 0.258370 5.838
## SAVING 0.161764 0.007916 20.436
## ILLITERATE 0.083893 0.012671 6.621
## FOOD -2.024950 0.532637 -3.802
## PLACE -1.229617 0.671486 -1.831
## HOUSE 0.067079 0.004623 14.509
## OTPLACE 3.309257 0.689136 4.802
## WALL -0.263925 0.540637 -0.488
## DEFECATION -1.681413 1.094772 -1.536
## WATER 3.316612 0.319713 10.374
## CREDIT 0.421793 0.544087 0.775
## LAND 1.673777 0.605833 2.763
## INCOME 4.919122 0.943700 5.213
## KKS -1.433867 0.944010 -1.519
## AID -2.597266 0.677037 -3.836
## MICRO 0.409276 0.792225 0.517
##
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
###############
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1_subset$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1_subset), folds[[fold_idx]])
train_data1_subset <- data1_subset[train_idx, ]
test_data1_subset <- data1_subset[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = train_data1_subset)
model2<-lmer(formula, data = train_data1_subset)
predictions_model1 <- predict(model1, test_data1_subset)
predictions_model2<-predict(model2, test_data1_subset)
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data1_subset$EXPENDITURE - predictions_model1)^2)
mse_best_fold[fold_idx] <- mean((test_data1_subset$EXPENDITURE - predictions_model2)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
mse_bestvalues[rep]<-mean(mse_best_fold)
}
msepdrbtinggi_model1<-mse_values
mean(msepdrbtinggi_model1)
## [1] 164.0136
msepdrbtinggi_model2<-mse_bestvalues
mean(msepdrbtinggi_model2)
## [1] 163.8556
datapdrbrendah<- data %>% filter(Kabupaten.Kota %in% c(8, 72, 79))
# GLMM Model
set.seed(123)
glmms <- list()
rmse_glmm <- c()
# Gunakan kolom 7 hingga 29
data2_subset <- datapdrbrendah[, c(7:29)]
# Model awal (Full Model)
model1 <- lmer(EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD + PLACE + HOUSE + OTPLACE +
ROOF + WALL + FLOOR + DEFECATION + WATER + ELECTRICITY + CREDIT + LAND + INCOME +
KKS + AID + MICRO + (1|REGION), data = data2_subset)
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION +
## WATER + ELECTRICITY + CREDIT + LAND + INCOME + KKS + AID +
## MICRO + (1 | REGION)
## Data: data2_subset
##
## REML criterion at convergence: 15031.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0629 -0.5397 -0.1316 0.2934 10.5942
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 0.5504 0.7419
## Residual 94.2717 9.7094
## Number of obs: 2036, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.508e+01 3.936e+00 -3.830
## GENDER 1.288e+00 6.541e-01 1.969
## AGE 8.325e-04 1.854e-02 0.045
## EDU 1.910e+00 2.547e-01 7.496
## SAVING 7.854e-02 7.633e-03 10.290
## ILLITERATE 5.025e-02 1.354e-02 3.711
## FOOD -8.299e-01 5.348e-01 -1.552
## PLACE 6.538e-01 8.609e-01 0.760
## HOUSE 5.389e-02 5.315e-03 10.138
## OTPLACE 5.431e+00 7.355e-01 7.384
## ROOF 7.804e-01 7.417e-01 1.052
## WALL 8.798e-01 4.218e-01 2.086
## FLOOR 7.575e-02 2.348e-01 0.323
## DEFECATION -2.313e+00 2.299e+00 -1.006
## WATER 2.697e+00 3.645e-01 7.399
## ELECTRICITY -1.478e+00 8.626e-01 -1.714
## CREDIT -1.031e+00 4.920e-01 -2.095
## LAND -4.430e-01 8.094e-01 -0.547
## INCOME 1.941e+00 7.893e-01 2.459
## KKS -1.125e+00 7.478e-01 -1.504
## AID -4.925e-01 6.212e-01 -0.793
## MICRO 5.127e-01 6.931e-01 0.740
##
## Correlation matrix not shown by default, as p = 22 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
############################
library(glmnet)
X <- model.matrix(~ . - EXPENDITURE - REGION, data2_subset)
y <- data2_subset$EXPENDITURE
lasso_model <- cv.glmnet(X, y, alpha = 1, family = "gaussian")
# Ekstrak koefisien sebagai matriks
lasso_coefs <- as.matrix(coef(lasso_model, s = "lambda.min"))
# Ambil nama variabel dengan koefisien tidak nol (selain intercept)
selected_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0][-1] # Hilangkan intercept
# Pastikan ada variabel yang dipilih
if (length(selected_vars) == 0) {
stop("Tidak ada variabel yang dipilih oleh Lasso.")
}
# Buat formula dinamis
formula <- as.formula(
paste("EXPENDITURE ~", paste(selected_vars, collapse = " + "), "+ (1 | REGION)")
)
# Model GLMM dengan variabel terpilih
model2 <- lmer(formula, data = data2_subset)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + EDU + SAVING + ILLITERATE + FOOD + PLACE +
## HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION + WATER +
## ELECTRICITY + CREDIT + INCOME + KKS + AID + MICRO + (1 | REGION)
## Data: data2_subset
##
## REML criterion at convergence: 15027.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0615 -0.5404 -0.1305 0.2961 10.5911
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 0.5162 0.7184
## Residual 94.1944 9.7054
## Number of obs: 2036, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -14.995973 3.856657 -3.888
## GENDER 1.298993 0.647157 2.007
## EDU 1.914000 0.247962 7.719
## SAVING 0.078478 0.007627 10.290
## ILLITERATE 0.050251 0.013223 3.800
## FOOD -0.823268 0.534164 -1.541
## PLACE 0.324253 0.598867 0.541
## HOUSE 0.053698 0.005266 10.197
## OTPLACE 5.378622 0.723052 7.439
## ROOF 0.774282 0.741191 1.045
## WALL 0.857705 0.419058 2.047
## FLOOR 0.073261 0.234561 0.312
## DEFECATION -2.321246 2.297475 -1.010
## WATER 2.699403 0.363652 7.423
## ELECTRICITY -1.492111 0.861492 -1.732
## CREDIT -1.028940 0.488538 -2.106
## INCOME 1.951953 0.768727 2.539
## KKS -1.113897 0.747210 -1.491
## AID -0.494261 0.620708 -0.796
## MICRO 0.510455 0.692818 0.737
##
## Correlation matrix not shown by default, as p = 20 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
###############
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2_subset$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2_subset), folds[[fold_idx]])
train_data2_subset <- data2_subset[train_idx, ]
test_data2_subset <- data2_subset[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = train_data2_subset)
model2<-lmer(formula, data = train_data2_subset)
predictions_model1 <- predict(model1, test_data2_subset)
predictions_model2<-predict(model2, test_data2_subset)
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data2_subset$EXPENDITURE - predictions_model1)^2)
mse_best_fold[fold_idx] <- mean((test_data2_subset$EXPENDITURE - predictions_model2)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
mse_bestvalues[rep]<-mean(mse_best_fold)
}
msepdrbrendah_model1<-mse_values
mean(msepdrbrendah_model1)
## [1] 95.93487
msepdrbrendah_model2<-mse_bestvalues
mean(msepdrbrendah_model2)
## [1] 95.77551
#GRAFIK MODEL 1 VS MODEL 2
mse_pdrbtinggi<-c(mean(msepdrbtinggi_model1),mean(msepdrbtinggi_model2))
mse_pdrbrendah<-c(mean(msepdrbrendah_model1),mean(msepdrbrendah_model2))
# Misalnya, mse_pdrbtinggi dan mse_pdrbrendah sudah dihitung sebelumnya
mse_pdrbtinggi <- c(mean(msepdrbtinggi_model1), mean(msepdrbtinggi_model2))
mse_pdrbrendah <- c(mean(msepdrbrendah_model1), mean(msepdrbrendah_model2))
# Membuat data frame untuk scatter plot
datapdrb <- data.frame(
Model = rep(c("Model 1", "Model 2"), 2), # Menyusun dua model
MSE = c(mse_pdrbtinggi, mse_pdrbrendah), # Nilai MSE dari dua model
PDRB = c("Tinggi", "Tinggi", "Rendah", "Rendah") # Kategori PDRB
)
library(ggplot2)
# Membuat scatter plot dengan garis penghubung
ggplot(datapdrb, aes(x = Model, y = MSE, color = PDRB, group = PDRB)) +
geom_point(size = 4) + # Ukuran titik
geom_line(aes(group = PDRB), color = "black", linetype = "solid") + # Menambahkan garis penghubung antara model 1 dan model 2
scale_color_manual(values = c("Tinggi" = "lightblue", "Rendah" = "maroon")) + # Warna titik
labs(
title = "MSE Model 1 vs Model 2 berdasarkan PDRB",
x = "Model",
y = "MSE",
color = "Kategori PDRB"
) +
theme_minimal() + # Tema minimal
theme(legend.position = "right") # Posisi legenda di sebelah kanan
# Deteksi Pencilan
# Deteksi pencilan dan transformasi PDRB TINGGI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1_subset$EXPENDITURE, 0.25)
Q3 <- quantile(data1_subset$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
residuals <- data1_subset$EXPENDITURE - predict(lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = data1_subset))
data1_subset$standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
# Menentukan outliers
outliers <- data1_subset[data1_subset$standardized_residuals > 3 | data1_subset$standardized_residuals < -3, ]
data1glmmoutlier<-nrow(outliers)
data1_subset$EXPENDITURE_GLMM <- ifelse(data1_subset$standardized_residuals > 3,upper_bound, ifelse(data1_subset$standardized_residuals < -3, lower_bound, data1_subset$EXPENDITURE))
sum(data1_subset$EXPENDITURE!=data1_subset$EXPENDITURE_GLMM)
## [1] 51
# Deteksi pencilan dan transformasi PDRB RENDAH
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2_subset$EXPENDITURE, 0.25)
Q3 <- quantile(data2_subset$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
residuals <- data2_subset$EXPENDITURE - predict(lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = data2_subset))
data2_subset$standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
# Menentukan outliers
outliers <- data2_subset[data2_subset$standardized_residuals > 3 | data2_subset$standardized_residuals < -3, ]
data2glmmoutlier<-nrow(outliers)
data2glmmoutlier
## [1] 29
data2_subset$EXPENDITURE_GLMM <- ifelse(data2_subset$standardized_residuals > 3,upper_bound, ifelse(data2_subset$standardized_residuals < -3, lower_bound, data2_subset$EXPENDITURE))
sum(data2_subset$EXPENDITURE!=data2_subset$EXPENDITURE_GLMM)
## [1] 29
#PDRB Tinggi
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1_subset$EXPENDITURE_GLMM, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1_subset), folds[[fold_idx]])
train_data1_subset <- data1_subset[train_idx, ]
test_data1_subset <- data1_subset[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1_subset[,c(1:22,25)])
predictions_model1 <- predict(model1, test_data1_subset[,c(1:22,25)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data1_subset$EXPENDITURE_GLMM - predictions_model1)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse_bestpdrbtinggi <- mse_values
mean(mse_bestpdrbtinggi)
## [1] 93.31938
#PDRB RENDAH
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2_subset$EXPENDITURE_GLMM, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2_subset), folds[[fold_idx]])
train_data2_subset <- data2_subset[train_idx, ]
test_data2_subset <- data2_subset[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2_subset[,c(1:22,25)])
predictions_model1 <- predict(model1, test_data2_subset[,c(1:22,25)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data2_subset$EXPENDITURE_GLMM - predictions_model1)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse_bestpdrbrendah <- mse_values
mean(mse_bestpdrbrendah)
## [1] 53.89002
# BOGOR
train1_indices <- sample(1:nrow(data1), 0.7 * nrow(data1))
train1 <- data1[train1_indices, ]
test1 <- data1[-train1_indices, ]
# BEKASI
train2_indices <- sample(1:nrow(data2), 0.7 * nrow(data2))
train2 <- data2[train2_indices, ]
test2 <- data2[-train2_indices, ]
# BANDUNG
train3_indices <- sample(1:nrow(data3), 0.7 * nrow(data3))
train3 <- data3[train3_indices, ]
test3 <- data3[-train3_indices, ]
# KUNINGAN
train4_indices <- sample(1:nrow(data4), 0.7 * nrow(data4))
train4 <- data4[train4_indices, ]
test4 <- data4[-train4_indices, ]
# SUKABUMI
train5_indices <- sample(1:nrow(data5), 0.7 * nrow(data5))
train5 <- data5[train5_indices, ]
test5 <- data5[-train5_indices, ]
# BANJAR
train6_indices <- sample(1:nrow(data6), 0.7 * nrow(data6))
train6 <- data6[train6_indices, ]
test6 <- data6[-train6_indices, ]
# Pilih 10 data acak dari setiap data test
test1_sample <- test1[sample(nrow(test1), 10), ]
test2_sample <- test2[sample(nrow(test2), 10), ]
test3_sample <- test3[sample(nrow(test3), 10), ]
test4_sample <- test4[sample(nrow(test4), 10), ]
test5_sample <- test5[sample(nrow(test5), 10), ]
test6_sample <- test6[sample(nrow(test6), 10), ]
#PREDIKSI GRF
prediksi1_grf<-predict(regression_forest(X = train1[,c(2:22)],Y = train1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test1_sample[,c(2:22)])$prediction
prediksi2_grf<-predict(regression_forest(X = train2[,c(2:22)],Y = train2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test2_sample[,c(2:22)])$prediction
prediksi3_grf<-predict(regression_forest(X = train3[,c(2:22)],Y = train3$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test3_sample[,c(2:22)])$prediction
prediksi4_grf<-predict(regression_forest(X = train4[,c(2:22)],Y = train4$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test4_sample[,c(2:22)])$prediction
prediksi5_grf<-predict(regression_forest(X = train5[,c(2:22)],Y = train5$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test5_sample[,c(2:22)])$prediction
prediksi6_grf<-predict(regression_forest(X = train6[,c(2:22)],Y = train6$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test6_sample[,c(2:22)])$prediction
#PREDIKSI RF
prediksi1_rf<-predict(randomForest(x = train1[,c(2:22)],y = train1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test1_sample[,c(2:22)])
prediksi2_rf<-predict(randomForest(x = train2[,c(2:22)],y = train2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test2_sample[,c(2:22)])
prediksi3_rf<-predict(randomForest(x = train3[,c(2:22)],y = train3$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test3_sample[,c(2:22)])
prediksi4_rf<-predict(randomForest(x = train4[,c(2:22)],y = train4$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test4_sample[,c(2:22)])
prediksi5_rf<-predict(randomForest(x = train5[,c(2:22)],y = train5$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test5_sample[,c(2:22)])
prediksi6_rf<-predict(randomForest(x = train6[,c(2:22)],y = train6$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test6_sample[,c(2:22)])
#PREDIKSI GLMM
prediksi1_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1_subset[,c(1:22,25)]),test1_sample[,c(1:22)])
prediksi2_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1_subset[,c(1:22,25)]),test2_sample[,c(1:22)])
prediksi3_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1_subset[,c(1:22,25)]),test3_sample[,c(1:22)])
prediksi4_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2_subset[,c(1:22,25)]),test4_sample[,c(1:22)])
prediksi5_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2_subset[,c(1:22,25)]),test5_sample[,c(1:22)])
prediksi6_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2_subset[,c(1:22,25)]),test6_sample[,c(1:22)])
# Gabungkan hasil prediksi dan data aktual
region_results <- data.frame(
PDRB = c(rep("PDRB TINGGI",30),rep("PDRB RENDAH",30)),
KabupatenKota = c(rep("Kabupaten Bogor",10),rep("Kabupaten Bekasi",10),rep("Kota Bandung",10),rep("Kabupaten Kuningan",10),rep("Kota Sukabumi",10),rep("Kota Banjar",10)),
Aktual = c(test1_sample$EXPENDITURE,test2_sample$EXPENDITURE,test3_sample$EXPENDITURE,test4_sample$EXPENDITURE,test5_sample$EXPENDITURE,test6_sample$EXPENDITURE),
Prediksi_GRF = c(prediksi1_grf,prediksi2_grf,prediksi3_grf,prediksi4_grf,prediksi5_grf,prediksi6_grf),
Prediksi_RF = c(prediksi1_rf,prediksi2_rf,prediksi3_rf,prediksi4_rf,prediksi5_rf,prediksi6_rf),
Prediksi_GLMM = c(prediksi1_glmm,prediksi2_glmm,prediksi3_glmm,prediksi4_glmm,prediksi5_glmm,prediksi6_glmm)
)
region_results
## PDRB KabupatenKota Aktual Prediksi_GRF Prediksi_RF
## 659 PDRB TINGGI Kabupaten Bogor 12.501071 9.815027 7.989517
## 190 PDRB TINGGI Kabupaten Bogor 17.106375 10.947109 11.494637
## 431 PDRB TINGGI Kabupaten Bogor 10.731459 8.369725 6.531418
## 789 PDRB TINGGI Kabupaten Bogor 9.200446 9.206797 8.575179
## 1166 PDRB TINGGI Kabupaten Bogor 3.831905 9.822228 10.331465
## 1060 PDRB TINGGI Kabupaten Bogor 10.029226 9.075334 8.123538
## 946 PDRB TINGGI Kabupaten Bogor 30.499405 16.773495 20.437857
## 560 PDRB TINGGI Kabupaten Bogor 11.396250 7.501485 6.343586
## 1180 PDRB TINGGI Kabupaten Bogor 13.295595 11.310913 10.697819
## 510 PDRB TINGGI Kabupaten Bogor 3.430635 6.725551 5.724190
## 944 PDRB TINGGI Kabupaten Bekasi 10.870286 16.481053 12.898148
## 599 PDRB TINGGI Kabupaten Bekasi 7.182524 12.143560 12.026273
## 267 PDRB TINGGI Kabupaten Bekasi 8.472659 16.526518 19.812923
## 1062 PDRB TINGGI Kabupaten Bekasi 7.747202 11.712518 10.632952
## 234 PDRB TINGGI Kabupaten Bekasi 9.226131 13.926198 16.031033
## 508 PDRB TINGGI Kabupaten Bekasi 17.165119 13.401363 11.734599
## 596 PDRB TINGGI Kabupaten Bekasi 15.994286 11.765753 10.257765
## 277 PDRB TINGGI Kabupaten Bekasi 4.553452 14.886120 15.642327
## 585 PDRB TINGGI Kabupaten Bekasi 27.339643 35.173862 36.423396
## 920 PDRB TINGGI Kabupaten Bekasi 16.875298 13.175692 13.003309
## 962 PDRB TINGGI Kota Bandung 5.667223 12.870983 11.518509
## 229 PDRB TINGGI Kota Bandung 26.143905 29.712047 31.696110
## 828 PDRB TINGGI Kota Bandung 12.414000 14.943072 14.815193
## 1006 PDRB TINGGI Kota Bandung 18.752381 16.167728 17.123652
## 6 PDRB TINGGI Kota Bandung 52.057768 28.070202 28.103482
## 55 PDRB TINGGI Kota Bandung 100.566532 37.028913 45.834561
## 385 PDRB TINGGI Kota Bandung 21.891429 30.653868 35.169800
## 984 PDRB TINGGI Kota Bandung 30.480595 43.937396 43.194766
## 797 PDRB TINGGI Kota Bandung 8.748651 16.180064 16.199360
## 592 PDRB TINGGI Kota Bandung 53.782386 43.520921 55.186510
## 239 PDRB RENDAH Kabupaten Kuningan 7.301118 10.515541 10.284582
## 66 PDRB RENDAH Kabupaten Kuningan 6.987708 9.719509 9.917343
## 460 PDRB RENDAH Kabupaten Kuningan 12.283437 10.445754 9.301367
## 536 PDRB RENDAH Kabupaten Kuningan 6.095655 10.373628 10.373710
## 17 PDRB RENDAH Kabupaten Kuningan 4.798717 10.692705 9.694199
## 172 PDRB RENDAH Kabupaten Kuningan 6.200595 15.244001 15.793091
## 505 PDRB RENDAH Kabupaten Kuningan 8.014107 13.869792 15.845943
## 265 PDRB RENDAH Kabupaten Kuningan 12.759702 10.160049 10.015156
## 741 PDRB RENDAH Kabupaten Kuningan 20.528492 10.007908 8.975652
## 29 PDRB RENDAH Kabupaten Kuningan 10.353633 10.558437 9.733731
## 185 PDRB RENDAH Kota Sukabumi 50.960119 25.565587 28.414589
## 215 PDRB RENDAH Kota Sukabumi 9.631012 11.121488 10.957967
## 31 PDRB RENDAH Kota Sukabumi 3.528087 9.504842 7.285334
## 635 PDRB RENDAH Kota Sukabumi 9.079307 10.307852 9.806025
## 48 PDRB RENDAH Kota Sukabumi 32.956042 30.408110 38.062987
## 578 PDRB RENDAH Kota Sukabumi 6.405748 9.322842 7.930981
## 163 PDRB RENDAH Kota Sukabumi 23.347270 8.649616 6.337246
## 441 PDRB RENDAH Kota Sukabumi 6.763863 8.966821 7.833051
## 159 PDRB RENDAH Kota Sukabumi 4.001238 10.269149 8.020365
## 406 PDRB RENDAH Kota Sukabumi 7.858839 12.522900 12.864564
## 331 PDRB RENDAH Kota Banjar 3.892679 8.314573 8.347802
## 440 PDRB RENDAH Kota Banjar 4.944540 7.624958 6.099811
## 492 PDRB RENDAH Kota Banjar 5.096738 11.378992 9.983033
## 86 PDRB RENDAH Kota Banjar 31.930536 12.431322 12.755451
## 161 PDRB RENDAH Kota Banjar 6.787619 8.344425 8.336612
## 181 PDRB RENDAH Kota Banjar 6.169800 9.739628 8.610552
## 422 PDRB RENDAH Kota Banjar 14.204924 16.423083 17.826331
## 189 PDRB RENDAH Kota Banjar 34.614940 15.874294 18.836843
## 63 PDRB RENDAH Kota Banjar 20.330496 15.938603 19.391738
## 222 PDRB RENDAH Kota Banjar 9.614246 9.198533 8.951412
## Prediksi_GLMM
## 659 12.1340530
## 190 10.1874459
## 431 -0.8318712
## 789 6.5274716
## 1166 11.6902093
## 1060 7.2191546
## 946 18.6083784
## 560 5.7565196
## 1180 12.3576602
## 510 2.2423002
## 944 19.5181214
## 599 10.6300943
## 267 20.6241903
## 1062 10.0242776
## 234 16.3111925
## 508 9.7667986
## 596 10.2856004
## 277 21.0501171
## 585 24.5526742
## 920 16.4476702
## 962 14.1282739
## 229 25.9484938
## 828 7.9867442
## 1006 7.9328760
## 6 27.5320905
## 55 27.6778884
## 385 25.0301523
## 984 30.9369436
## 797 11.9170967
## 592 34.9066155
## 239 10.5660903
## 66 4.2314220
## 460 11.8571302
## 536 10.1715539
## 17 9.7708017
## 172 15.1080927
## 505 16.6764552
## 265 8.2877625
## 741 9.1942093
## 29 10.5264990
## 185 21.2908190
## 215 10.4813651
## 31 7.1160977
## 635 6.9183997
## 48 24.5285213
## 578 6.8588499
## 163 4.8736837
## 441 7.2930623
## 159 8.6711066
## 406 8.1351052
## 331 5.3463632
## 440 6.6495345
## 492 14.7589172
## 86 23.3525389
## 161 8.4884041
## 181 11.6434171
## 422 20.8888274
## 189 17.6909857
## 63 18.8988670
## 222 10.3753906