Analisis ini bertujuan membandingkan performa tiga metode pemodelan: - Generalized Random Forest (GRF) - Random Forest (RF) - Generalized Linear Mixed Model (GLMM)
Metode evaluasi menggunakan 10-fold cross-validation dengan 100 kali pengulangan. Setelah model terbaik diperoleh, dilakukan deteksi pencilan menggunakan standardized residual. Data kemudian ditransformasi menggunakan winsorization berbasis IQR, dan pemodelan diulang hingga diperoleh model terbaik setelah transformasi.
# Load library
library(grf)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(lme4)
## Loading required package: Matrix
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
library(moments)
# Load dataset
data <- read.csv("D:/Download/rawfixs.csv")
# Cek data
str(data)
## 'data.frame': 25686 obs. of 29 variables:
## $ Nomor.urut.RT : int 16702 16720 16717 16711 16699 16714 16708 16696 16693 16705 ...
## $ No.Urut.Nks : int 17685 17685 17685 17685 17685 17685 17685 17685 17685 17685 ...
## $ No.Urut.Ruta : int 124503 261097 296866 84174 95901 36919 212233 232632 311648 30008 ...
## $ Provinsi : int 32 32 32 32 32 32 32 32 32 32 ...
## $ Kabupaten.Kota : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Hubungan.Dengan.Kepala.Rumah.Tangga..Krt.: int 1 1 1 1 1 1 1 1 1 1 ...
## $ REGION : int 2 2 2 2 2 2 2 2 2 2 ...
## $ GENDER : int 1 1 1 1 1 1 1 1 1 1 ...
## $ AGE : int 49 40 58 69 57 31 33 32 37 30 ...
## $ EDU : int 1 1 4 1 1 1 1 2 1 1 ...
## $ SAVING : num 100 0 66.7 0 0 ...
## $ ILLITERATE : num 100 100 100 100 100 ...
## $ FOOD : int 0 0 0 0 0 0 0 1 0 1 ...
## $ PLACE : int 1 1 1 1 1 1 0 1 1 1 ...
## $ HOUSE : int 96 60 63 42 108 98 42 56 36 96 ...
## $ OTPLACE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ROOF : int 3 3 3 3 3 3 3 3 3 3 ...
## $ WALL : int 4 4 4 2 4 4 4 4 3 4 ...
## $ FLOOR : int 5 5 5 2 5 5 5 5 2 5 ...
## $ DEFECATION : int 1 1 1 1 1 1 1 1 1 1 ...
## $ WATER : int 3 3 3 3 3 4 3 3 3 3 ...
## $ ELECTRICITY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CREDIT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LAND : int 1 1 1 1 0 0 0 0 0 0 ...
## $ INCOME : int 0 0 0 0 0 0 0 0 1 0 ...
## $ KKS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MICRO : int 1 0 0 0 1 0 0 0 0 0 ...
## $ EXPENDITURE : num 30.55 7.71 39.7 7.36 19.61 ...
summary(data)
## Nomor.urut.RT No.Urut.Nks No.Urut.Ruta Provinsi Kabupaten.Kota
## Min. : 4279 Min. : 7 Min. : 7 Min. :32 Min. : 1.00
## 1st Qu.:114517 1st Qu.: 8100 1st Qu.: 82890 1st Qu.:32 1st Qu.: 6.00
## Median :191808 Median :16908 Median :160399 Median :32 Median :13.00
## Mean :181524 Mean :16604 Mean :162151 Mean :32 Mean :28.28
## 3rd Qu.:259275 3rd Qu.:24591 3rd Qu.:243245 3rd Qu.:32 3rd Qu.:72.00
## Max. :338803 Max. :32971 Max. :326034 Max. :32 Max. :79.00
## Hubungan.Dengan.Kepala.Rumah.Tangga..Krt. REGION GENDER
## Min. :1 Min. :1.000 Min. :1.00
## 1st Qu.:1 1st Qu.:1.000 1st Qu.:1.00
## Median :1 Median :1.000 Median :1.00
## Mean :1 Mean :1.345 Mean :1.15
## 3rd Qu.:1 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :1 Max. :2.000 Max. :2.00
## AGE EDU SAVING ILLITERATE
## Min. :15.00 Min. :1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:38.00 1st Qu.:1.000 1st Qu.: 0.00 1st Qu.: 75.00
## Median :48.00 Median :2.000 Median : 28.57 Median :100.00
## Mean :48.57 Mean :1.974 Mean : 33.63 Mean : 89.46
## 3rd Qu.:58.00 3rd Qu.:3.000 3rd Qu.: 50.00 3rd Qu.:100.00
## Max. :97.00 Max. :4.000 Max. :100.00 Max. :100.00
## FOOD PLACE HOUSE OTPLACE
## Min. :0.00 Min. :0.0000 Min. : 36.0 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:1.0000 1st Qu.: 40.0 1st Qu.:0.0000
## Median :0.00 Median :1.0000 Median : 60.0 Median :0.0000
## Mean :0.23 Mean :0.8051 Mean : 70.6 Mean :0.0934
## 3rd Qu.:0.00 3rd Qu.:1.0000 3rd Qu.: 84.0 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000 Max. :650.0 Max. :1.0000
## ROOF WALL FLOOR DEFECATION
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:3.00 1st Qu.:4.000 1st Qu.:5.000 1st Qu.:1.0000
## Median :3.00 Median :4.000 Median :5.000 Median :1.0000
## Mean :2.87 Mean :3.733 Mean :4.413 Mean :0.9666
## 3rd Qu.:3.00 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:1.0000
## Max. :4.00 Max. :4.000 Max. :5.000 Max. :1.0000
## WATER ELECTRICITY CREDIT LAND
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3.000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :3.498 Mean :0.9042 Mean :0.2488 Mean :0.7134
## 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## INCOME KKS AID MICRO
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.0793 Mean :0.1186 Mean :0.2361 Mean :0.07981
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## EXPENDITURE
## Min. : 1.596
## 1st Qu.: 6.615
## Median : 10.393
## Mean : 14.744
## 3rd Qu.: 17.295
## Max. :401.382
# Konversi variabel yang diperlukan
# Misalnya, pastikan variabel kategorikal adalah faktor
data$REGION <- as.factor(data$REGION)
# Fungsi evaluasi performa
evaluate_grf <- function(model, test_data, response, predictors) {
preds <- predict(model, newdata = test_data[,c(8:28)])$prediction
mse <- mean((test_data[,c(29)] - preds)^2)
return(mse)
}
# Deteksi pencilan dengan standardized residual
detect_outliers_grf <- function(model, data, response, predictors) {
residuals <- data[,c(29)] - predict(model, newdata = data[,c(8:28)])$prediction
standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
return(abs(standardized_residuals) > 3)
}
# Deteksi pencilan dengan standardized residual RF
detect_outliers_rf <- function(model, data, response, predictors) {
residuals <- data[,c(29)] - predict(model, newdata = data[,c(8:28)])
standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
return(abs(standardized_residuals) > 3)
}
# GRF Model
data1<- data %>% filter(Kabupaten.Kota %in% c(1, 16, 73))
response <- "EXPENDITURE"
predictors <- setdiff(names(data1), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse1_m2), mean(mse1_m5), mean(mse1_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1[,c(29)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_k500<-mse_values
# Dataframe dengan label k untuk sumbu X
graf2_data1 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse1_k50), mean(mse1_k100), mean(mse1_k150), mean(mse1_k250), mean(mse1_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#PDRB Rendah
#########################################
data2<-data %>% filter(Kabupaten.Kota %in% c(8, 72, 79))
response <- "EXPENDITURE"
predictors <- setdiff(names(data2), c("EXPENDITURE", "REGION"))
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 500, mtry = 2)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m2<-mse_values
############################
# m = 5
#########################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 500, mtry = 5)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m5<-mse_values
###################################
## m = 9
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_m9<-mse_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSE = c(mean(mse2_m2), mean(mse2_m5), mean(mse2_m9)) # Nilai MSE
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSE"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 50, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k50<-mse_values
###################################
## k = 100
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 100, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k100<-mse_values
###################################
## k = 150
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 150, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k150<-mse_values
###################################
## k = 250
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 250, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k250<-mse_values
###################################
## k = 500
#################################
set.seed(123)
grfs <- list()
mse_grf <- c()
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2[,c(29)],num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_k500<-mse_values
graf2_data2 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"), levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan sumbu X
MSE = c(mean(mse2_k50), mean(mse2_k100), mean(mse2_k150), mean(mse2_k250), mean(mse2_k500)) # Nilai MSE
)
# Scatter plot dengan ggplot2
library(ggplot2)
ggplot(graf2_data2, aes(x = m, y = MSE)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSE Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSE (Mean Squared Error)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotasi label sumbu X
legend.position = "none" # Tidak ada legenda
)
#GRAFIK GABUNGAN GRF
#Grafik Gabungan
# Contoh data MSE untuk 6 kabupaten/kota
graf3 <- data.frame(
m = rep(c("m=2", "m=5", "m=9"), times = 2), # Label m untuk sumbu X
MSE = c(mean(mse1_m2), mean(mse1_m5), mean(mse1_m9), # PDRB Tinggi
mean(mse2_m2), mean(mse2_m5), mean(mse2_m9)), # PDRB Rendah
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"),
each = 3) # Nama kabupaten/kota
)
# Hitung rentang skala Y berdasarkan graf3 dengan margin ±10
y_min <- min(graf3$MSE, na.rm = TRUE) - 10
y_max <- max(graf3$MSE, na.rm = TRUE) + 10
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
# Grafik untuk MSE dengan margin pada skala Y
ggplot(graf3, aes(x = m, y = MSE, group = PDRB, shape = PDRB)) +
geom_point(size = 3, color = "black") + # Titik dengan warna hitam
geom_line(linewidth = 1, color = "black") + # Garis dengan warna hitam
scale_shape_manual(
values = c("PDRB TINGGI" = 17, # Segitiga
"PDRB RENDAH" = 16) # Bulat
) +
scale_y_continuous(
limits = c(y_min, y_max), # Tambahkan margin ±10 ke rentang skala Y
expand = c(0, 0) # Hilangkan margin otomatis
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Banyak peubah penjelas",
y = "MSE (Mean Squared Error)",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
#Grafik Gabungan
# Contoh data MSE untuk 6 kabupaten/kota
graf4 <- data.frame(
m = rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 2), # Label m untuk sumbu X
MSE = c(mean(mse1_k50), mean(mse1_k100), mean(mse1_k150),mean(mse1_k250), mean(mse1_k500), # Kabupaten/kota 1
mean(mse2_k50), mean(mse2_k100), mean(mse2_k150),mean(mse2_k250), mean(mse2_k500)), # Kabupaten/kota 2
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"),
each = 5) # Nama kabupaten/kota
)
library(ggplot2)
# Dataframe graf4
graf4 <- data.frame(
m = factor(rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 2),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Pastikan urutan level
MSE = c(mean(mse1_k50), mean(mse1_k100), mean(mse1_k150), mean(mse1_k250), mean(mse1_k500), # PDRB Tinggi
mean(mse2_k50), mean(mse2_k100), mean(mse2_k150), mean(mse2_k250), mean(mse2_k500)), # PDRB Rendah
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"), each = 5) # Kategori PDRB
)
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
ggplot(graf4, aes(x = m, y = MSE, group = PDRB, shape = PDRB)) +
geom_point(size = 3, color = "black", na.rm = TRUE) + # Titik dengan warna hitam
geom_line(linewidth = 1, color = "black", na.rm = TRUE) + # Garis dengan warna hitam
scale_shape_manual(
values = c("PDRB TINGGI" = 17, # Segitiga
"PDRB RENDAH" = 16) # Bulat
) +
scale_y_continuous(
limits = c(y_min,y_max),
expand = c(0, 0) # Sesuaikan skala Y berdasarkan nilai MSE
) +
labs(
title = "Perbandingan MSE untuk Jumlah Pohon pada Daerah PDRB Tinggi dan Rendah",
x = "Jumlah Pohon (k)",
y = "MSE (Mean Squared Error)",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
## Deteksi Generalized Random Forest (GRF)
# Deteksi pencilan dan transformasi BOGOR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1$EXPENDITURE, 0.25)
Q3 <- quantile(data1$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data1[,c(8:28)],Y = data1[,c(29)],num.trees = 500, mtry = 9), data1, response, predictors)
data1outlier<-sum(outliers)
data1$EXPENDITURE_TRANSFORMED <- data1$EXPENDITURE
data1$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data1$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
sum(data1$EXPENDITURE!=data1$EXPENDITURE_TRANSFORMED)
## [1] 55
# Deteksi pencilan dan transformasi BEKASI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2$EXPENDITURE, 0.25)
Q3 <- quantile(data2$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_grf(regression_forest(X = data2[,c(8:28)],Y = data2[,c(29)],num.trees = 500, mtry = 9), data2, response, predictors)
data2outlier<-sum(outliers)
data2$EXPENDITURE_TRANSFORMED <- data2$EXPENDITURE
data2$EXPENDITURE_TRANSFORMED[outliers] <- ifelse(data2$EXPENDITURE_TRANSFORMED[outliers] < lower_bound, lower_bound, upper_bound)
#Pemodelan GRF setelah ditransformasi
# Pemodelan ulang setelah transformasi
# PDRB TINGGI
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data1[,c(8:28)],
Y = train_data1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data1$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data1[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse1_best<-mse_values
##################################
#PDRB RENDAH
#############################
set.seed(123)
mse_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE_TRANSFORMED, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
grf_model <- regression_forest(X = train_data2[,c(8:28)],
Y = train_data2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9)
mse_fold[fold_idx] <- mean((test_data2$EXPENDITURE_TRANSFORMED - predict(grf_model, newdata = test_data2[,c(8:28)])$prediction)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse2_best<-mse_values
# RF Model
# KAB BOGOR
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf1_m2), mean(mserf1_m5), mean(mserf1_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1[,c(29)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1[,c(29)] - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_k500<-mserf_values
# Pastikan kolom m diatur sebagai faktor dengan level terurut
graf2_data1 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level
MSERF = c(mean(mserf1_k50), mean(mserf1_k100), mean(mserf1_k150),
mean(mserf1_k250), mean(mserf1_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#PDRB RENDAH
#########################################
# KAB PDRB RENDAH
# Setup cross-validation
############################
# m = 2
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 500, mtry = 2)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m2<-mserf_values
############################
# m = 5
#########################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m5<-mserf_values
###################################
## m = 9
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 500, mtry = 9)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_m9<-mserf_values
# Buat dataframe dengan label
graf1 <- data.frame(
m = factor(c("m=2", "m=5", "m=9")), # Label kategori
MSERF = c(mean(mserf2_m2), mean(mserf2_m5), mean(mserf2_m9)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan label "m=2", "m=5", "m=9" di sumbu X
ggplot(graf1, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "",
x = "Banyak peubah penjelas",
y = "MSERF"
) +
theme_minimal()
###################################
## k = 50
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 50, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k50<-mserf_values
###################################
## k = 100
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 100, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k100<-mserf_values
###################################
## k = 150
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 150, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k150<-mserf_values
###################################
## k = 250
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 250, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k250<-mserf_values
###################################
## k = 500
#################################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2[,c(29)],ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2[,c(29)] - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_k500<-mserf_values
# Buat dataframe dengan label
graf2_data2 <- data.frame(
m = factor(c("k=50", "k=100", "k=150", "k=250", "k=500"),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Urutan level eksplisit
MSERF = c(mean(mserf2_k50), mean(mserf2_k100), mean(mserf2_k150),
mean(mserf2_k250), mean(mserf2_k500)) # Nilai MSERF
)
library(ggplot2)
# Scatter plot dengan sumbu X terurut
ggplot(graf2_data2, aes(x = m, y = MSERF)) +
geom_point(size = 4, color = "black") + # Titik
geom_line(aes(group = 1), color = "black", linetype = "solid") + # Garis penghubung
labs(
title = "Perbandingan MSERF Berdasarkan Jumlah Pohon",
x = "Jumlah Pohon",
y = "MSERF"
) +
theme_minimal()
#GRAFIK GABUNGAN RF
#Grafik Gabungan
# Contoh data MSERF untuk 6 kabupaten/kota
graf3 <- data.frame(
m = rep(c("m=2", "m=5", "m=9"), times = 2), # Label m untuk sumbu X
MSERF = c(mean(mserf1_m2), mean(mserf1_m5), mean(mserf1_m9), # Kabupaten/kota 1
mean(mserf2_m2), mean(mserf2_m5), mean(mserf2_m9)), # Kabupaten/kota 2
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"),
each = 3) # Nama kabupaten/kota
)
library(ggplot2)
library(dplyr)
# Hitung rentang skala Y berdasarkan graf3 dengan margin ±10
y_min <- min(graf3$MSERF, na.rm = TRUE) - 10
y_max <- max(graf3$MSERF, na.rm = TRUE) + 10
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
# Grafik untuk MSE dengan margin pada skala Y
ggplot(graf3, aes(x = m, y = MSERF, group = PDRB, shape = PDRB)) +
geom_point(size = 3, color = "black") + # Titik dengan warna hitam
geom_line(linewidth = 1, color = "black") + # Garis dengan warna hitam
scale_shape_manual(
values = c("PDRB TINGGI" = 17, # Segitiga
"PDRB RENDAH" = 16) # Bulat
) +
scale_y_continuous(
limits = c(y_min, y_max), # Tambahkan margin ±10 ke rentang skala Y
expand = c(0, 0) # Hilangkan margin otomatis
) +
labs(
title = "Perbandingan MSE untuk Daerah PDRB Tinggi dan Rendah",
x = "Banyak peubah penjelas",
y = "MSE (Mean Squared Error)",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
#Grafik Gabungan
# Contoh data MSERF untuk 6 kabupaten/kota
# Pastikan library yang diperlukan telah dimuat
library(dplyr)
library(ggplot2)
# Contoh data MSE untuk 6 kabupaten/kota
graf4 <- data.frame(
m = rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 2), # Label m untuk sumbu X
MSERF = c(mean(mserf1_k50), mean(mserf1_k100), mean(mserf1_k150),mean(mserf1_k250), mean(mserf1_k500), # Kabupaten/kota 1
mean(mserf2_k50), mean(mserf2_k100), mean(mserf2_k150),mean(mserf2_k250), mean(mserf2_k500)), # Kabupaten/kota 2
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"),
each = 5) # Nama kabupaten/kota
)
library(ggplot2)
# Dataframe graf4
graf4 <- data.frame(
m = factor(rep(c("k=50", "k=100", "k=150", "k=250", "k=500"), times = 2),
levels = c("k=50", "k=100", "k=150", "k=250", "k=500")), # Pastikan urutan level
MSERF = c(mean(mserf1_k50), mean(mserf1_k100), mean(mserf1_k150),mean(mserf1_k250), mean(mserf1_k500), # PDRB TINGGI
mean(mserf2_k50), mean(mserf2_k100), mean(mserf2_k150),mean(mserf2_k250), mean(mserf2_k500)), # PDRB RENDAH
PDRB = rep(c("PDRB TINGGI", "PDRB RENDAH"), each = 5) # Kategori PDRB
)
# Grafik untuk MSE dengan PDRB Tinggi dan Rendah
ggplot(graf4, aes(x = m, y = MSERF, group = PDRB, shape = PDRB)) +
geom_point(size = 3, color = "black", na.rm = TRUE) + # Titik dengan warna hitam
geom_line(linewidth = 1, color = "black", na.rm = TRUE) + # Garis dengan warna hitam
scale_shape_manual(
values = c("PDRB TINGGI" = 17, # Segitiga
"PDRB RENDAH" = 16) # Bulat
) +
scale_y_continuous(
limits = c(y_min,y_max),
expand = c(0, 0) # Sesuaikan skala Y berdasarkan nilai MSE
) +
labs(
title = "Perbandingan MSE untuk Jumlah Pohon pada Daerah PDRB Tinggi dan Rendah",
x = "Jumlah Pohon (k)",
y = "MSE (Mean Squared Error)",
shape = "Kategori PDRB"
) +
theme_minimal() +
theme(legend.position = "right")
## Deteksi Random Forest (RF)
# Deteksi pencilan dan transformasi BOGOR
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1$EXPENDITURE, 0.25)
Q3 <- quantile(data1$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data1[,c(8:28)],y = data1[,c(29)],ntree = 500, mtry = 5), data1, response, predictors)
data1rfoutlier<-sum(outliers)
data1$EXPENDITURE_TRANSFORMED_RF <- data1$EXPENDITURE
data1$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data1$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
sum(data1$EXPENDITURE!=data1$EXPENDITURE_TRANSFORMED_RF)
## [1] 64
# Deteksi pencilan dan transformasi PDRB Rendah
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2$EXPENDITURE, 0.25)
Q3 <- quantile(data2$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- detect_outliers_rf(randomForest(x = data2[,c(8:28)],y = data2[,c(29)],ntree = 500, mtry = 5), data2, response, predictors)
data2rfoutlier<-sum(outliers)
data2$EXPENDITURE_TRANSFORMED_RF <- data2$EXPENDITURE
data2$EXPENDITURE_TRANSFORMED_RF[outliers] <- ifelse(data2$EXPENDITURE_TRANSFORMED_RF[outliers] < lower_bound, lower_bound, upper_bound)
#Pemodelan setelah ditransformasi RF
# Pemodelan ulang setelah transformasi
# PDRB TINGGI
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data1[,c(8:28)],
y = train_data1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data1$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data1[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf1_best<-mserf_values
##################################
#PDRB RENDAH
#############################
set.seed(123)
mserf_values<-c()
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE_TRANSFORMED_RF, k = 10)
mserf_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
rf_model <- randomForest(x = train_data2[,c(8:28)],
y = train_data2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5)
mserf_fold[fold_idx] <- mean((test_data2$EXPENDITURE_TRANSFORMED_RF - predict(rf_model, newdata = test_data2[,c(8:28)]))^2)
}
# Simpan rata-rata MSERF untuk pengulangan ini
mserf_values[rep] <- mean(mserf_fold)
}
mserf2_best<-mserf_values
#PDRB TINGGI
library(lme4)
# GLMM Model
set.seed(123)
glmms <- list()
rmse_glmm <- c()
# Model awal (Full Model)
model1 <- lmer(EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD + PLACE + HOUSE + OTPLACE +
ROOF + WALL + FLOOR + DEFECATION + WATER + ELECTRICITY + CREDIT + LAND + INCOME +
KKS + AID + MICRO + (1|REGION), data = data1[, c(7:29)])
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION +
## WATER + ELECTRICITY + CREDIT + LAND + INCOME + KKS + AID +
## MICRO + (1 | REGION)
## Data: data1[, c(7:29)]
##
## REML criterion at convergence: 27952.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8335 -0.5400 -0.1342 0.3070 17.6012
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 3.219 1.794
## Residual 162.697 12.755
## Number of obs: 3525, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -15.275527 3.337000 -4.578
## GENDER 3.572532 0.651713 5.482
## AGE -0.031085 0.019139 -1.624
## EDU 1.505705 0.258626 5.822
## SAVING 0.161656 0.007922 20.407
## ILLITERATE 0.083813 0.012684 6.608
## FOOD -2.012526 0.536135 -3.754
## PLACE -1.228492 0.673387 -1.824
## HOUSE 0.067009 0.004668 14.356
## OTPLACE 3.302450 0.690325 4.784
## ROOF 0.054974 0.545962 0.101
## WALL -0.370684 0.580234 -0.639
## FLOOR 0.148272 0.256074 0.579
## DEFECATION -1.785663 1.114247 -1.603
## WATER 3.308000 0.320265 10.329
## ELECTRICITY -0.216089 0.713837 -0.303
## CREDIT 0.424801 0.544739 0.780
## LAND 1.669997 0.606626 2.753
## INCOME 4.916813 0.944850 5.204
## KKS -1.426719 0.944553 -1.510
## AID -2.566913 0.679201 -3.779
## MICRO 0.420176 0.792918 0.530
##
## Correlation matrix not shown by default, as p = 22 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
############################
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loaded glmnet 4.1-8
X <- model.matrix(~ . - EXPENDITURE - REGION, data1[, c(7:29)])
y <- data1$EXPENDITURE
lasso_model <- cv.glmnet(X, y, alpha = 1, family = "gaussian")
# Ekstrak koefisien sebagai matriks
lasso_coefs <- as.matrix(coef(lasso_model, s = "lambda.min"))
# Ambil nama variabel dengan koefisien tidak nol (selain intercept)
selected_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0][-1] # Hilangkan intercept
# Pastikan ada variabel yang dipilih
if (length(selected_vars) == 0) {
stop("Tidak ada variabel yang dipilih oleh Lasso.")
}
# Buat formula dinamis
formula <- as.formula(
paste("EXPENDITURE ~", paste(selected_vars, collapse = " + "), "+ (1 | REGION)")
)
# Model GLMM dengan variabel terpilih
model2 <- lmer(formula, data = data1[, c(7:29)])
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + WALL + DEFECATION + WATER + CREDIT +
## LAND + INCOME + KKS + AID + MICRO + (1 | REGION)
## Data: data1[, c(7:29)]
##
## REML criterion at convergence: 27954.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8370 -0.5384 -0.1338 0.3075 17.6079
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 3.147 1.774
## Residual 162.578 12.751
## Number of obs: 3525, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -15.167415 3.001223 -5.054
## GENDER 3.569155 0.651145 5.481
## AGE -0.031567 0.019068 -1.656
## EDU 1.508437 0.258370 5.838
## SAVING 0.161764 0.007916 20.436
## ILLITERATE 0.083893 0.012671 6.621
## FOOD -2.024950 0.532637 -3.802
## PLACE -1.229617 0.671486 -1.831
## HOUSE 0.067079 0.004623 14.509
## OTPLACE 3.309257 0.689136 4.802
## WALL -0.263925 0.540637 -0.488
## DEFECATION -1.681413 1.094772 -1.536
## WATER 3.316612 0.319713 10.374
## CREDIT 0.421793 0.544087 0.775
## LAND 1.673777 0.605833 2.763
## INCOME 4.919122 0.943700 5.213
## KKS -1.433867 0.944010 -1.519
## AID -2.597266 0.677037 -3.836
## MICRO 0.409276 0.792225 0.517
##
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
###############
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = train_data1[, c(7:29)])
model2<-lmer(formula, data = train_data1[, c(7:29)])
predictions_model1 <- predict(model1, test_data1[, c(7:28)])
predictions_model2<-predict(model2, test_data1[, c(7:28)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data1$EXPENDITURE - predictions_model1)^2)
mse_best_fold[fold_idx] <- mean((test_data1$EXPENDITURE - predictions_model2)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
mse_bestvalues[rep]<-mean(mse_best_fold)
}
msepdrbtinggi_model1<-mse_values
mean(msepdrbtinggi_model1)
## [1] 164.0136
msepdrbtinggi_model2<-mse_bestvalues
mean(msepdrbtinggi_model2)
## [1] 163.8556
# GLMM Model
set.seed(123)
glmms <- list()
rmse_glmm <- c()
# Gunakan kolom 7 hingga 29
# Model awal (Full Model)
model1 <- lmer(EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD + PLACE + HOUSE + OTPLACE +
ROOF + WALL + FLOOR + DEFECATION + WATER + ELECTRICITY + CREDIT + LAND + INCOME +
KKS + AID + MICRO + (1|REGION), data = data2[, c(7:29)])
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + AGE + EDU + SAVING + ILLITERATE + FOOD +
## PLACE + HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION +
## WATER + ELECTRICITY + CREDIT + LAND + INCOME + KKS + AID +
## MICRO + (1 | REGION)
## Data: data2[, c(7:29)]
##
## REML criterion at convergence: 15031.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0629 -0.5397 -0.1316 0.2934 10.5942
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 0.5504 0.7419
## Residual 94.2717 9.7094
## Number of obs: 2036, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.508e+01 3.936e+00 -3.830
## GENDER 1.288e+00 6.541e-01 1.969
## AGE 8.325e-04 1.854e-02 0.045
## EDU 1.910e+00 2.547e-01 7.496
## SAVING 7.854e-02 7.633e-03 10.290
## ILLITERATE 5.025e-02 1.354e-02 3.711
## FOOD -8.299e-01 5.348e-01 -1.552
## PLACE 6.538e-01 8.609e-01 0.760
## HOUSE 5.389e-02 5.315e-03 10.138
## OTPLACE 5.431e+00 7.355e-01 7.384
## ROOF 7.804e-01 7.417e-01 1.052
## WALL 8.798e-01 4.218e-01 2.086
## FLOOR 7.575e-02 2.348e-01 0.323
## DEFECATION -2.313e+00 2.299e+00 -1.006
## WATER 2.697e+00 3.645e-01 7.399
## ELECTRICITY -1.478e+00 8.626e-01 -1.714
## CREDIT -1.031e+00 4.920e-01 -2.095
## LAND -4.430e-01 8.094e-01 -0.547
## INCOME 1.941e+00 7.893e-01 2.459
## KKS -1.125e+00 7.478e-01 -1.504
## AID -4.925e-01 6.212e-01 -0.793
## MICRO 5.127e-01 6.931e-01 0.740
##
## Correlation matrix not shown by default, as p = 22 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
############################
library(glmnet)
X <- model.matrix(~ . - EXPENDITURE - REGION, data2[, c(7:29)])
y <- data2$EXPENDITURE
lasso_model <- cv.glmnet(X, y, alpha = 1, family = "gaussian")
# Ekstrak koefisien sebagai matriks
lasso_coefs <- as.matrix(coef(lasso_model, s = "lambda.min"))
# Ambil nama variabel dengan koefisien tidak nol (selain intercept)
selected_vars <- rownames(lasso_coefs)[lasso_coefs[, 1] != 0][-1] # Hilangkan intercept
# Pastikan ada variabel yang dipilih
if (length(selected_vars) == 0) {
stop("Tidak ada variabel yang dipilih oleh Lasso.")
}
# Buat formula dinamis
formula <- as.formula(
paste("EXPENDITURE ~", paste(selected_vars, collapse = " + "), "+ (1 | REGION)")
)
# Model GLMM dengan variabel terpilih
model2 <- lmer(formula, data = data2[, c(7:29)])
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: EXPENDITURE ~ GENDER + EDU + SAVING + ILLITERATE + FOOD + PLACE +
## HOUSE + OTPLACE + ROOF + WALL + FLOOR + DEFECATION + WATER +
## ELECTRICITY + CREDIT + INCOME + KKS + AID + MICRO + (1 | REGION)
## Data: data2[, c(7:29)]
##
## REML criterion at convergence: 15027.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0615 -0.5404 -0.1305 0.2961 10.5911
##
## Random effects:
## Groups Name Variance Std.Dev.
## REGION (Intercept) 0.5162 0.7184
## Residual 94.1944 9.7054
## Number of obs: 2036, groups: REGION, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -14.995973 3.856657 -3.888
## GENDER 1.298993 0.647157 2.007
## EDU 1.914000 0.247962 7.719
## SAVING 0.078478 0.007627 10.290
## ILLITERATE 0.050251 0.013223 3.800
## FOOD -0.823268 0.534164 -1.541
## PLACE 0.324253 0.598867 0.541
## HOUSE 0.053698 0.005266 10.197
## OTPLACE 5.378622 0.723052 7.439
## ROOF 0.774282 0.741191 1.045
## WALL 0.857705 0.419058 2.047
## FLOOR 0.073261 0.234561 0.312
## DEFECATION -2.321246 2.297475 -1.010
## WATER 2.699403 0.363652 7.423
## ELECTRICITY -1.492111 0.861492 -1.732
## CREDIT -1.028940 0.488538 -2.106
## INCOME 1.951953 0.768727 2.539
## KKS -1.113897 0.747210 -1.491
## AID -0.494261 0.620708 -0.796
## MICRO 0.510455 0.692818 0.737
##
## Correlation matrix not shown by default, as p = 20 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
###############
library(glmmLasso)
## Warning: package 'glmmLasso' was built under R version 4.4.2
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
mse_fold3<-numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = train_data2[, c(7:29)])
model2<-lmer(formula, data = train_data2[, c(7:29)])
predictions_model1 <- predict(model1, test_data2[, c(7:28)])
predictions_model2<-predict(model2, test_data2[, c(7:28)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data2$EXPENDITURE - predictions_model1)^2)
mse_best_fold[fold_idx] <- mean((test_data2$EXPENDITURE - predictions_model2)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
mse_bestvalues[rep]<-mean(mse_best_fold)
}
msepdrbrendah_model1<-mse_values
mean(msepdrbrendah_model1)
## [1] 95.93487
msepdrbrendah_model2<-mse_bestvalues
mean(msepdrbrendah_model2)
## [1] 95.77551
#GRAFIK MODEL 1 VS MODEL 2
mse_pdrbtinggi<-c(mean(msepdrbtinggi_model1),mean(msepdrbtinggi_model2))
mse_pdrbrendah<-c(mean(msepdrbrendah_model1),mean(msepdrbrendah_model2))
# Misalnya, mse_pdrbtinggi dan mse_pdrbrendah sudah dihitung sebelumnya
mse_pdrbtinggi <- c(mean(msepdrbtinggi_model1), mean(msepdrbtinggi_model2))
mse_pdrbrendah <- c(mean(msepdrbrendah_model1), mean(msepdrbrendah_model2))
# Membuat data frame untuk scatter plot
datapdrb <- data.frame(
Model = rep(c("Model 1", "Model 2"), 2), # Menyusun dua model
MSE = c(mse_pdrbtinggi, mse_pdrbrendah), # Nilai MSE dari dua model
PDRB = c("PDRB Tinggi", "PDRB Tinggi", "PDRB Rendah", "PDRB Rendah") # Kategori PDRB
)
library(ggplot2)
# Membuat scatter plot dengan garis penghubung
ggplot(datapdrb, aes(x = Model, y = MSE, color = PDRB, group = PDRB)) +
geom_point(size = 4) + # Ukuran titik
geom_line(aes(group = PDRB), color = "black", linetype = "solid") + # Menambahkan garis penghubung antara model 1 dan model 2
scale_color_manual(values = c("PDRB Tinggi" = "lightblue", "PDRB Rendah" = "maroon")) + # Warna titik
labs(
title = "MSE Model 1 vs Model 2 berdasarkan PDRB",
x = "Model",
y = "MSE",
color = "Kategori PDRB"
) +
theme_minimal() + # Tema minimal
theme(legend.position = "right") # Posisi legenda di sebelah kanan
# Deteksi pencilan dan transformasi PDRB TINGGI
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data1$EXPENDITURE, 0.25)
Q3 <- quantile(data1$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
residuals <- data1$EXPENDITURE - predict(lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = data1[, c(7:29)]))
data1$standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
# Menentukan outliers
outliers <- data1[data1$standardized_residuals > 3 | data1$standardized_residuals < -3, ]
data1glmmoutlier<-nrow(outliers)
data1$EXPENDITURE_GLMM <- ifelse(data1$standardized_residuals > 3,upper_bound, ifelse(data1$standardized_residuals < -3, lower_bound, data1$EXPENDITURE))
sum(data1$EXPENDITURE!=data1$EXPENDITURE_GLMM)
## [1] 51
# Deteksi pencilan dan transformasi PDRB RENDAH
# Menghitung kuartil Q1 dan Q3
Q1 <- quantile(data2$EXPENDITURE, 0.25)
Q3 <- quantile(data2$EXPENDITURE, 0.75)
# Menghitung IQR
IQR_value <- Q3 - Q1
# Menghitung batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
residuals <- data2$EXPENDITURE - predict(lmer(EXPENDITURE ~ . - REGION + (1|REGION), data = data2[, c(7:29)]))
data2$standardized_residuals <- (residuals - mean(residuals)) / sd(residuals)
# Menentukan outliers
outliers <- data2[data2$standardized_residuals > 3 | data2$standardized_residuals < -3, ]
data2glmmoutlier<-nrow(outliers)
data2glmmoutlier
## [1] 29
data2$EXPENDITURE_GLMM <- ifelse(data2$standardized_residuals > 3,upper_bound, ifelse(data2$standardized_residuals < -3, lower_bound, data2$EXPENDITURE))
sum(data2$EXPENDITURE!=data2$EXPENDITURE_GLMM)
## [1] 29
#PDRB Tinggi
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data1$EXPENDITURE_GLMM, k = 10)
mse_fold <- numeric(10)
mse_best_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data1), folds[[fold_idx]])
train_data1 <- data1[train_idx, ]
test_data1 <- data1[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1[, c(7:28,33)])
predictions_model1 <- predict(model1, test_data1[,c(7:28,33)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data1$EXPENDITURE_GLMM - predictions_model1)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse_bestpdrbtinggi <- mse_values
mean(mse_bestpdrbtinggi)
## [1] 93.31938
#PDRB RENDAH
repeats=10
mse_values <- numeric(repeats)
mse_bestvalues <- numeric(repeats)
for (rep in 1:10) {
# Pembagian data menggunakan k-fold CV
folds <- createFolds(data2$EXPENDITURE_GLMM, k = 10)
mse_fold <- numeric(10)
for (fold_idx in 1:10) {
train_idx <- setdiff(1:nrow(data2), folds[[fold_idx]])
train_data2 <- data2[train_idx, ]
test_data2 <- data2[folds[[fold_idx]], ]
# Fit GLMM model
model1 <- lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2[, c(7:28,33)])
predictions_model1 <- predict(model1, test_data2[, c(7:28,33)])
# Hitung MSE untuk fold ini
mse_fold[fold_idx] <- mean((test_data2$EXPENDITURE_GLMM - predictions_model1)^2)
}
# Simpan rata-rata MSE untuk pengulangan ini
mse_values[rep] <- mean(mse_fold)
}
mse_bestpdrbrendah <- mse_values
mean(mse_bestpdrbrendah)
## [1] 53.89002
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
data1gb<-data1[1:1265,]
data2gb<-data1[1266:2412,]
data3gb<-data1[2413:3525,]
data4gb<-data2[1:831,]
data5gb<-data2[832:1469,]
data6gb<-data2[1470:2036,]
# Fungsi untuk menghitung statistik
calculate_stats <- function(column) {
c(
Minimum = min(column, na.rm = TRUE),
Median = median(column, na.rm = TRUE),
Mean = mean(column, na.rm = TRUE), # Tambahan jika ingin rata-rata
Std_Dev = sd(column, na.rm = TRUE),
Maximum = max(column, na.rm = TRUE),
Skewness = skewness(column, na.rm = TRUE),
Kurtosis = kurtosis(column, na.rm = TRUE)
)}
datatf1_GRF<-data.frame(data1gb$EXPENDITURE,data1gb$EXPENDITURE_TRANSFORMED)
datatf1_RF<-data.frame(data1gb$EXPENDITURE,data1gb$EXPENDITURE_TRANSFORMED_RF)
datatf1_GLMM<-data.frame(data1gb$EXPENDITURE,data1gb$EXPENDITURE_GLMM)
datatf2_GRF<-data.frame(data2gb$EXPENDITURE,data2gb$EXPENDITURE_TRANSFORMED)
datatf2_RF<-data.frame(data2gb$EXPENDITURE,data2gb$EXPENDITURE_TRANSFORMED_RF)
datatf2_GLMM<-data.frame(data2gb$EXPENDITURE,data2gb$EXPENDITURE_GLMM)
datatf3_GRF<-data.frame(data3gb$EXPENDITURE,data3gb$EXPENDITURE_TRANSFORMED)
datatf3_RF<-data.frame(data3gb$EXPENDITURE,data3gb$EXPENDITURE_TRANSFORMED_RF)
datatf3_GLMM<-data.frame(data3gb$EXPENDITURE,data3gb$EXPENDITURE_GLMM)
datatf4_GRF<-data.frame(data4gb$EXPENDITURE,data4gb$EXPENDITURE_TRANSFORMED)
datatf4_RF<-data.frame(data4gb$EXPENDITURE,data4gb$EXPENDITURE_TRANSFORMED_RF)
datatf4_GLMM<-data.frame(data4gb$EXPENDITURE,data4gb$EXPENDITURE_GLMM)
datatf5_GRF<-data.frame(data5gb$EXPENDITURE,data5gb$EXPENDITURE_TRANSFORMED)
datatf5_RF<-data.frame(data5gb$EXPENDITURE,data5gb$EXPENDITURE_TRANSFORMED_RF)
datatf5_GLMM<-data.frame(data5gb$EXPENDITURE,data5gb$EXPENDITURE_GLMM)
datatf6_GRF<-data.frame(data6gb$EXPENDITURE,data6gb$EXPENDITURE_TRANSFORMED)
datatf6_RF<-data.frame(data6gb$EXPENDITURE,data6gb$EXPENDITURE_TRANSFORMED_RF)
datatf6_GLMM<-data.frame(data6gb$EXPENDITURE,data6gb$EXPENDITURE_GLMM)
stat_result1_GRF<-data.frame(sapply(datatf1_GRF,calculate_stats))
stat_result1_RF<-data.frame(sapply(datatf1_RF,calculate_stats))
stat_result1_GLMM<-data.frame(sapply(datatf1_GLMM,calculate_stats))
stat_result2_GRF<-data.frame(sapply(datatf2_GRF,calculate_stats))
stat_result2_RF<-data.frame(sapply(datatf2_RF,calculate_stats))
stat_result2_GLMM<-data.frame(sapply(datatf2_GLMM,calculate_stats))
stat_result3_GRF<-data.frame(sapply(datatf3_GRF,calculate_stats))
stat_result3_RF<-data.frame(sapply(datatf3_RF,calculate_stats))
stat_result3_GLMM<-data.frame(sapply(datatf3_GLMM,calculate_stats))
stat_result4_GRF<-data.frame(sapply(datatf4_GRF,calculate_stats))
stat_result4_RF<-data.frame(sapply(datatf4_RF,calculate_stats))
stat_result4_GLMM<-data.frame(sapply(datatf4_GLMM,calculate_stats))
stat_result5_GRF<-data.frame(sapply(datatf5_GRF,calculate_stats))
stat_result5_RF<-data.frame(sapply(datatf5_RF,calculate_stats))
stat_result5_GLMM<-data.frame(sapply(datatf5_GLMM,calculate_stats))
stat_result6_GRF<-data.frame(sapply(datatf6_GRF,calculate_stats))
stat_result6_RF<-data.frame(sapply(datatf6_RF,calculate_stats))
stat_result6_GLMM<-data.frame(sapply(datatf6_GLMM,calculate_stats))
stat_resultjoin1<-data.frame(stat_result1_GRF$data1gb.EXPENDITURE,stat_result1_GRF$data1gb.EXPENDITURE_TRANSFORMED,stat_result1_RF$data1gb.EXPENDITURE_TRANSFORMED_RF,stat_result1_GLMM$data1gb.EXPENDITURE_GLMM)
stat_resultjoin2<-data.frame(stat_result2_GRF$data2gb.EXPENDITURE,stat_result2_GRF$data2gb.EXPENDITURE_TRANSFORMED,stat_result2_RF$data2gb.EXPENDITURE_TRANSFORMED_RF,stat_result2_GLMM$data2gb.EXPENDITURE_GLMM)
stat_resultjoin3<-data.frame(stat_result3_GRF$data3gb.EXPENDITURE,stat_result3_GRF$data3gb.EXPENDITURE_TRANSFORMED,stat_result3_RF$data3gb.EXPENDITURE_TRANSFORMED_RF,stat_result3_GLMM$data3gb.EXPENDITURE_GLMM)
stat_resultjoin4<-data.frame(stat_result4_GRF$data4gb.EXPENDITURE,stat_result4_GRF$data4gb.EXPENDITURE_TRANSFORMED,stat_result4_RF$data4gb.EXPENDITURE_TRANSFORMED_RF,stat_result4_GLMM$data4gb.EXPENDITURE_GLMM)
stat_resultjoin5<-data.frame(stat_result5_GRF$data5gb.EXPENDITURE,stat_result5_GRF$data5gb.EXPENDITURE_TRANSFORMED,stat_result5_RF$data5gb.EXPENDITURE_TRANSFORMED_RF,stat_result5_GLMM$data5gb.EXPENDITURE_GLMM)
stat_resultjoin6<-data.frame(stat_result6_GRF$data6gb.EXPENDITURE,stat_result6_GRF$data6gb.EXPENDITURE_TRANSFORMED,stat_result6_RF$data6gb.EXPENDITURE_TRANSFORMED_RF,stat_result6_GLMM$data6gb.EXPENDITURE_GLMM)
stat_resultjoin1
## stat_result1_GRF.data1gb.EXPENDITURE
## 1 2.033080
## 2 9.915294
## 3 13.250850
## 4 11.714031
## 5 127.170833
## 6 3.437205
## 7 21.718980
## stat_result1_GRF.data1gb.EXPENDITURE_TRANSFORMED
## 1 2.033080
## 2 9.915294
## 3 12.872859
## 4 9.806830
## 5 80.332327
## 6 2.048479
## 7 8.726427
## stat_result1_RF.data1gb.EXPENDITURE_TRANSFORMED_RF
## 1 2.033080
## 2 9.915294
## 3 12.936108
## 4 10.076471
## 5 95.609554
## 6 2.276207
## 7 10.953636
## stat_result1_GLMM.data1gb.EXPENDITURE_GLMM
## 1 2.033080
## 2 9.915294
## 3 12.949171
## 4 10.187435
## 5 95.609554
## 6 2.388826
## 7 11.872993
stat_resultjoin2
## stat_result2_GRF.data2gb.EXPENDITURE
## 1 2.333482
## 2 13.589762
## 3 17.687328
## 4 13.010775
## 5 118.431964
## 6 2.506760
## 7 12.187261
## stat_result2_GRF.data2gb.EXPENDITURE_TRANSFORMED
## 1 2.333482
## 2 13.589762
## 3 17.267744
## 4 11.382046
## 5 77.994286
## 6 1.742178
## 7 6.627008
## stat_result2_RF.data2gb.EXPENDITURE_TRANSFORMED_RF
## 1 2.333482
## 2 13.589762
## 3 17.272545
## 4 11.403723
## 5 81.852997
## 6 1.769758
## 7 6.906319
## stat_result2_GLMM.data2gb.EXPENDITURE_GLMM
## 1 2.333482
## 2 13.589762
## 3 17.223017
## 4 11.207483
## 5 68.531786
## 6 1.644271
## 7 5.988828
stat_resultjoin3
## stat_result3_GRF.data3gb.EXPENDITURE
## 1 2.010924
## 2 17.454929
## 3 23.718273
## 4 21.234222
## 5 252.304795
## 6 3.064812
## 7 20.674489
## stat_result3_GRF.data3gb.EXPENDITURE_TRANSFORMED
## 1 2.010924
## 2 17.454929
## 3 22.082682
## 4 15.741748
## 5 83.459048
## 6 1.254065
## 7 4.241088
## stat_result3_RF.data3gb.EXPENDITURE_TRANSFORMED_RF
## 1 2.010924
## 2 17.526349
## 3 22.238781
## 4 15.899304
## 5 90.086726
## 6 1.263215
## 7 4.349719
## stat_result3_GLMM.data3gb.EXPENDITURE_GLMM
## 1 2.010924
## 2 17.454929
## 3 22.110035
## 4 15.804007
## 5 90.086726
## 6 1.266653
## 7 4.307707
stat_resultjoin4
## stat_result4_GRF.data4gb.EXPENDITURE
## 1 3.293917
## 2 9.321898
## 3 12.248204
## 4 9.291516
## 5 97.110116
## 6 3.088072
## 7 18.138368
## stat_result4_GRF.data4gb.EXPENDITURE_TRANSFORMED
## 1 3.293917
## 2 9.321898
## 3 11.934295
## 4 7.931822
## 5 52.241844
## 6 1.995409
## 7 7.840135
## stat_result4_RF.data4gb.EXPENDITURE_TRANSFORMED_RF
## 1 3.293917
## 2 9.321898
## 3 11.893546
## 4 7.796691
## 5 52.241844
## 6 1.935012
## 7 7.569893
## stat_result4_GLMM.data4gb.EXPENDITURE_GLMM
## 1 3.293917
## 2 9.321898
## 3 11.945842
## 4 7.964193
## 5 52.241844
## 6 1.999369
## 7 7.805910
stat_resultjoin5
## stat_result5_GRF.data5gb.EXPENDITURE
## 1 2.316531
## 2 11.478651
## 3 16.425259
## 4 15.712320
## 5 122.983887
## 6 2.843729
## 7 14.405258
## stat_result5_GRF.data5gb.EXPENDITURE_TRANSFORMED
## 1 2.316531
## 2 11.478651
## 3 15.020277
## 4 11.143841
## 5 60.898952
## 6 1.346037
## 7 4.600952
## stat_result5_RF.data5gb.EXPENDITURE_TRANSFORMED_RF
## 1 2.316531
## 2 11.478651
## 3 15.097174
## 4 11.324277
## 5 61.602168
## 6 1.388342
## 7 4.771043
## stat_result5_GLMM.data5gb.EXPENDITURE_GLMM
## 1 2.316531
## 2 11.478651
## 3 15.134692
## 4 11.426751
## 5 61.602168
## 6 1.415885
## 7 4.869295
stat_resultjoin6
## stat_result6_GRF.data6gb.EXPENDITURE
## 1 2.091506
## 2 8.728882
## 3 11.290794
## 4 9.129295
## 5 105.056933
## 6 4.203416
## 7 31.412279
## stat_result6_GRF.data6gb.EXPENDITURE_TRANSFORMED
## 1 2.091506
## 2 8.728882
## 3 10.908078
## 4 7.073489
## 5 50.663533
## 6 2.027484
## 7 8.015274
## stat_result6_RF.data6gb.EXPENDITURE_TRANSFORMED_RF
## 1 2.091506
## 2 8.758651
## 3 10.899539
## 4 6.927329
## 5 45.059298
## 6 1.853023
## 7 6.804956
## stat_result6_GLMM.data6gb.EXPENDITURE_GLMM
## 1 2.091506
## 2 8.728882
## 3 10.908078
## 4 7.073489
## 5 50.663533
## 6 2.027484
## 7 8.015274
# BOGOR
train1_indices <- sample(1:nrow(data1gb), 0.7 * nrow(data1gb))
train1 <- data1gb[train1_indices, ]
test1 <- data1gb[-train1_indices, ]
# BEKASI
train2_indices <- sample(1:nrow(data2gb), 0.7 * nrow(data2gb))
train2 <- data2gb[train2_indices, ]
test2 <- data2gb[-train2_indices, ]
# BANDUNG
train3_indices <- sample(1:nrow(data3gb), 0.7 * nrow(data3gb))
train3 <- data3gb[train3_indices, ]
test3 <- data3gb[-train3_indices, ]
# KUNINGAN
train4_indices <- sample(1:nrow(data4gb), 0.7 * nrow(data4gb))
train4 <- data4gb[train4_indices, ]
test4 <- data4gb[-train4_indices, ]
# SUKABUMI
train5_indices <- sample(1:nrow(data5gb), 0.7 * nrow(data5gb))
train5 <- data5gb[train5_indices, ]
test5 <- data5gb[-train5_indices, ]
# BANJAR
train6_indices <- sample(1:nrow(data6gb), 0.7 * nrow(data6gb))
train6 <- data6gb[train6_indices, ]
test6 <- data6gb[-train6_indices, ]
#UJI ANOVA
# Load library
library(dplyr)
library(ggplot2)
# Import data
data_anova <- read.csv("D:/Download/data_anova.csv")
# Lakukan ANOVA
anova_model <- aov(MSE ~ PDRB * METODE, data = data_anova)
# Ringkasan hasil ANOVA
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## PDRB 1 20455 20455 324937 <2e-16 ***
## METODE 2 1345 672 10681 <2e-16 ***
## PDRB:METODE 2 55 28 438 <2e-16 ***
## Residuals 54 3 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD post-hoc test
tukey_result <- TukeyHSD(anova_model)
print(tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MSE ~ PDRB * METODE, data = data_anova)
##
## $PDRB
## diff lwr upr p adj
## PDRB Tinggi-PDRB Rendah 36.92745 36.79757 37.05732 0
##
## $METODE
## diff lwr upr p adj
## GRF-GLMM -8.766441 -8.957651 -8.575232 0
## RF-GLMM -10.957250 -11.148459 -10.766041 0
## RF-GRF -2.190808 -2.382018 -1.999599 0
##
## $`PDRB:METODE`
## diff lwr upr p adj
## PDRB Tinggi:GLMM-PDRB Rendah:GLMM 39.621183 39.289677 39.952689 0
## PDRB Rendah:GRF-PDRB Rendah:GLMM -6.612172 -6.943678 -6.280666 0
## PDRB Tinggi:GRF-PDRB Rendah:GLMM 28.700472 28.368966 29.031978 0
## PDRB Rendah:RF-PDRB Rendah:GLMM -9.070913 -9.402419 -8.739407 0
## PDRB Tinggi:RF-PDRB Rendah:GLMM 26.777596 26.446090 27.109102 0
## PDRB Rendah:GRF-PDRB Tinggi:GLMM -46.233355 -46.564861 -45.901849 0
## PDRB Tinggi:GRF-PDRB Tinggi:GLMM -10.920711 -11.252217 -10.589205 0
## PDRB Rendah:RF-PDRB Tinggi:GLMM -48.692096 -49.023602 -48.360590 0
## PDRB Tinggi:RF-PDRB Tinggi:GLMM -12.843587 -13.175093 -12.512081 0
## PDRB Tinggi:GRF-PDRB Rendah:GRF 35.312644 34.981138 35.644150 0
## PDRB Rendah:RF-PDRB Rendah:GRF -2.458741 -2.790247 -2.127235 0
## PDRB Tinggi:RF-PDRB Rendah:GRF 33.389768 33.058262 33.721274 0
## PDRB Rendah:RF-PDRB Tinggi:GRF -37.771385 -38.102891 -37.439879 0
## PDRB Tinggi:RF-PDRB Tinggi:GRF -1.922876 -2.254382 -1.591370 0
## PDRB Tinggi:RF-PDRB Rendah:RF 35.848509 35.517003 36.180015 0
# Visualisasi data
ggplot(data_anova, aes(x = METODE, y = MSE, fill = PDRB)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Distribusi MSE Berdasarkan PDRB dan Metode",
x = "Metode",
y = "Mean Squared Error (MSE)") +
theme(legend.position = "top")
#ANOVA INTERAKSI
# Membuat data frame berdasarkan data yang diberikan
data_anova <- data.frame(
PDRB = c(rep("Tinggi", 30), rep("Rendah", 30)),
METODE = rep(c(rep("GLMM", 10), rep("GRF", 10), rep("RF", 10)), 2),
MSE = c(
# PDRB Tinggi - GLMM
93.38905, 93.61902, 93.26212, 93.49928, 93.58057, 93.42714, 93.46934, 93.40885, 93.25724, 93.71052,
# PDRB Tinggi - GRF
82.14571, 82.65263, 82.54502, 82.35254, 82.4862, 82.46038, 82.65377, 82.55405, 82.47269, 83.09303,
# PDRB Tinggi - RF
80.38876, 80.5737, 80.56846, 80.58543, 80.17147, 80.19943, 80.9563, 81.04018, 81.29119, 80.41234,
# PDRB Rendah - GLMM
53.81285, 54.09778, 53.68323, 53.7038, 54.07479, 53.91415, 53.83406, 53.83886, 53.77484, 53.67694,
# PDRB Rendah - GRF
47.02052, 47.31874, 47.20572, 47.23893, 47.43792, 47.01509, 47.08057, 47.47576, 47.38874, 47.10759,
# PDRB Rendah - RF
44.51813, 44.98956, 44.98175, 44.60713, 45.12446, 45.24781, 44.20668, 44.71421, 44.43322, 44.87922
)
)
# ANOVA
anova_result <- aov(MSE ~ PDRB * METODE, data = data_anova)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## PDRB 1 20455 20455 324937 <2e-16 ***
## METODE 2 1345 672 10681 <2e-16 ***
## PDRB:METODE 2 55 28 438 <2e-16 ***
## Residuals 54 3 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Subset data untuk PDRB Rendah
data_rendah <- subset(data_anova, PDRB == "Rendah")
# ANOVA untuk PDRB Rendah
anova_rendah <- aov(MSE ~ METODE, data = data_rendah)
summary(anova_rendah)
## Df Sum Sq Mean Sq F value Pr(>F)
## METODE 2 440.2 220.08 4100 <2e-16 ***
## Residuals 27 1.4 0.05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Uji Tukey
tukey_rendah <- TukeyHSD(anova_rendah)
print(tukey_rendah)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MSE ~ METODE, data = data_rendah)
##
## $METODE
## diff lwr upr p adj
## GRF-GLMM -6.612172 -6.869069 -6.355275 0
## RF-GLMM -9.070913 -9.327810 -8.814016 0
## RF-GRF -2.458741 -2.715638 -2.201844 0
# Filter untuk kelompok PDRB tinggi
data_tinggi <- subset(data_anova, PDRB == "Tinggi")
# Analisis ANOVA untuk kelompok PDRB tinggi
anova_tinggi <- aov(MSE ~ METODE, data = data_tinggi)
summary(anova_tinggi)
## Df Sum Sq Mean Sq F value Pr(>F)
## METODE 2 959.7 479.9 6644 <2e-16 ***
## Residuals 27 1.9 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Uji Tukey untuk kelompok PDRB tinggi
tukey_tinggi <- TukeyHSD(anova_tinggi)
print(tukey_tinggi)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MSE ~ METODE, data = data_tinggi)
##
## $METODE
## diff lwr upr p adj
## GRF-GLMM -10.920711 -11.218698 -10.622724 0
## RF-GLMM -12.843587 -13.141574 -12.545600 0
## RF-GRF -1.922876 -2.220863 -1.624889 0
# Pilih 10 data acak dari setiap data test
test1_sample <- test1[sample(nrow(test1), 10), ]
test2_sample <- test2[sample(nrow(test2), 10), ]
test3_sample <- test3[sample(nrow(test3), 10), ]
test4_sample <- test4[sample(nrow(test4), 10), ]
test5_sample <- test5[sample(nrow(test5), 10), ]
test6_sample <- test6[sample(nrow(test6), 10), ]
#PREDIKSI GRF
prediksi1_grf<-predict(regression_forest(X = train_data1[,c(8:28)],
Y = train_data1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test1_sample[,c(8:28)])$prediction
prediksi2_grf<-predict(regression_forest(X = train_data1[,c(8:28)],
Y = train_data1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test2_sample[,c(8:28)])$prediction
prediksi3_grf<-predict(regression_forest(X = train_data1[,c(8:28)],
Y = train_data1$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test3_sample[,c(8:28)])$prediction
prediksi4_grf<-predict(regression_forest(X = train_data2[,c(8:28)],
Y = train_data2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test4_sample[,c(8:28)])$prediction
prediksi5_grf<-predict(regression_forest(X = train_data2[,c(8:28)],
Y = train_data2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test5_sample[,c(8:28)])$prediction
prediksi6_grf<-predict(regression_forest(X = train_data2[,c(8:28)],
Y = train_data2$EXPENDITURE_TRANSFORMED,num.trees = 500, mtry = 9), newdata = test6_sample[,c(8:28)])$prediction
#PREDIKSI RF
prediksi1_rf<-predict(randomForest(x = train_data1[,c(8:28)],
y = train_data1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test1_sample[,c(8:28)])
prediksi2_rf<-predict(randomForest(x = train_data1[,c(8:28)],
y = train_data1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test2_sample[,c(8:28)])
prediksi3_rf<-predict(randomForest(x = train_data1[,c(8:28)],
y = train_data1$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test3_sample[,c(8:28)])
prediksi4_rf<-predict(randomForest(x = train_data2[,c(8:28)],
y = train_data2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test4_sample[,c(8:28)])
prediksi5_rf<-predict(randomForest(x = train_data2[,c(8:28)],
y = train_data2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test5_sample[,c(8:28)])
prediksi6_rf<-predict(randomForest(x = train_data2[,c(8:28)],
y = train_data2$EXPENDITURE_TRANSFORMED_RF,ntree = 500, mtry = 5), newdata = test6_sample[,c(8:28)])
#PREDIKSI GLMM
prediksi1_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1[, c(7:28,33)]),test1_sample[,c(7:28)])
prediksi2_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1[, c(7:28,33)]),test2_sample[,c(7:28)])
prediksi3_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data1[, c(7:28,33)]),test3_sample[,c(7:28)])
prediksi4_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2[, c(7:28,33)]),test4_sample[,c(7:28)])
prediksi5_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2[, c(7:28,33)]),test5_sample[,c(7:28)])
prediksi6_glmm<-predict(lmer(EXPENDITURE_GLMM ~ . - REGION + (1|REGION), data = train_data2[, c(7:28,33)]),test6_sample[,c(7:28)])
# Gabungkan hasil prediksi dan data aktual
region_results <- data.frame(
PDRB = c(rep("PDRB TINGGI",30),rep("PDRB RENDAH",30)),
KabupatenKota = c(rep("Kabupaten Bogor",10),rep("Kabupaten Bekasi",10),rep("Kota Bandung",10),rep("Kabupaten Kuningan",10),rep("Kota Sukabumi",10),rep("Kota Banjar",10)),
Aktual = c(test1_sample$EXPENDITURE,test2_sample$EXPENDITURE,test3_sample$EXPENDITURE,test4_sample$EXPENDITURE,test5_sample$EXPENDITURE,test6_sample$EXPENDITURE),
Prediksi_GRF = c(prediksi1_grf,prediksi2_grf,prediksi3_grf,prediksi4_grf,prediksi5_grf,prediksi6_grf),
Prediksi_RF = c(prediksi1_rf,prediksi2_rf,prediksi3_rf,prediksi4_rf,prediksi5_rf,prediksi6_rf),
Prediksi_GLMM = c(prediksi1_glmm,prediksi2_glmm,prediksi3_glmm,prediksi4_glmm,prediksi5_glmm,prediksi6_glmm)
)
region_results
## PDRB KabupatenKota Aktual Prediksi_GRF Prediksi_RF
## 1 PDRB TINGGI Kabupaten Bogor 12.501071 10.778975 11.284608
## 2 PDRB TINGGI Kabupaten Bogor 17.106375 13.145658 14.566082
## 3 PDRB TINGGI Kabupaten Bogor 10.731459 9.787102 8.970344
## 4 PDRB TINGGI Kabupaten Bogor 9.200446 8.791372 8.196569
## 5 PDRB TINGGI Kabupaten Bogor 3.831905 10.500897 7.496021
## 6 PDRB TINGGI Kabupaten Bogor 10.029226 10.856241 10.131780
## 7 PDRB TINGGI Kabupaten Bogor 30.499405 20.512744 23.055110
## 8 PDRB TINGGI Kabupaten Bogor 11.396250 7.820770 6.554726
## 9 PDRB TINGGI Kabupaten Bogor 13.295595 13.459587 13.423461
## 10 PDRB TINGGI Kabupaten Bogor 3.430635 6.962394 5.190190
## 11 PDRB TINGGI Kabupaten Bekasi 10.870286 14.298423 11.885966
## 12 PDRB TINGGI Kabupaten Bekasi 7.182524 11.415484 9.607944
## 13 PDRB TINGGI Kabupaten Bekasi 8.472659 16.418329 14.200485
## 14 PDRB TINGGI Kabupaten Bekasi 7.747202 9.821248 8.623950
## 15 PDRB TINGGI Kabupaten Bekasi 9.226131 11.981859 14.619692
## 16 PDRB TINGGI Kabupaten Bekasi 17.165119 10.824364 13.112379
## 17 PDRB TINGGI Kabupaten Bekasi 15.994286 11.645147 12.850962
## 18 PDRB TINGGI Kabupaten Bekasi 4.553452 15.425967 11.128871
## 19 PDRB TINGGI Kabupaten Bekasi 27.339643 35.346662 32.909836
## 20 PDRB TINGGI Kabupaten Bekasi 16.875298 13.056681 15.420353
## 21 PDRB TINGGI Kota Bandung 5.667223 12.347925 9.036824
## 22 PDRB TINGGI Kota Bandung 26.143905 31.145097 32.054990
## 23 PDRB TINGGI Kota Bandung 12.414000 11.793861 12.000366
## 24 PDRB TINGGI Kota Bandung 18.752381 13.854910 16.189541
## 25 PDRB TINGGI Kota Bandung 52.057768 27.987115 35.533776
## 26 PDRB TINGGI Kota Bandung 100.566532 30.473205 33.417497
## 27 PDRB TINGGI Kota Bandung 21.891429 25.727784 24.238501
## 28 PDRB TINGGI Kota Bandung 30.480595 39.212108 39.499944
## 29 PDRB TINGGI Kota Bandung 8.748651 12.693336 12.128177
## 30 PDRB TINGGI Kota Bandung 53.782386 36.937251 45.216852
## 31 PDRB RENDAH Kabupaten Kuningan 7.301118 9.011768 8.208168
## 32 PDRB RENDAH Kabupaten Kuningan 6.987708 8.390512 8.806935
## 33 PDRB RENDAH Kabupaten Kuningan 12.283437 9.525051 10.628532
## 34 PDRB RENDAH Kabupaten Kuningan 6.095655 9.310449 8.661018
## 35 PDRB RENDAH Kabupaten Kuningan 4.798717 8.181103 6.824414
## 36 PDRB RENDAH Kabupaten Kuningan 6.200595 15.010269 12.185174
## 37 PDRB RENDAH Kabupaten Kuningan 8.014107 14.084846 12.666240
## 38 PDRB RENDAH Kabupaten Kuningan 12.759702 9.079341 9.008184
## 39 PDRB RENDAH Kabupaten Kuningan 20.528492 9.906819 12.495613
## 40 PDRB RENDAH Kabupaten Kuningan 10.353633 9.577055 10.083209
## 41 PDRB RENDAH Kota Sukabumi 50.960119 26.558425 37.097515
## 42 PDRB RENDAH Kota Sukabumi 9.631012 9.598113 9.758225
## 43 PDRB RENDAH Kota Sukabumi 3.528087 8.956434 6.682388
## 44 PDRB RENDAH Kota Sukabumi 9.079307 10.336427 9.422969
## 45 PDRB RENDAH Kota Sukabumi 32.956042 30.537479 33.537614
## 46 PDRB RENDAH Kota Sukabumi 6.405748 7.699734 6.384283
## 47 PDRB RENDAH Kota Sukabumi 23.347270 8.896924 14.512429
## 48 PDRB RENDAH Kota Sukabumi 6.763863 7.870526 7.397526
## 49 PDRB RENDAH Kota Sukabumi 4.001238 7.858054 5.895418
## 50 PDRB RENDAH Kota Sukabumi 7.858839 9.780995 9.825129
## 51 PDRB RENDAH Kota Banjar 3.892679 8.458266 6.575398
## 52 PDRB RENDAH Kota Banjar 4.944540 8.379326 7.647826
## 53 PDRB RENDAH Kota Banjar 5.096738 12.611554 9.269108
## 54 PDRB RENDAH Kota Banjar 31.930536 18.970397 25.614730
## 55 PDRB RENDAH Kota Banjar 6.787619 8.587227 7.649477
## 56 PDRB RENDAH Kota Banjar 6.169800 9.615779 7.760899
## 57 PDRB RENDAH Kota Banjar 14.204924 22.481574 21.348214
## 58 PDRB RENDAH Kota Banjar 34.614940 21.987906 23.683367
## 59 PDRB RENDAH Kota Banjar 20.330496 21.419065 20.365538
## 60 PDRB RENDAH Kota Banjar 9.614246 10.149164 11.782768
## Prediksi_GLMM
## 1 12.1340530
## 2 10.1874459
## 3 -0.8318712
## 4 6.5274716
## 5 11.6902093
## 6 7.2191546
## 7 18.6083784
## 8 5.7565196
## 9 12.3576602
## 10 2.2423002
## 11 19.5181214
## 12 10.6300943
## 13 20.6241903
## 14 10.0242776
## 15 16.3111925
## 16 9.7667986
## 17 10.2856004
## 18 21.0501171
## 19 24.5526742
## 20 16.4476702
## 21 14.1282739
## 22 25.9484938
## 23 7.9867442
## 24 7.9328760
## 25 27.5320905
## 26 27.6778884
## 27 25.0301523
## 28 30.9369436
## 29 11.9170967
## 30 34.9066155
## 31 10.5660903
## 32 4.2314220
## 33 11.8571302
## 34 10.1715539
## 35 9.7708017
## 36 15.1080927
## 37 16.6764552
## 38 8.2877625
## 39 9.1942093
## 40 10.5264990
## 41 21.2908190
## 42 10.4813651
## 43 7.1160977
## 44 6.9183997
## 45 24.5285213
## 46 6.8588499
## 47 4.8736837
## 48 7.2930623
## 49 8.6711066
## 50 8.1351052
## 51 5.3463632
## 52 6.6495345
## 53 14.7589172
## 54 23.3525389
## 55 8.4884041
## 56 11.6434171
## 57 20.8888274
## 58 17.6909857
## 59 18.8988670
## 60 10.3753906