Installing and loading the required libraries
library(readxl)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(yardstick)
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
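If any of these packages are not yet installed, they can be installed first. A minimal sketch (the package names are taken from the library() calls above):
# Install only the required packages that are missing from the current library
pkgs <- c("readxl", "caret", "randomForest", "ggplot2", "yardstick", "dplyr")
missing_pkgs <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)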
Reading the Training and Testing sheets from the Excel file
data_training <- read_excel("C:/Wahyu/Kuliah/Semester 3/Statistika Ekonomi dan Industri/UTS/Level Risiko Investasi.xlsx", "Training")
data_testing <- read_excel("C:/Wahyu/Kuliah/Semester 3/Statistika Ekonomi dan Industri/UTS/Level Risiko Investasi.xlsx", sheet = "Testing")
Displaying the detailed structure of each dataset
str(data_training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
str(data_testing)
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: chr [1:17] "SE" "SG" "SI" "SK" ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Converting the Country column in the training dataset to a factor and displaying the updated dataset
data_training$Country <- as.factor(data_training$Country)
str(data_training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : Factor w/ 99 levels "AD","AE","AE-AZ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
Filling missing values in the training dataset with the column mean and converting Risk Level to a factor
data_training$X1[is.na(data_training$X1)] <- mean(data_training$X1, na.rm = TRUE)
data_training$X8[is.na(data_training$X8)] <- mean(data_training$X8, na.rm = TRUE)
data_training$X11[is.na(data_training$X11)] <- mean(data_training$X11, na.rm = TRUE)
data_training$X14[is.na(data_training$X14)] <- mean(data_training$X14, na.rm = TRUE)
data_training$`Risk Level` <- as.factor(data_training$`Risk Level`)
str(data_training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : Factor w/ 99 levels "AD","AE","AE-AZ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X1 : num [1:100] 17.5 18.2 18.7 19 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 5.53 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 8.44 8.44 18.5 ...
## $ Risk Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
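The same imputation can be written more compactly with dplyr (loaded above). A sketch, under the assumption that every numeric predictor should be mean-imputed; data_training_imputed is a hypothetical name so the column-by-column result above is left untouched:
# Replace NA in every numeric column with that column's mean
data_training_imputed <- data_training %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))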
Setting the training set size and splitting the dataset into training and testing subsets
# Set the training set size (80%)
set.seed(123) # For reproducible results
index <- sample(1:nrow(data_training), size = 0.8 * nrow(data_training))
# Split the data into training and testing sets
data_baru_training <- data_training[index, ] # 80% training data
data_baru_testing <- data_training[-index, ] # 20% testing data
# Check the dataset sizes
cat("Number of training rows: ", nrow(data_baru_training), "\n")
## Number of training rows: 80
cat("Number of testing rows: ", nrow(data_baru_testing), "\n")
## Number of testing rows: 20
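Because sample() draws rows without regard to the class labels, the high/low proportions can drift between the two subsets. A stratified alternative with caret's createDataPartition (caret is loaded above) is sketched here; index_strat, train_strat, and test_strat are hypothetical names:
# Stratified 80/20 split that keeps the Risk Level proportions similar in both subsets
set.seed(123)
index_strat <- as.vector(createDataPartition(data_training$`Risk Level`, p = 0.8, list = FALSE))
train_strat <- data_training[index_strat, ]
test_strat <- data_training[-index_strat, ]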
Building an initial Random Forest model on data_baru_training and displaying the model summary
# Train the Random Forest model on data_baru_training
rf_model <- randomForest(`Risk Level` ~ . - Country,
data = data_baru_training,
ntree = 100)
# Display the model summary
print(rf_model)
##
## Call:
## randomForest(formula = `Risk Level` ~ . - Country, data = data_baru_training, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.75%
## Confusion matrix:
## high low class.error
## high 37 6 0.1395349
## low 5 32 0.1351351
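The summary above shows mtry = 3, the default (roughly the square root of the 14 predictors). If desired, mtry can be tuned by OOB error, for instance with randomForest::tuneRF. A sketch, assuming the predictors are the columns X1 to X14 shown in str() above; x_train and y_train are hypothetical names:
# Tune mtry on the predictors only (Country and Risk Level excluded)
x_train <- as.data.frame(data_baru_training[, paste0("X", 1:14)])
y_train <- data_baru_training$`Risk Level`
set.seed(123)
tuneRF(x_train, y_train, ntreeTry = 100, stepFactor = 1.5, improve = 0.01)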
Predicting on the testing set and displaying a few of the predictions
# Predict on the testing data
prediksi_rf <- predict(rf_model, newdata = data_baru_testing)
# Display a few predictions
head(prediksi_rf)
## 1 2 3 4 5 6
## low low low high high high
## Levels: high low
Making sure the Risk Level column in the testing dataset is a factor
# Make sure the Risk Level column in the testing data is a factor
data_baru_testing$`Risk Level` <- as.factor(data_baru_testing$`Risk Level`)
# Make sure the predictions are also factors with the same levels
prediksi_rf <- factor(prediksi_rf, levels = levels(data_baru_testing$`Risk Level`))
Converting Risk Level to a factor and checking its length
# Convert Risk Level to a factor
data <- as.factor(data_baru_testing$`Risk Level`)
# Check the length of the factor
length(data)
## [1] 20
data <- as.factor(data_baru_testing$`Risk Level`)
dim(data_baru_testing$`Risk Level`)
## NULL
Building a Random Forest model (mtry = 2) and generating predictions
model_rf <- randomForest(`Risk Level` ~ . - Country, data = data_baru_training, ntree = 100, mtry = 2)
prediksi_rf <- predict(model_rf, newdata = data_baru_testing)
print(prediksi_rf)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## low low low high high high high high high high low low high low high low
## 17 18 19 20
## low high high high
## Levels: high low
table(data_baru_testing$`Risk Level`)
##
## high low
## 11 9
table(prediksi_rf)
## prediksi_rf
## high low
## 12 8
Computing the confusion matrix, displaying it, and extracting the accuracy from it
# Compute the confusion matrix
conf_matrix <- confusionMatrix(prediksi_rf, data_baru_testing$`Risk Level`)
# Display the confusion matrix and performance metrics
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 10 2
## low 1 7
##
## Accuracy : 0.85
## 95% CI : (0.6211, 0.9679)
## No Information Rate : 0.55
## P-Value [Acc > NIR] : 0.004933
##
## Kappa : 0.6939
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.9091
## Specificity : 0.7778
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8750
## Prevalence : 0.5500
## Detection Rate : 0.5000
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8434
##
## 'Positive' Class : high
##
# Extract the accuracy from the confusion matrix
akurasi <- conf_matrix$overall['Accuracy']
cat("Random Forest model accuracy: ", akurasi, "\n")
## Random Forest model accuracy: 0.85
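Since yardstick is also loaded, the same metrics can be cross-checked with it. A sketch; hasil_eval is a hypothetical data frame holding the true and predicted labels (with "high" as the first factor level, the event class matches caret's positive class above):
# Cross-check accuracy, sensitivity, and specificity with yardstick
hasil_eval <- data.frame(truth = data_baru_testing$`Risk Level`,
                         estimate = prediksi_rf)
yardstick::accuracy(hasil_eval, truth = truth, estimate = estimate)
yardstick::sens(hasil_eval, truth = truth, estimate = estimate)
yardstick::spec(hasil_eval, truth = truth, estimate = estimate)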
Displaying the prediction results and accuracy
if (length(unique(prediksi_rf)) < 2) {
cat("The model did not predict both classes. Check the model or the data.\n")
} else {
conf_matrix <- confusionMatrix(prediksi_rf, data_baru_testing$`Risk Level`)
print(conf_matrix)
akurasi <- conf_matrix$overall['Accuracy']
cat("Random Forest model accuracy: ", akurasi, "\n")
}
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 10 2
## low 1 7
##
## Accuracy : 0.85
## 95% CI : (0.6211, 0.9679)
## No Information Rate : 0.55
## P-Value [Acc > NIR] : 0.004933
##
## Kappa : 0.6939
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.9091
## Specificity : 0.7778
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8750
## Prevalence : 0.5500
## Detection Rate : 0.5000
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8434
##
## 'Positive' Class : high
##
## Random Forest model accuracy: 0.85
Visualizing the Random Forest variable importance with ggplot2
importance <- importance(rf_model) # Extract the variable importance values
varImpPlot(rf_model) # Plot variable importance
# Display with ggplot2
importance_df <- as.data.frame(importance)
importance_df$Variable <- rownames(importance_df)
ggplot(importance_df, aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance in Random Forest Model", x = "Variables", y = "Mean Decrease in Gini")
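Note that the importance plot above uses rf_model, while the predictions evaluated earlier came from model_rf (mtry = 2). The same plot can be produced for that model; a sketch, with imp_df2 as a hypothetical name:
# Importance for model_rf; the namespace avoids the `importance` object created above
imp_df2 <- as.data.frame(randomForest::importance(model_rf))
imp_df2$Variable <- rownames(imp_df2)
ggplot(imp_df2, aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_col() +
coord_flip() +
labs(title = "Variable Importance (model_rf, mtry = 2)", x = "Variables", y = "Mean Decrease in Gini")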
Visualizing the model accuracy
# Extract the accuracy from the confusion matrix
akurasi <- conf_matrix$overall['Accuracy']
# Data frame for plotting the accuracy
akurasi_df <- data.frame(Model = "Random Forest", Accuracy = akurasi)
# Plot the model accuracy
ggplot(akurasi_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Accuracy, 2)), vjust = -0.3, size = 5) +
ylim(0, 1) +
labs(title = "Model Accuracy", y = "Accuracy", x = "") +
theme_minimal()
Plotting predicted values vs. actual values
# Visualize predictions vs. actual values
ggplot(data_baru_testing, aes(x = `Risk Level`, y = prediksi_rf)) +
geom_jitter(color = "blue", alpha = 0.5) +
labs(title = "Predicted vs. Actual Values", x = "Actual", y = "Predicted") +
theme_minimal()
Visualizing the confusion matrix as a heatmap
# Build the confusion matrix as before
conf_matrix <- confusionMatrix(prediksi_rf, data_baru_testing$`Risk Level`)
# Convert the confusion matrix to a data frame for visualization
cm_df <- as.data.frame(conf_matrix$table)
# Plot the confusion matrix as a heatmap
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white") +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "Confusion Matrix Heatmap", x = "Actual", y = "Predicted")
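The same heatmap can also show proportions instead of raw counts, which makes cells easier to compare when the class sizes differ. A sketch (scales is installed as a ggplot2 dependency):
# Add a proportion column: each cell divided by the number of test observations
cm_df$Prop <- cm_df$Freq / sum(cm_df$Freq)
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Prop)) +
geom_tile() +
geom_text(aes(label = scales::percent(Prop, accuracy = 1)), color = "white") +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "Confusion Matrix Heatmap (Proportions)", x = "Actual", y = "Predicted")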
Displaying the testing dataset after cleaning
print(data_baru_testing)
## # A tibble: 20 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AD 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86
## 2 AE 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 353.
## 3 AW 33.5 24643. 92.8 1.22 0.797 2.06 -4.72 80.5 28.1 2.38
## 4 AZ 25.3 5083. 43.4 6.85 1.05 0.391 -1.74 111. -174. 42.6
## 5 CL 14.3 15986. 65.8 2.98 1.25 1.97 -0.892 117. 14.0 253.
## 6 CR 13.3 11955. 45.0 1.35 0.996 3.25 0.703 125. 1.75 61.5
## 7 EC 13.4 5830. 49.7 1.23 1.71 0.508 -2.77 99.7 8.91 98.8
## 8 EG 20.1 3756. 31.9 16.2 2.05 4.45 2.24 53.3 11.9 362.
## 9 GE 17.6 4423. 104. 3.94 -0.0317 4.12 2.31 145. 62.4 15.9
## 10 GH 15 2354. 59.6 12.9 2.24 5.29 2.69 50.1 47.7 67.5
## 11 HK 20.7 50215. 446. 2.44 0.851 1.99 -0.566 68.8 -283. 349.
## 12 IE 25.5 91715. 815. 0.323 1.20 10.1 4.53 84.4 -345. 418.
## 13 IQ 19.0 4271. 37.6 0.442 2.49 3.8 -1.35 99.9 1.49 165.
## 14 IS 24.8 66459. 109. 0.419 2.04 4.64 0.312 161. 28.7 21.7
## 15 JO 17.9 4433. 70.5 1.38 1.94 2.03 -0.688 93.9 10.6 43.7
## 16 LT 21.8 22636. 78.1 1.70 -0.886 3.42 3.73 68.2 16.9 55.8
## 17 MO 14.5 52074. 195. 2.78 1.21 -1.67 -9.85 81.9 -213. 24.3
## 18 NG 15.4 2150. 24.8 12.9 2.62 1.19 -2.31 64.2 -2.27 401.
## 19 PE 15.6 6528. 35.4 2.70 1.05 3.17 -0.748 117. -20.7 205.
## 20 RW 23.3 826. 53.0 4.21 2.64 7.37 2.29 112. 36.6 10.3
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # `Risk Level` <fct>
Displaying the training dataset after cleaning
print(data_baru_training)
## # A tibble: 80 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CZ 21.4 27045. 79.2 1.58 0.207 3.72 1.32 76.5 -16.6 244.
## 2 MZ 26 434. 356. 9.04 2.94 3.93 -0.428 49.3 283. 14.4
## 3 ID 23.9 4223. 35.4 3.94 1.15 5.04 2.50 96.6 9.74 1062.
## 4 BG 22.7 11289. 70.3 0.779 -0.710 3.62 2.70 73.0 -13.0 69.1
## 5 LS 11 2638. 85.5 1.81 1.55 6.58 3.67 99.9 72.9 19.1
## 6 GA 19.0 7804. 39.1 2.80 2.70 2.25 -1.59 72.3 29.3 15.1
## 7 HU 18.3 18224. 115. 1.85 -0.242 4.08 2.54 79.0 11.7 155.
## 8 GB 21.6 46724. 406. 1.53 0.609 1.70 -1.35 91.3 31.4 2708.
## 9 RU 12.7 10274. 33.4 6.72 0.107 0.977 0.674 123. -30.7 1471.
## 10 CM 9.1 1691. 39.7 1.54 2.64 4.35 0.718 87.7 26.0 40.3
## # ℹ 70 more rows
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # `Risk Level` <fct>
Insights obtained from the results above: - Missing values were filled with the column mean during data cleaning. - The model reached a high accuracy of 0.85 on the held-out test data. - Risk-level predictions were obtained for the test observations. - The model accuracy, predicted vs. actual values, confusion matrix, and Random Forest variable importance were visualized.