#===================================Library===============================================#
packages <- c("nnet", "car", "ggplot2", "dplyr", "factoextra", "psych")
lapply(packages, function(x) if(!require(x, character.only=TRUE)) install.packages(x))
## Loading required package: nnet
## Warning: package 'nnet' was built under R version 4.4.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
#============================== 1. prepro ================================#
#---Read Csv
data <- read.csv('fetal_health.csv', header = TRUE)
#---mising
missing_data <- sapply(data, function(x) sum(is.na(x)))
print("Missing Values Count:")
## [1] "Missing Values Count:"
print(missing_data)
## baseline.value
## 0
## accelerations
## 0
## fetal_movement
## 0
## uterine_contractions
## 0
## light_decelerations
## 0
## severe_decelerations
## 0
## prolongued_decelerations
## 0
## abnormal_short_term_variability
## 0
## mean_value_of_short_term_variability
## 0
## percentage_of_time_with_abnormal_long_term_variability
## 0
## mean_value_of_long_term_variability
## 0
## histogram_width
## 0
## histogram_min
## 0
## histogram_max
## 0
## histogram_number_of_peaks
## 0
## histogram_number_of_zeroes
## 0
## histogram_mode
## 0
## histogram_mean
## 0
## histogram_median
## 0
## histogram_variance
## 0
## histogram_tendency
## 0
## fetal_health
## 0
#---cek_duplikat
duplicate_rows <- sum(duplicated(data))
cat("Jumlah duplikat dalam dataset: ", duplicate_rows, "\n")
## Jumlah duplikat dalam dataset: 13
#---Hapus duplikat
data <- data[!duplicated(data), ]
cat("Dataset setelah menghapus duplikat: ", nrow(data), "baris\n")
## Dataset setelah menghapus duplikat: 2113 baris
#---target to faktor
data$fetal_health <- as.factor(data$fetal_health)
colnames(data)
## [1] "baseline.value"
## [2] "accelerations"
## [3] "fetal_movement"
## [4] "uterine_contractions"
## [5] "light_decelerations"
## [6] "severe_decelerations"
## [7] "prolongued_decelerations"
## [8] "abnormal_short_term_variability"
## [9] "mean_value_of_short_term_variability"
## [10] "percentage_of_time_with_abnormal_long_term_variability"
## [11] "mean_value_of_long_term_variability"
## [12] "histogram_width"
## [13] "histogram_min"
## [14] "histogram_max"
## [15] "histogram_number_of_peaks"
## [16] "histogram_number_of_zeroes"
## [17] "histogram_mode"
## [18] "histogram_mean"
## [19] "histogram_median"
## [20] "histogram_variance"
## [21] "histogram_tendency"
## [22] "fetal_health"
#---hapus multikonielaritas data krn jika tdk dihapus,bisa terjadi redudansi yang menyebabkan regresi tidak bisa mengestimasi koefisien untuk semua variabel
data$histogram_max <- NULL
#============================= 2. uji asumsi (VIF) ========================#
library(car)
library(dplyr)
#---Buat model VIF hanya dari prediktor, tanpa pake target
predictors_only <- dplyr::select(data, -fetal_health)
model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)
print("VIF antar prediktor:")
## [1] "VIF antar prediktor:"
print(vif_values)
## baseline.value
## 6.681407
## accelerations
## 2.885972
## fetal_movement
## 1.138681
## uterine_contractions
## 1.281532
## light_decelerations
## 3.449187
## severe_decelerations
## 1.134936
## prolongued_decelerations
## 2.736979
## abnormal_short_term_variability
## 1.916973
## mean_value_of_short_term_variability
## 2.972435
## percentage_of_time_with_abnormal_long_term_variability
## 1.875265
## mean_value_of_long_term_variability
## 2.078607
## histogram_width
## 17.795686
## histogram_min
## 19.707950
## histogram_number_of_peaks
## 2.366522
## histogram_number_of_zeroes
## 1.184267
## histogram_mode
## 8.982942
## histogram_mean
## 20.070464
## histogram_median
## 26.780804
## histogram_variance
## 2.590868
## histogram_tendency
## 2.880995
#---Jika ada VIF > 5 atau 10, hapus
predictors_only <- predictors_only %>% dplyr::select(- histogram_mean) #isi dgn var yng nilainya >10
#---cek vif lagi (stlh dihapus)
model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)
print("VIF antar prediktor (setelah penghapusan):")
## [1] "VIF antar prediktor (setelah penghapusan):"
print(vif_values)
## baseline.value
## 6.227538
## accelerations
## 2.642784
## fetal_movement
## 1.138046
## uterine_contractions
## 1.274577
## light_decelerations
## 3.211755
## severe_decelerations
## 1.133418
## prolongued_decelerations
## 2.635461
## abnormal_short_term_variability
## 1.766872
## mean_value_of_short_term_variability
## 2.664290
## percentage_of_time_with_abnormal_long_term_variability
## 1.859580
## mean_value_of_long_term_variability
## 1.988585
## histogram_width
## 17.763757
## histogram_min
## 19.542255
## histogram_number_of_peaks
## 2.366111
## histogram_number_of_zeroes
## 1.180517
## histogram_mode
## 8.820754
## histogram_median
## 20.308901
## histogram_variance
## 2.590333
## histogram_tendency
## 2.880981
#================================== 3. EDA ========================================#
cat("Statistika Deskriptif:\n")
## Statistika Deskriptif:
summary(data)
## baseline.value accelerations fetal_movement uterine_contractions
## Min. :106.0 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:126.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.002000
## Median :133.0 Median :0.002000 Median :0.000000 Median :0.005000
## Mean :133.3 Mean :0.003188 Mean :0.009517 Mean :0.004387
## 3rd Qu.:140.0 3rd Qu.:0.006000 3rd Qu.:0.003000 3rd Qu.:0.007000
## Max. :160.0 Max. :0.019000 Max. :0.481000 Max. :0.015000
## light_decelerations severe_decelerations prolongued_decelerations
## Min. :0.000000 Min. :0.000e+00 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.000e+00 1st Qu.:0.0000000
## Median :0.000000 Median :0.000e+00 Median :0.0000000
## Mean :0.001901 Mean :3.313e-06 Mean :0.0001595
## 3rd Qu.:0.003000 3rd Qu.:0.000e+00 3rd Qu.:0.0000000
## Max. :0.015000 Max. :1.000e-03 Max. :0.0050000
## abnormal_short_term_variability mean_value_of_short_term_variability
## Min. :12.00 Min. :0.200
## 1st Qu.:32.00 1st Qu.:0.700
## Median :49.00 Median :1.200
## Mean :46.99 Mean :1.335
## 3rd Qu.:61.00 3rd Qu.:1.700
## Max. :87.00 Max. :7.000
## percentage_of_time_with_abnormal_long_term_variability
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 9.795
## 3rd Qu.:11.000
## Max. :91.000
## mean_value_of_long_term_variability histogram_width histogram_min
## Min. : 0.000 Min. : 3.00 Min. : 50.00
## 1st Qu.: 4.600 1st Qu.: 37.00 1st Qu.: 67.00
## Median : 7.400 Median : 68.00 Median : 93.00
## Mean : 8.167 Mean : 70.54 Mean : 93.56
## 3rd Qu.:10.800 3rd Qu.:100.00 3rd Qu.:120.00
## Max. :50.700 Max. :180.00 Max. :159.00
## histogram_number_of_peaks histogram_number_of_zeroes histogram_mode
## Min. : 0.000 Min. : 0.0000 Min. : 60.0
## 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.:129.0
## Median : 4.000 Median : 0.0000 Median :139.0
## Mean : 4.077 Mean : 0.3256 Mean :137.5
## 3rd Qu.: 6.000 3rd Qu.: 0.0000 3rd Qu.:148.0
## Max. :18.000 Max. :10.0000 Max. :187.0
## histogram_mean histogram_median histogram_variance histogram_tendency
## Min. : 73.0 Min. : 77.0 Min. : 0.00 Min. :-1.0000
## 1st Qu.:125.0 1st Qu.:129.0 1st Qu.: 2.00 1st Qu.: 0.0000
## Median :136.0 Median :139.0 Median : 7.00 Median : 0.0000
## Mean :134.6 Mean :138.1 Mean : 18.91 Mean : 0.3185
## 3rd Qu.:145.0 3rd Qu.:148.0 3rd Qu.: 24.00 3rd Qu.: 1.0000
## Max. :182.0 Max. :186.0 Max. :269.00 Max. : 1.0000
## fetal_health
## 1:1646
## 2: 292
## 3: 175
##
##
##
#--Distribusi variabel target (fetal_health)
cat("\nDistribusi Target (fetal_health):\n")
##
## Distribusi Target (fetal_health):
print(table(data$fetal_health))
##
## 1 2 3
## 1646 292 175
library(ggplot2)
ggplot(data, aes(x = factor(fetal_health))) +
geom_bar(fill = "steelblue") +
labs(title = "Distribusi Kesehatan Janin (fetal_health)", x = "Kategori Kesehatan", y = "Jumlah") +
theme_minimal()

#--korelasi
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_cols <- data[, sapply(data, is.numeric)]
corrplot(cor(numeric_cols), method = "color",
tl.cex = 0.3, number.cex = 0.2,
col = colorRampPalette(c("blue", "white", "red"))(200),
type = "full", addCoef.col = "black")

#--Boxplot semua variabel numerik terhadap fetal_health
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))
for (col in num_vars) {
boxplot(data[[col]] ~ data$fetal_health,
main = paste("Boxplot:", col), xlab = "Fetal Health", ylab = col)
}


#--distribusi
par(mfrow = c(3, 3))

for (col in num_vars) {
plot(density(data[[col]]), main = paste("Density:", col),
xlab = col, col = "blue", lwd = 2)
}


#--Fungsi untuk winsorizing menggunakan IQR
winsorize_iqr <- function(dataset) {
for (col in names(dataset)) {
if (is.numeric(dataset[[col]])) {
Q1 <- quantile(dataset[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(dataset[[col]], 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
dataset[[col]][dataset[[col]] < lower_bound] <- lower_bound
dataset[[col]][dataset[[col]] > upper_bound] <- upper_bound
}
}
return(dataset)
}
#--Terapkan winsorization pada dataset
data <- winsorize_iqr(data)
#--visualisasi setelah penanganan outliers
par(mfrow = c(3, 3))

for (col in num_vars) {
boxplot(data[[col]],
main = paste("Sesudah -", col),
col = "lightblue", border = "black")
}


# =========== 4. Klasifikasi: Multinomial Logistic Regression ========= #
library(nnet)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
set.seed(123)
#--Split data: 80% train, 20% test
splitIndex <- createDataPartition(data$fetal_health, p = 0.8, list = FALSE)
train_data <- data[splitIndex, ]
test_data <- data[-splitIndex, ]
#--Pastikan target adalah faktor
train_data$fetal_health <- as.factor(train_data$fetal_health)
test_data$fetal_health <- as.factor(test_data$fetal_health)
#--Model multinomial logistic regression
model_multi <- multinom(fetal_health ~ ., data = train_data)
## # weights: 66 (42 variable)
## initial value 1857.753380
## iter 10 value 671.586762
## iter 20 value 624.952705
## iter 30 value 523.283409
## iter 40 value 506.165191
## iter 50 value 455.069285
## iter 60 value 446.665918
## iter 70 value 416.834690
## iter 80 value 410.271349
## iter 90 value 410.007417
## iter 100 value 410.005957
## final value 410.005957
## stopped after 100 iterations
#--Prediksi
prediksi <- predict(model_multi, newdata = test_data)
#--Confusion Matrix
confusion_matrix <- table(Predicted = prediksi, Actual = test_data$fetal_health)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
## Actual
## Predicted 1 2 3
## 1 313 15 2
## 2 15 41 5
## 3 1 2 28
#--visualisasi cf
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
#--Buat dataframe dari confusion matrix
cm_df <- as.data.frame(confusion_matrix)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")
#--Visualisasi heatmap confusion matrix
ggplot(data = cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", color = "black") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix", x = "Actual Label", y = "Predicted Label") +
theme_minimal()
#--Akurasi
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Akurasi:", round(accuracy * 100, 2), "%\n")
## Akurasi: 90.52 %
#--Precision, Recall, F1 per kelas
precision <- diag(confusion_matrix) / colSums(confusion_matrix)
recall <- diag(confusion_matrix) / rowSums(confusion_matrix)
f1 <- 2 * precision * recall / (precision + recall)
#--macro
macro_precision <- mean(precision, na.rm = TRUE)
macro_recall <- mean(recall, na.rm = TRUE)
macro_f1 <- mean(f1, na.rm = TRUE)
cat("Macro Precision:", round(macro_precision, 4), "\n")
## Macro Precision: 0.8194
cat("Macro Recall:", round(macro_recall, 4), "\n")
## Macro Recall: 0.8413
cat("Macro F1 Score:", round(macro_f1, 4), "\n")
## Macro F1 Score: 0.8292
#===================================== 5. perceptual mapping=======================#
#--Ambil data numerik
numeric_features <- data[, sapply(data, is.numeric)]
#--Hitung jarak antar data
distance_matrix <- dist(scale(numeric_features)) # penting: dist perlu data diskalakan
#--Jalankan MDS
mds_result <- cmdscale(distance_matrix, k = 2)
#--Buat dataframe untuk visualisasi
mds_df <- data.frame(
Dim1 = mds_result[,1],
Dim2 = mds_result[,2],
Label = data$fetal_health
)
#--Visualisasi MDS
ggplot(mds_df, aes(x = Dim1, y = Dim2, color = Label)) +
geom_point(size = 2, alpha = 0.7) +
labs(title = "Perceptual Mapping (MDS)", x = "Dimensi 1", y = "Dimensi 2") +
theme_minimal() +
scale_color_brewer(palette = "Set2")
