# ================================== 1. PREPROCESSING =====================================#
data <- read.csv("fetal_health.csv")
#--Cek missing value
missing_data <- sapply(data, function(x) sum(is.na(x)))
print("Missing Values Count:")
## [1] "Missing Values Count:"
print(missing_data)
## baseline.value
## 0
## accelerations
## 0
## fetal_movement
## 0
## uterine_contractions
## 0
## light_decelerations
## 0
## severe_decelerations
## 0
## prolongued_decelerations
## 0
## abnormal_short_term_variability
## 0
## mean_value_of_short_term_variability
## 0
## percentage_of_time_with_abnormal_long_term_variability
## 0
## mean_value_of_long_term_variability
## 0
## histogram_width
## 0
## histogram_min
## 0
## histogram_max
## 0
## histogram_number_of_peaks
## 0
## histogram_number_of_zeroes
## 0
## histogram_mode
## 0
## histogram_mean
## 0
## histogram_median
## 0
## histogram_variance
## 0
## histogram_tendency
## 0
## fetal_health
## 0
#--Cek dan hapus duplikat
duplicate_rows <- sum(duplicated(data))
cat("Jumlah duplikat dalam dataset: ", duplicate_rows, "\n")
## Jumlah duplikat dalam dataset: 13
data <- data[!duplicated(data), ]
cat("Dataset setelah menghapus duplikat: ", nrow(data), "baris\n")
## Dataset setelah menghapus duplikat: 2113 baris
#--Ubah target menjadi faktor
data$fetal_health <- as.factor(data$fetal_health)
#--Handling Outliers
num_vars <- names(data)[sapply(data, is.numeric)]
winsorize_iqr <- function(dataset) {
for (col in names(dataset)) {
if (is.numeric(dataset[[col]])) {
Q1 <- quantile(dataset[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(dataset[[col]], 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
dataset[[col]][dataset[[col]] < lower_bound] <- lower_bound
dataset[[col]][dataset[[col]] > upper_bound] <- upper_bound
}
}
return(dataset)
}
#--Terapkan winsorization pada dataset
data <- winsorize_iqr(data)
#================================== 2. EDA ========================================#
cat("Statistika Deskriptif:\n")
## Statistika Deskriptif:
summary(data)
## baseline.value accelerations fetal_movement uterine_contractions
## Min. :106.0 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:126.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.002000
## Median :133.0 Median :0.002000 Median :0.000000 Median :0.005000
## Mean :133.3 Mean :0.003177 Mean :0.001747 Mean :0.004387
## 3rd Qu.:140.0 3rd Qu.:0.006000 3rd Qu.:0.003000 3rd Qu.:0.007000
## Max. :160.0 Max. :0.015000 Max. :0.007500 Max. :0.014500
## light_decelerations severe_decelerations prolongued_decelerations
## Min. :0.000000 Min. :0 Min. :0
## 1st Qu.:0.000000 1st Qu.:0 1st Qu.:0
## Median :0.000000 Median :0 Median :0
## Mean :0.001744 Mean :0 Mean :0
## 3rd Qu.:0.003000 3rd Qu.:0 3rd Qu.:0
## Max. :0.007500 Max. :0 Max. :0
## abnormal_short_term_variability mean_value_of_short_term_variability
## Min. :12.00 Min. :0.200
## 1st Qu.:32.00 1st Qu.:0.700
## Median :49.00 Median :1.200
## Mean :46.99 Mean :1.302
## 3rd Qu.:61.00 3rd Qu.:1.700
## Max. :87.00 Max. :3.200
## percentage_of_time_with_abnormal_long_term_variability
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 6.631
## 3rd Qu.:11.000
## Max. :27.500
## mean_value_of_long_term_variability histogram_width histogram_min
## Min. : 0.00 Min. : 3.00 Min. : 50.00
## 1st Qu.: 4.60 1st Qu.: 37.00 1st Qu.: 67.00
## Median : 7.40 Median : 68.00 Median : 93.00
## Mean : 7.98 Mean : 70.54 Mean : 93.56
## 3rd Qu.:10.80 3rd Qu.:100.00 3rd Qu.:120.00
## Max. :20.10 Max. :180.00 Max. :159.00
## histogram_max histogram_number_of_peaks histogram_number_of_zeroes
## Min. :122.0 Min. : 0.00 Min. :0
## 1st Qu.:152.0 1st Qu.: 2.00 1st Qu.:0
## Median :162.0 Median : 4.00 Median :0
## Mean :163.9 Mean : 4.06 Mean :0
## 3rd Qu.:174.0 3rd Qu.: 6.00 3rd Qu.:0
## Max. :207.0 Max. :12.00 Max. :0
## histogram_mode histogram_mean histogram_median histogram_variance
## Min. :100.5 Min. : 95.0 Min. :100.5 Min. : 0.00
## 1st Qu.:129.0 1st Qu.:125.0 1st Qu.:129.0 1st Qu.: 2.00
## Median :139.0 Median :136.0 Median :139.0 Median : 7.00
## Mean :137.9 Mean :134.8 Mean :138.2 Mean :15.66
## 3rd Qu.:148.0 3rd Qu.:145.0 3rd Qu.:148.0 3rd Qu.:24.00
## Max. :176.5 Max. :175.0 Max. :176.5 Max. :57.00
## histogram_tendency fetal_health
## Min. :-1.0000 1:1646
## 1st Qu.: 0.0000 2: 292
## Median : 0.0000 3: 175
## Mean : 0.3185
## 3rd Qu.: 1.0000
## Max. : 1.0000
#--Distribusi variabel target (fetal_health)
cat("\nDistribusi Target (fetal_health):\n")
##
## Distribusi Target (fetal_health):
print(table(data$fetal_health))
##
## 1 2 3
## 1646 292 175
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
ggplot(data, aes(x = factor(fetal_health))) +
geom_bar(fill = "steelblue") +
labs(title = "Distribusi Kesehatan Janin (fetal_health)", x = "Kategori Kesehatan", y = "Jumlah") +
theme_minimal()

#--korelasi
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_cols <- data[, sapply(data, is.numeric)]
corrplot(cor(numeric_cols), method = "color",
tl.cex = 0.3, number.cex = 0.2,
col = colorRampPalette(c("blue", "white", "red"))(200),
type = "full", addCoef.col = "black")
## Warning in cor(numeric_cols): the standard deviation is zero

#--Boxplot semua variabel numerik terhadap fetal_health
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))
for (col in num_vars) {
boxplot(data[[col]] ~ data$fetal_health,
main = paste("Boxplot:", col), xlab = "Fetal Health", ylab = col)
}


#--distribusi
par(mfrow = c(3, 3))

for (col in num_vars) {
plot(density(data[[col]]), main = paste("Density:", col),
xlab = col, col = "blue", lwd = 2)
}


#--visualisasi setelah penanganan outliers
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))

for (col in num_vars) {
boxplot(data[[col]],
main = paste("Sesudah -", col),
col = "lightblue", border = "black")
}


# ================================== 3. SMOTE ==============================================#
library(UBL)
## Warning: package 'UBL' was built under R version 4.4.3
## Loading required package: MBA
## Warning: package 'MBA' was built under R version 4.4.3
## Loading required package: gstat
## Warning: package 'gstat' was built under R version 4.4.3
## Loading required package: automap
## Warning: package 'automap' was built under R version 4.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.4.3
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
data_smote <- SmoteClassif(fetal_health ~ ., data, C.perc = "balance")
cat("Distribusi setelah SMOTE:\n")
## Distribusi setelah SMOTE:
print(table(data_smote$fetal_health))
##
## 1 2 3
## 704 703 704
# ================================== 4. PCA =====================================#
#---Ambil hanya prediktor numerik (exclude target dan faktor)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
numeric_predictors <- data_smote %>% dplyr::select_if(is.numeric)
non_constant_predictors <- numeric_predictors[, apply(numeric_predictors, 2, var) != 0]
#---Standardisasi data
numeric_scaled <- scale(non_constant_predictors)
#---PCA
pca_result <- prcomp(numeric_scaled, center = TRUE, scale. = TRUE)
#---Visualisasi varians masing-masing PC
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_eig(pca_result)
#--Ambil PC dengan kumulatif varians total > 80%
explained_var <- summary(pca_result)$importance[3, ]
n_components <- which(explained_var >= 0.80)[1]
cat("Jumlah komponen utama yang digunakan:", n_components, "\n")
## Jumlah komponen utama yang digunakan: 5
#--Ambil data hasil PCA (hanya komponen terpilih)
pca_data <- as.data.frame(pca_result$x[, 1:n_components])
colnames(pca_data) <- paste0("PC", 1:n_components)
#--Tambahkan kolom target kembali
data_pca <- cbind(pca_data, fetal_health = data_smote$fetal_health)
#================================= 5. Klasifikasi LDA =======================================#
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
library(factoextra)
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
#---Uji Normalitas Multivariat per kelas (Mardia)
cat("\n=== Uji Normalitas Multivariat (Mardia) ===\n")
##
## === Uji Normalitas Multivariat (Mardia) ===
for (kelas in unique(data_pca$fetal_health)) {
cat("\nKelas:", kelas, "\n")
hasil_mvn <- mvn(data_pca[data_pca$fetal_health == kelas, -which(names(data_pca) == "fetal_health")],
mvnTest = "mardia", multivariatePlot = "none")
print(hasil_mvn$multivariateNormality)
}
##
## Kelas: 1
## Test Statistic p value Result
## 1 Mardia Skewness 343.308826772124 2.71762036315286e-52 NO
## 2 Mardia Kurtosis -2.26205564963263 0.0236939652652923 NO
## 3 MVN <NA> <NA> NO
##
## Kelas: 2
## Test Statistic p value Result
## 1 Mardia Skewness 1369.21204695965 3.48402842503099e-265 NO
## 2 Mardia Kurtosis 20.9112427351724 0 NO
## 3 MVN <NA> <NA> NO
##
## Kelas: 3
## Test Statistic p value Result
## 1 Mardia Skewness 940.900219937538 7.33511401092477e-175 NO
## 2 Mardia Kurtosis 3.52217694966165 0.00042801834812578 NO
## 3 MVN <NA> <NA> NO
#---Uji Homogenitas Varians-Kovarians (Box's M Test)
cat("\n=== Uji Homogenitas Varians-Kovarians (Box's M Test) ===\n")
##
## === Uji Homogenitas Varians-Kovarians (Box's M Test) ===
boxm_result <- boxM(data_pca[, -which(names(data_pca) == "fetal_health")], data_pca$fetal_health)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_pca[, -which(names(data_pca) == "fetal_health")]
## Chi-Sq (approx.) = 2044.3, df = 30, p-value < 2.2e-16
#---mengubah target to faktor
data_pca$fetal_health <- as.factor(data_pca$fetal_health)
#---lda model
lda_model <- lda(fetal_health ~ ., data = data_pca)
#---uji signifikansi pakai wilks lambda
manova_lda <- manova(as.matrix(data_pca[, -which(names(data_pca) == "fetal_health")]) ~ data_pca$fetal_health)
summary_manova <- summary(manova_lda, test = "Wilks")
cat("\n=== Uji Signifikansi Model LDA (Wilks' Lambda) ===\n")
##
## === Uji Signifikansi Model LDA (Wilks' Lambda) ===
print(summary_manova)
## Df Wilks approx F num Df den Df Pr(>F)
## data_pca$fetal_health 2 0.25475 412.91 10 4208 < 2.2e-16 ***
## Residuals 2108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#---Uji signifikansi Variabel (Koefisien Diskriminan)
print(round(lda_model$scaling, 4))
## LD1 LD2
## PC1 -0.0869 -0.3778
## PC2 -0.6499 -0.1899
## PC3 -0.8342 0.4319
## PC4 -0.1819 -0.0925
## PC5 -0.2784 0.2933
#---prediksi
lda_pred <- predict(lda_model, newdata = data_pca)
#---confusion
confusion_matrix <- table(Predicted = lda_pred$class, Actual = data_pca$fetal_health)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
## Actual
## Predicted 1 2 3
## 1 522 56 21
## 2 110 598 156
## 3 72 49 527
#--heatmap
conf_mat_table <- table(Predicted = lda_pred$class, Actual = data_pca$fetal_health)
conf_mat_df <- as.data.frame(conf_mat_table)
ggplot(conf_mat_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 5, color = "black") +
scale_fill_gradient(low = "lightblue", high = "blue") +
labs(title = "Heatmap Confusion Matrix (LDA - In Sample)",
x = "Actual Class", y = "Predicted Class") +
theme_minimal()
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix) * 100
cat("Akurasi:", round(accuracy, 2), "%\n")
## Akurasi: 78.02 %
#===================================== 6. klasifikasi mlr =================================#
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(caret)
library(ggplot2)
library(reshape2)
set.seed(123)
#=============================uji asumsi (VIF) ========================#
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(dplyr)
#---Buat model VIF hanya dari prediktor, tanpa pake target
predictors_only <- dplyr::select(data_pca, -fetal_health)
model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)
print("VIF antar prediktor:")
## [1] "VIF antar prediktor:"
print(vif_values)
## PC1 PC2 PC3 PC4 PC5
## 1 1 1 1 1
#==============================Pemodelan==============================#
set.seed(123)
#---Mengubah target menjadi faktor
data_pca$fetal_health <- as.factor(data_pca$fetal_health)
#---Latih model multinomial logistic regression pada seluruh data
model_multi <- multinom(fetal_health ~ ., data = data_pca)
## # weights: 21 (12 variable)
## initial value 2319.170541
## iter 10 value 1178.768641
## iter 20 value 1062.889808
## final value 1062.881286
## converged
#=================== UJI SERENTAK (Likelihood Ratio Test) ===================#
# Model null
model_null <- multinom(fetal_health ~ 1, data = data_pca)
## # weights: 6 (2 variable)
## initial value 2319.170541
## final value 2319.170068
## converged
# Ambil log-likelihood
loglik_full <- logLik(model_multi)
loglik_null <- logLik(model_null)
# Hitung statistik LRT
lrt_stat <- 2 * (loglik_full - loglik_null)
df_diff <- attr(loglik_full, "df") - attr(loglik_null, "df")
p_value_lrt <- pchisq(lrt_stat, df = df_diff, lower.tail = FALSE)
cat("\n=== Uji Serentak (Likelihood Ratio Test) ===\n")
##
## === Uji Serentak (Likelihood Ratio Test) ===
cat("Statistik LRT =", round(lrt_stat, 3), "\n")
## Statistik LRT = 2512.578
cat("Derajat kebebasan =", df_diff, "\n")
## Derajat kebebasan = 10
cat("P-value =", p_value_lrt, "\n")
## P-value = 0
#===================== UJI PARSIAL (Wald Test) ========================#
summary_model <- summary(model_multi)
# Ambil koefisien dan standard error
coefs <- summary_model$coefficients
std_err <- summary_model$standard.errors
# Hitung z dan p-value
z_values <- coefs / std_err
p_values_wald <- 2 * (1 - pnorm(abs(z_values)))
cat("\n=== Uji Parsial (Wald Test) untuk Tiap Koefisien ===\n")
##
## === Uji Parsial (Wald Test) untuk Tiap Koefisien ===
print(round(p_values_wald, 4))
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.29 0.0000 0 0 0.0083 0
## 3 0.00 0.0017 0 0 0.0106 0
#---Prediksi
prediksi <- predict(model_multi, newdata = data_pca)
#---Confusion Matrix
confusion_matrix <- table(Predicted = prediksi, Actual = data_pca$fetal_health)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
## Actual
## Predicted 1 2 3
## 1 562 66 42
## 2 84 566 104
## 3 58 71 558
#---Visualisasi confusion matrix sebagai heatmap
cm_df <- as.data.frame(confusion_matrix)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")
ggplot(data = cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", color = "black") +
scale_fill_gradient(low = "lightblue", high = "blue") +
labs(title = "Confusion Matrix (Multinomial Logistic Regression - In Sample)",
x = "Actual Label", y = "Predicted Label") +
theme_minimal()
#---Hitung akurasi
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Akurasi:", round(accuracy * 100, 2), "%\n")
## Akurasi: 79.87 %
#--- Interpretasi menggunakan odds ratio
odds_ratios <- exp(coefs)
cat("\n=== Odds Ratio ===\n")
##
## === Odds Ratio ===
print(round(odds_ratios, 3))
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.900 1.784 0.711 0.242 1.192 0.524
## 3 0.403 0.894 0.152 0.092 0.814 0.333
