# Set working directory
setwd("D:/analisis multivariat_smt4")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(UBL)
## Warning: package 'UBL' was built under R version 4.4.3
## Loading required package: MBA
## Warning: package 'MBA' was built under R version 4.4.3
## Loading required package: gstat
## Warning: package 'gstat' was built under R version 4.4.3
## Loading required package: automap
## Warning: package 'automap' was built under R version 4.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.4.3
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:UBL':
## 
##     phi
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(mvnormtest)
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
library(ggplot2)
library(tidyr)

# Set seed for reproducibility
set.seed(42)

Load Dataset

data <- read.csv("fetal_health.csv")
summary(data)
##  baseline.value  accelerations      fetal_movement     uterine_contractions
##  Min.   :106.0   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000    
##  1st Qu.:126.0   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.002000    
##  Median :133.0   Median :0.002000   Median :0.000000   Median :0.004000    
##  Mean   :133.3   Mean   :0.003178   Mean   :0.009481   Mean   :0.004366    
##  3rd Qu.:140.0   3rd Qu.:0.006000   3rd Qu.:0.003000   3rd Qu.:0.007000    
##  Max.   :160.0   Max.   :0.019000   Max.   :0.481000   Max.   :0.015000    
##  light_decelerations severe_decelerations prolongued_decelerations
##  Min.   :0.000000    Min.   :0.000e+00    Min.   :0.0000000       
##  1st Qu.:0.000000    1st Qu.:0.000e+00    1st Qu.:0.0000000       
##  Median :0.000000    Median :0.000e+00    Median :0.0000000       
##  Mean   :0.001889    Mean   :3.293e-06    Mean   :0.0001585       
##  3rd Qu.:0.003000    3rd Qu.:0.000e+00    3rd Qu.:0.0000000       
##  Max.   :0.015000    Max.   :1.000e-03    Max.   :0.0050000       
##  abnormal_short_term_variability mean_value_of_short_term_variability
##  Min.   :12.00                   Min.   :0.200                       
##  1st Qu.:32.00                   1st Qu.:0.700                       
##  Median :49.00                   Median :1.200                       
##  Mean   :46.99                   Mean   :1.333                       
##  3rd Qu.:61.00                   3rd Qu.:1.700                       
##  Max.   :87.00                   Max.   :7.000                       
##  percentage_of_time_with_abnormal_long_term_variability
##  Min.   : 0.000                                        
##  1st Qu.: 0.000                                        
##  Median : 0.000                                        
##  Mean   : 9.847                                        
##  3rd Qu.:11.000                                        
##  Max.   :91.000                                        
##  mean_value_of_long_term_variability histogram_width  histogram_min   
##  Min.   : 0.000                      Min.   :  3.00   Min.   : 50.00  
##  1st Qu.: 4.600                      1st Qu.: 37.00   1st Qu.: 67.00  
##  Median : 7.400                      Median : 67.50   Median : 93.00  
##  Mean   : 8.188                      Mean   : 70.45   Mean   : 93.58  
##  3rd Qu.:10.800                      3rd Qu.:100.00   3rd Qu.:120.00  
##  Max.   :50.700                      Max.   :180.00   Max.   :159.00  
##  histogram_max histogram_number_of_peaks histogram_number_of_zeroes
##  Min.   :122   Min.   : 0.000            Min.   : 0.0000           
##  1st Qu.:152   1st Qu.: 2.000            1st Qu.: 0.0000           
##  Median :162   Median : 3.000            Median : 0.0000           
##  Mean   :164   Mean   : 4.068            Mean   : 0.3236           
##  3rd Qu.:174   3rd Qu.: 6.000            3rd Qu.: 0.0000           
##  Max.   :238   Max.   :18.000            Max.   :10.0000           
##  histogram_mode  histogram_mean  histogram_median histogram_variance
##  Min.   : 60.0   Min.   : 73.0   Min.   : 77.0    Min.   :  0.00    
##  1st Qu.:129.0   1st Qu.:125.0   1st Qu.:129.0    1st Qu.:  2.00    
##  Median :139.0   Median :136.0   Median :139.0    Median :  7.00    
##  Mean   :137.5   Mean   :134.6   Mean   :138.1    Mean   : 18.81    
##  3rd Qu.:148.0   3rd Qu.:145.0   3rd Qu.:148.0    3rd Qu.: 24.00    
##  Max.   :187.0   Max.   :182.0   Max.   :186.0    Max.   :269.00    
##  histogram_tendency  fetal_health  
##  Min.   :-1.0000    Min.   :1.000  
##  1st Qu.: 0.0000    1st Qu.:1.000  
##  Median : 0.0000    Median :1.000  
##  Mean   : 0.3203    Mean   :1.304  
##  3rd Qu.: 1.0000    3rd Qu.:1.000  
##  Max.   : 1.0000    Max.   :3.000

Korelasi Awal

# Mengambil hanya kolom numerik
numeric_data <- data[, sapply(data, is.numeric)]

cor_matrix <- cor(numeric_data, use = "pairwise.complete.obs")

# Visualisasi 
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45, 
         addCoef.col = "black", number.cex = 0.2,
         tl.cex = 0.4,
         cl.cex = 0.7,
         mar = c(0, 0, 2, 0),
         title = "Correlation Matrix Before Preprocessing")

Missing Value

missing_values <- colSums(is.na(data))
cat("Missing Values:\n")
## Missing Values:
print(missing_values)
##                                         baseline.value 
##                                                      0 
##                                          accelerations 
##                                                      0 
##                                         fetal_movement 
##                                                      0 
##                                   uterine_contractions 
##                                                      0 
##                                    light_decelerations 
##                                                      0 
##                                   severe_decelerations 
##                                                      0 
##                               prolongued_decelerations 
##                                                      0 
##                        abnormal_short_term_variability 
##                                                      0 
##                   mean_value_of_short_term_variability 
##                                                      0 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                      0 
##                    mean_value_of_long_term_variability 
##                                                      0 
##                                        histogram_width 
##                                                      0 
##                                          histogram_min 
##                                                      0 
##                                          histogram_max 
##                                                      0 
##                              histogram_number_of_peaks 
##                                                      0 
##                             histogram_number_of_zeroes 
##                                                      0 
##                                         histogram_mode 
##                                                      0 
##                                         histogram_mean 
##                                                      0 
##                                       histogram_median 
##                                                      0 
##                                     histogram_variance 
##                                                      0 
##                                     histogram_tendency 
##                                                      0 
##                                           fetal_health 
##                                                      0
# Bar chart untuk missing values
missing_data <- data.frame(
  Variable = names(missing_values),
  Missing_Count = missing_values
)

p_missing <- ggplot(missing_data, aes(x = Variable, y = Missing_Count, fill = Variable)) +
  geom_bar(stat = "identity") +
  labs(title = "Missing Values per Variable", x = "Variable", y = "Count of Missing Values") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_missing

Deteksi Outlier

detect_outliers <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
  return(outliers)
}

# Hitung outliers untuk tiap kolom (tanpa target)
outliers_report <- sapply(data[, -ncol(data)], detect_outliers)
cat("Outliers Count per Feature:\n")
## Outliers Count per Feature:
print(outliers_report)
##                                         baseline.value 
##                                                      0 
##                                          accelerations 
##                                                     14 
##                                         fetal_movement 
##                                                    307 
##                                   uterine_contractions 
##                                                      1 
##                                    light_decelerations 
##                                                    150 
##                                   severe_decelerations 
##                                                      7 
##                               prolongued_decelerations 
##                                                    178 
##                        abnormal_short_term_variability 
##                                                      0 
##                   mean_value_of_short_term_variability 
##                                                     70 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                    309 
##                    mean_value_of_long_term_variability 
##                                                     71 
##                                        histogram_width 
##                                                      0 
##                                          histogram_min 
##                                                      0 
##                                          histogram_max 
##                                                     24 
##                              histogram_number_of_peaks 
##                                                     19 
##                             histogram_number_of_zeroes 
##                                                    502 
##                                         histogram_mode 
##                                                     73 
##                                         histogram_mean 
##                                                     45 
##                                       histogram_median 
##                                                     28 
##                                     histogram_variance 
##                                                    184 
##                                     histogram_tendency 
##                                                      0

Boxplot Sebelum Preprocessing

data_long <- pivot_longer(data[, -ncol(data)], cols = everything(), names_to = "Variable", values_to = "Value")

# Boxplot
p_box_before <- ggplot(data_long, aes(x = Variable, y = Value, fill = Variable)) +
  geom_boxplot(show.legend = FALSE) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p_box_before

Winsorizing untuk Mengurangi Outlier

winsorize <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  x[x < lower_bound] <- lower_bound
  x[x > upper_bound] <- upper_bound
  return(x)
}

# Terapkan winsorizing ke semua fitur kecuali target
data_winsorized <- data
data_winsorized[, -ncol(data)] <- lapply(data[, -ncol(data)], winsorize)

# Cek outlier setelah winsorizing
outliers_report_winsorized <- sapply(data_winsorized[, -ncol(data_winsorized)], detect_outliers)
cat("Outliers Count per Feature After Winsorizing:\n")
## Outliers Count per Feature After Winsorizing:
print(outliers_report_winsorized)
##                                         baseline.value 
##                                                      0 
##                                          accelerations 
##                                                      0 
##                                         fetal_movement 
##                                                      0 
##                                   uterine_contractions 
##                                                      0 
##                                    light_decelerations 
##                                                      0 
##                                   severe_decelerations 
##                                                      0 
##                               prolongued_decelerations 
##                                                      0 
##                        abnormal_short_term_variability 
##                                                      0 
##                   mean_value_of_short_term_variability 
##                                                      0 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                      0 
##                    mean_value_of_long_term_variability 
##                                                      0 
##                                        histogram_width 
##                                                      0 
##                                          histogram_min 
##                                                      0 
##                                          histogram_max 
##                                                      0 
##                              histogram_number_of_peaks 
##                                                      0 
##                             histogram_number_of_zeroes 
##                                                      0 
##                                         histogram_mode 
##                                                      0 
##                                         histogram_mean 
##                                                      0 
##                                       histogram_median 
##                                                      0 
##                                     histogram_variance 
##                                                      0 
##                                     histogram_tendency 
##                                                      0

Visualisasi Boxplot Setelah Winsorizing

data_winsorized_long <- pivot_longer(data_winsorized[, -ncol(data_winsorized)], 
                                     cols = everything(), 
                                     names_to = "Variable", values_to = "Value")

# Boxplot setelah winsorizing
p_box_after <- ggplot(data_winsorized_long, aes(x = Variable, y = Value, fill = Variable)) +
  geom_boxplot() +
  labs(title = "Box Plot of Features After Winsorizing", x = "Variable", y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_box_after

Pengecekan dan Penghapusan Kolom Konstan atau Nol Varians

# Hitung varians tiap kolom numerik
variances <- apply(data_winsorized[, -ncol(data_winsorized)], 2, var)

# Deteksi kolom dengan varians nol atau NA
zero_variance_cols <- names(variances[variances == 0 | is.na(variances)])

if (length(zero_variance_cols) > 0) {
  cat("Removing constant or zero-variance columns:", zero_variance_cols, "\n")
  data_winsorized <- data_winsorized[, !names(data_winsorized) %in% zero_variance_cols]
} else {
  cat("No constant or zero-variance columns detected.\n")
}
## Removing constant or zero-variance columns: severe_decelerations prolongued_decelerations histogram_number_of_zeroes

Deteksi dan Penghapusan Baris Duplikat

# Jumlah baris duplikat
duplicates <- sum(duplicated(data_winsorized))
cat("Number of Duplicate Rows:\n")
## Number of Duplicate Rows:
print(duplicates)
## [1] 17
# Hapus baris duplikat
data_winsorized <- data_winsorized[!duplicated(data_winsorized), ]
cat("Dataset Dimensions after Removing Duplicates:\n")
## Dataset Dimensions after Removing Duplicates:
print(dim(data_winsorized))
## [1] 2109   19

Penghapusan Kolom yang Sangat Berkorelasi

# Fungsi hapus fitur yang sangat berkorelasi
remove_correlated <- function(X, threshold = 0.8) {
  cor_matrix <- cor(X, use = "complete.obs")
  high_cor <- which(abs(cor_matrix) > threshold & upper.tri(cor_matrix), arr.ind = TRUE)
  if (nrow(high_cor) > 0) {
    to_remove <- unique(high_cor[, 2])
    cat("Removing highly correlated variables:", colnames(X)[to_remove], "\n")
    X <- X[, -to_remove, drop = FALSE]
  } else {
    cat("No highly correlated variables detected.\n")
  }
  return(X)
}

# Terapkan pada data winsorized tanpa target
numerical_features <- data_winsorized[, -which(names(data_winsorized) == "fetal_health")]
numerical_features <- remove_correlated(numerical_features, threshold = 0.8)
## Removing highly correlated variables: histogram_min histogram_mean histogram_median
# Gabungkan kembali dengan target
data_winsorized <- cbind(numerical_features, fetal_health = data_winsorized$fetal_health)

# Tampilkan matriks korelasi setelah penghapusan
cor_matrix <- cor(numerical_features, use = "complete.obs")
cat("Correlation Matrix After Removing Highly Correlated Columns:\n")
## Correlation Matrix After Removing Highly Correlated Columns:
print(round(cor_matrix, 3))
##                                                        baseline.value
## baseline.value                                                  1.000
## accelerations                                                  -0.081
## fetal_movement                                                 -0.024
## uterine_contractions                                           -0.148
## light_decelerations                                            -0.162
## abnormal_short_term_variability                                 0.303
## mean_value_of_short_term_variability                           -0.312
## percentage_of_time_with_abnormal_long_term_variability          0.332
## mean_value_of_long_term_variability                            -0.046
## histogram_width                                                -0.146
## histogram_max                                                   0.294
## histogram_number_of_peaks                                      -0.111
## histogram_mode                                                  0.765
## histogram_variance                                             -0.175
## histogram_tendency                                              0.295
##                                                        accelerations
## baseline.value                                                -0.081
## accelerations                                                  1.000
## fetal_movement                                                 0.088
## uterine_contractions                                           0.088
## light_decelerations                                           -0.098
## abnormal_short_term_variability                               -0.280
## mean_value_of_short_term_variability                           0.232
## percentage_of_time_with_abnormal_long_term_variability        -0.425
## mean_value_of_long_term_variability                           -0.157
## histogram_width                                                0.299
## histogram_max                                                  0.411
## histogram_number_of_peaks                                      0.194
## histogram_mode                                                 0.247
## histogram_variance                                             0.204
## histogram_tendency                                             0.030
##                                                        fetal_movement
## baseline.value                                                 -0.024
## accelerations                                                   0.088
## fetal_movement                                                  1.000
## uterine_contractions                                           -0.317
## light_decelerations                                            -0.018
## abnormal_short_term_variability                                 0.125
## mean_value_of_short_term_variability                            0.094
## percentage_of_time_with_abnormal_long_term_variability         -0.032
## mean_value_of_long_term_variability                            -0.029
## histogram_width                                                 0.182
## histogram_max                                                   0.127
## histogram_number_of_peaks                                       0.200
## histogram_mode                                                  0.014
## histogram_variance                                              0.098
## histogram_tendency                                              0.010
##                                                        uterine_contractions
## baseline.value                                                       -0.148
## accelerations                                                         0.088
## fetal_movement                                                       -0.317
## uterine_contractions                                                  1.000
## light_decelerations                                                   0.291
## abnormal_short_term_variability                                      -0.234
## mean_value_of_short_term_variability                                  0.309
## percentage_of_time_with_abnormal_long_term_variability               -0.296
## mean_value_of_long_term_variability                                  -0.067
## histogram_width                                                       0.140
## histogram_max                                                         0.112
## histogram_number_of_peaks                                             0.084
## histogram_mode                                                       -0.105
## histogram_variance                                                    0.252
## histogram_tendency                                                   -0.069
##                                                        light_decelerations
## baseline.value                                                      -0.162
## accelerations                                                       -0.098
## fetal_movement                                                      -0.018
## uterine_contractions                                                 0.291
## light_decelerations                                                  1.000
## abnormal_short_term_variability                                     -0.135
## mean_value_of_short_term_variability                                 0.586
## percentage_of_time_with_abnormal_long_term_variability              -0.314
## mean_value_of_long_term_variability                                 -0.246
## histogram_width                                                      0.546
## histogram_max                                                        0.220
## histogram_number_of_peaks                                            0.426
## histogram_mode                                                      -0.305
## histogram_variance                                                   0.676
## histogram_tendency                                                   0.032
##                                                        abnormal_short_term_variability
## baseline.value                                                                   0.303
## accelerations                                                                   -0.280
## fetal_movement                                                                   0.125
## uterine_contractions                                                            -0.234
## light_decelerations                                                             -0.135
## abnormal_short_term_variability                                                  1.000
## mean_value_of_short_term_variability                                            -0.452
## percentage_of_time_with_abnormal_long_term_variability                           0.456
## mean_value_of_long_term_variability                                             -0.333
## histogram_width                                                                 -0.261
## histogram_max                                                                   -0.125
## histogram_number_of_peaks                                                       -0.170
## histogram_mode                                                                   0.084
## histogram_variance                                                              -0.211
## histogram_tendency                                                              -0.008
##                                                        mean_value_of_short_term_variability
## baseline.value                                                                       -0.312
## accelerations                                                                         0.232
## fetal_movement                                                                        0.094
## uterine_contractions                                                                  0.309
## light_decelerations                                                                   0.586
## abnormal_short_term_variability                                                      -0.452
## mean_value_of_short_term_variability                                                  1.000
## percentage_of_time_with_abnormal_long_term_variability                               -0.565
## mean_value_of_long_term_variability                                                   0.016
## histogram_width                                                                       0.685
## histogram_max                                                                         0.401
## histogram_number_of_peaks                                                             0.522
## histogram_mode                                                                       -0.312
## histogram_variance                                                                    0.655
## histogram_tendency                                                                   -0.062
##                                                        percentage_of_time_with_abnormal_long_term_variability
## baseline.value                                                                                          0.332
## accelerations                                                                                          -0.425
## fetal_movement                                                                                         -0.032
## uterine_contractions                                                                                   -0.296
## light_decelerations                                                                                    -0.314
## abnormal_short_term_variability                                                                         0.456
## mean_value_of_short_term_variability                                                                   -0.565
## percentage_of_time_with_abnormal_long_term_variability                                                  1.000
## mean_value_of_long_term_variability                                                                    -0.130
## histogram_width                                                                                        -0.462
## histogram_max                                                                                          -0.272
## histogram_number_of_peaks                                                                              -0.289
## histogram_mode                                                                                          0.218
## histogram_variance                                                                                     -0.400
## histogram_tendency                                                                                      0.049
##                                                        mean_value_of_long_term_variability
## baseline.value                                                                      -0.046
## accelerations                                                                       -0.157
## fetal_movement                                                                      -0.029
## uterine_contractions                                                                -0.067
## light_decelerations                                                                 -0.246
## abnormal_short_term_variability                                                     -0.333
## mean_value_of_short_term_variability                                                 0.016
## percentage_of_time_with_abnormal_long_term_variability                              -0.130
## mean_value_of_long_term_variability                                                  1.000
## histogram_width                                                                      0.080
## histogram_max                                                                       -0.013
## histogram_number_of_peaks                                                            0.044
## histogram_mode                                                                       0.041
## histogram_variance                                                                  -0.159
## histogram_tendency                                                                   0.148
##                                                        histogram_width
## baseline.value                                                  -0.146
## accelerations                                                    0.299
## fetal_movement                                                   0.182
## uterine_contractions                                             0.140
## light_decelerations                                              0.546
## abnormal_short_term_variability                                 -0.261
## mean_value_of_short_term_variability                             0.685
## percentage_of_time_with_abnormal_long_term_variability          -0.462
## mean_value_of_long_term_variability                              0.080
## histogram_width                                                  1.000
## histogram_max                                                    0.689
## histogram_number_of_peaks                                        0.752
## histogram_mode                                                  -0.134
## histogram_variance                                               0.721
## histogram_tendency                                               0.117
##                                                        histogram_max
## baseline.value                                                 0.294
## accelerations                                                  0.411
## fetal_movement                                                 0.127
## uterine_contractions                                           0.112
## light_decelerations                                            0.220
## abnormal_short_term_variability                               -0.125
## mean_value_of_short_term_variability                           0.401
## percentage_of_time_with_abnormal_long_term_variability        -0.272
## mean_value_of_long_term_variability                           -0.013
## histogram_width                                                0.689
## histogram_max                                                  1.000
## histogram_number_of_peaks                                      0.524
## histogram_mode                                                 0.312
## histogram_variance                                             0.484
## histogram_tendency                                            -0.130
##                                                        histogram_number_of_peaks
## baseline.value                                                            -0.111
## accelerations                                                              0.194
## fetal_movement                                                             0.200
## uterine_contractions                                                       0.084
## light_decelerations                                                        0.426
## abnormal_short_term_variability                                           -0.170
## mean_value_of_short_term_variability                                       0.522
## percentage_of_time_with_abnormal_long_term_variability                    -0.289
## mean_value_of_long_term_variability                                        0.044
## histogram_width                                                            0.752
## histogram_max                                                              0.524
## histogram_number_of_peaks                                                  1.000
## histogram_mode                                                            -0.083
## histogram_variance                                                         0.548
## histogram_tendency                                                         0.112
##                                                        histogram_mode
## baseline.value                                                  0.765
## accelerations                                                   0.247
## fetal_movement                                                  0.014
## uterine_contractions                                           -0.105
## light_decelerations                                            -0.305
## abnormal_short_term_variability                                 0.084
## mean_value_of_short_term_variability                           -0.312
## percentage_of_time_with_abnormal_long_term_variability          0.218
## mean_value_of_long_term_variability                             0.041
## histogram_width                                                -0.134
## histogram_max                                                   0.312
## histogram_number_of_peaks                                      -0.083
## histogram_mode                                                  1.000
## histogram_variance                                             -0.191
## histogram_tendency                                              0.405
##                                                        histogram_variance
## baseline.value                                                     -0.175
## accelerations                                                       0.204
## fetal_movement                                                      0.098
## uterine_contractions                                                0.252
## light_decelerations                                                 0.676
## abnormal_short_term_variability                                    -0.211
## mean_value_of_short_term_variability                                0.655
## percentage_of_time_with_abnormal_long_term_variability             -0.400
## mean_value_of_long_term_variability                                -0.159
## histogram_width                                                     0.721
## histogram_max                                                       0.484
## histogram_number_of_peaks                                           0.548
## histogram_mode                                                     -0.191
## histogram_variance                                                  1.000
## histogram_tendency                                                 -0.006
##                                                        histogram_tendency
## baseline.value                                                      0.295
## accelerations                                                       0.030
## fetal_movement                                                      0.010
## uterine_contractions                                               -0.069
## light_decelerations                                                 0.032
## abnormal_short_term_variability                                    -0.008
## mean_value_of_short_term_variability                               -0.062
## percentage_of_time_with_abnormal_long_term_variability              0.049
## mean_value_of_long_term_variability                                 0.148
## histogram_width                                                     0.117
## histogram_max                                                      -0.130
## histogram_number_of_peaks                                           0.112
## histogram_mode                                                      0.405
## histogram_variance                                                 -0.006
## histogram_tendency                                                  1.000
# Visualisasi korelasi
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45, 
         addCoef.col = "black", number.cex = 0.2,
         tl.cex = 0.4,
         cl.cex = 0.7,
         mar = c(0, 0, 2, 0), 
         title = "Correlation Matrix After Removing Highly Correlated Columns", 
         )

SMOTE untuk Menyeimbangkan Data

# Pastikan target sebagai faktor
data_winsorized$fetal_health <- as.factor(data_winsorized$fetal_health)

# Distribusi target sebelum SMOTE
cat("Distribution Without SMOTE:\n")
## Distribution Without SMOTE:
print(table(data_winsorized$fetal_health))
## 
##    1    2    3 
## 1645  291  173
# Terapkan SMOTE dengan package UBL
data_smote <- SmoteClassif(fetal_health ~ ., data_winsorized, C.perc = "balance")

# Pisah fitur dan target
X_smote <- data_smote[, -which(names(data_smote) == "fetal_health")]
y_smote <- as.factor(data_smote$fetal_health)

# Hapus fitur sangat berkorelasi di data SMOTE
X_smote <- remove_correlated(X_smote, threshold = 0.8)
## Removing highly correlated variables: histogram_number_of_peaks
# Cek kolom konstan di data SMOTE
variances_smote <- apply(X_smote, 2, var)
zero_variance_smote_cols <- names(variances_smote[variances_smote == 0 | is.na(variances_smote)])

if (length(zero_variance_smote_cols) > 0) {
  cat("Removing constant or zero-variance columns in SMOTE data:", zero_variance_smote_cols, "\n")
  X_smote <- X_smote[, !names(X_smote) %in% zero_variance_smote_cols]
} else {
  cat("No constant or zero-variance columns detected in SMOTE data.\n")
}
## No constant or zero-variance columns detected in SMOTE data.
# Distribution of target variable after SMOTE
cat("\nDistribution With SMOTE:\n")
## 
## Distribution With SMOTE:
print(table(y_smote))
## y_smote
##   1   2   3 
## 703 703 702

Visualization: Class distribution before and after SMOTE

dist_before <- as.data.frame(table(data_winsorized$fetal_health))
dist_after <- as.data.frame(table(y_smote))
dist_before$Dataset <- "Before SMOTE"
dist_after$Dataset <- "After SMOTE"
colnames(dist_before) <- c("Class", "Count", "Dataset")
colnames(dist_after) <- c("Class", "Count", "Dataset")
dist_data <- rbind(dist_before, dist_after)

p_dist <- ggplot(dist_data, aes(x = Class, y = Count, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Class Distribution Before and After SMOTE", x = "Fetal Health Class", y = "Count") +
  theme_minimal()
print(p_dist)

## PCA

# --- Dimensionality Reduction (PCA and FA) ---
# Standardize the features, handling potential NA/NaN issues
X_scaled <- scale(numerical_features, center = TRUE, scale = TRUE)
X_scaled[is.na(X_scaled)] <- 0 # Replace NA/NaN with 0 if any remain
X_smote_scaled <- scale(X_smote, center = TRUE, scale = TRUE)
X_smote_scaled[is.na(X_smote_scaled)] <- 0 # Replace NA/NaN with 0 if any remain

# E1: PCA without SMOTE
cat("\nE1 - PCA without SMOTE (Kaiser Rule)\n")
## 
## E1 - PCA without SMOTE (Kaiser Rule)
# Jalankan PCA
pca <- prcomp(X_scaled, scale. = FALSE)

# Hitung eigenvalues dari komponen PCA
eigenvalues <- pca$sdev^2

# Gunakan Kaiser Rule: pilih komponen dengan eigenvalue > 1
num_components <- sum(eigenvalues > 1)
cat("Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule):", num_components, "\n")
## Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule): 5
# Transformasi data menggunakan komponen tersebut
X_pca <- predict(pca)[, 1:num_components]

# Tampilkan proporsi variansi yang dijelaskan
cat("\nE1 - PCA Explained Variance Ratio (Without SMOTE):\n")
## 
## E1 - PCA Explained Variance Ratio (Without SMOTE):
print(summary(pca)$importance[2, 1:num_components])
##     PC1     PC2     PC3     PC4     PC5 
## 0.30936 0.15458 0.10820 0.09122 0.08444
# Jalankan PCA pada data hasil SMOTE yang telah diskalakan
pca_smote <- prcomp(X_smote_scaled, scale. = FALSE)

# Hitung eigenvalue
eigenvalues_smote <- pca_smote$sdev^2

# Tentukan jumlah komponen dengan eigenvalue > 1 (Kaiser Rule)
num_components_smote <- sum(eigenvalues_smote > 1)
cat("Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule):", num_components_smote, "\n")
## Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule): 5
# Transformasi data ke ruang PCA
X_smote_pca <- predict(pca_smote)[, 1:num_components_smote]

# Tampilkan variansi yang dijelaskan oleh komponen-komponen terpilih
cat("\nF1 - PCA Explained Variance Ratio (With SMOTE):\n")
## 
## F1 - PCA Explained Variance Ratio (With SMOTE):
print(summary(pca_smote)$importance[2, 1:num_components_smote])
##     PC1     PC2     PC3     PC4     PC5 
## 0.38888 0.14965 0.10227 0.08428 0.07405
# E2: Factor Analysis without SMOTE
cat("\nE2 - Factor Analysis without SMOTE\n")
## 
## E2 - Factor Analysis without SMOTE
fa_model <- fa(numerical_features, nfactors = 2, rotate = "varimax")
X_fa <- fa_model$scores
cat("\nE2 - Factor Loadings (Without SMOTE):\n")
## 
## E2 - Factor Loadings (Without SMOTE):
print(fa_model$loadings)
## 
## Loadings:
##                                                        MR1    MR2   
## baseline.value                                         -0.201  0.787
## accelerations                                           0.351  0.182
## fetal_movement                                          0.118       
## uterine_contractions                                    0.276 -0.170
## light_decelerations                                     0.594 -0.203
## abnormal_short_term_variability                        -0.369  0.175
## mean_value_of_short_term_variability                    0.819 -0.242
## percentage_of_time_with_abnormal_long_term_variability -0.580  0.209
## mean_value_of_long_term_variability                                 
## histogram_width                                         0.921       
## histogram_max                                           0.654  0.461
## histogram_number_of_peaks                               0.699       
## histogram_mode                                         -0.144  0.914
## histogram_variance                                      0.794       
## histogram_tendency                                             0.311
## 
##                  MR1   MR2
## SS loadings    4.165 2.031
## Proportion Var 0.278 0.135
## Cumulative Var 0.278 0.413
# Visualization: Factor Loadings Plot
fa_loadings <- as.data.frame(fa_model$loadings[, 1:2])
fa_loadings$Variable <- rownames(fa_loadings)
fa_loadings_long <- pivot_longer(fa_loadings, cols = c("MR1", "MR2"), names_to = "Factor", values_to = "Loading")
p_fa <- ggplot(fa_loadings_long, aes(x = Variable, y = Loading, fill = Factor)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "E2 - Factor Loadings (Without SMOTE)", x = "Variable", y = "Loading") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_fa)

# Hitung eigenvalue dari matriks korelasi
eigen_vals <- eigen(cor(X_smote))$values
num_factors <- sum(eigen_vals > 1)
cat("Jumlah faktor dengan eigenvalue > 1 (Kaiser Rule):", num_factors, "\n")
## Jumlah faktor dengan eigenvalue > 1 (Kaiser Rule): 5
# Jalankan FA dengan jumlah faktor optimal
fa_model_smote <- fa(X_smote, nfactors = num_factors, rotate = "varimax")
X_smote_fa <- fa_model_smote$scores

cat("\nF2 - Factor Loadings (With SMOTE):\n")
## 
## F2 - Factor Loadings (With SMOTE):
print(fa_model_smote$loadings)
## 
## Loadings:
##                                                        MR1    MR2    MR3   
## baseline.value                                         -0.187  0.810 -0.177
## accelerations                                                         0.545
## fetal_movement                                                             
## uterine_contractions                                    0.398         0.247
## light_decelerations                                     0.746 -0.253 -0.126
## abnormal_short_term_variability                        -0.234  0.166 -0.674
## mean_value_of_short_term_variability                    0.779 -0.266  0.270
## percentage_of_time_with_abnormal_long_term_variability -0.531  0.335 -0.515
## mean_value_of_long_term_variability                    -0.122         0.317
## histogram_width                                         0.923         0.246
## histogram_max                                           0.641  0.506  0.465
## histogram_mode                                         -0.432  0.788  0.115
## histogram_variance                                      0.799 -0.196       
## histogram_tendency                                     -0.152  0.249 -0.128
##                                                        MR5    MR4   
## baseline.value                                          0.150       
## accelerations                                                       
## fetal_movement                                                -0.410
## uterine_contractions                                           0.696
## light_decelerations                                    -0.164  0.276
## abnormal_short_term_variability                        -0.323 -0.194
## mean_value_of_short_term_variability                   -0.108  0.241
## percentage_of_time_with_abnormal_long_term_variability        -0.246
## mean_value_of_long_term_variability                     0.538 -0.165
## histogram_width                                                     
## histogram_max                                          -0.311 -0.121
## histogram_mode                                          0.376       
## histogram_variance                                     -0.202  0.131
## histogram_tendency                                      0.652       
## 
##                  MR1   MR2   MR3   MR5   MR4
## SS loadings    3.822 1.918 1.612 1.170 0.960
## Proportion Var 0.273 0.137 0.115 0.084 0.069
## Cumulative Var 0.273 0.410 0.525 0.609 0.677
# Visualization: Factor Loadings Plot
fa_loadings_smote <- as.data.frame(fa_model_smote$loadings[, 1:2])
fa_loadings_smote$Variable <- rownames(fa_loadings_smote)
fa_loadings_smote_long <- pivot_longer(fa_loadings_smote, cols = c("MR1", "MR2"), names_to = "Factor", values_to = "Loading")
p_fa_smote <- ggplot(fa_loadings_smote_long, aes(x = Variable, y = Loading, fill = Factor)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "F2 - Factor Loadings (With SMOTE)", x = "Variable", y = "Loading") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_fa_smote)

library(ggplot2)
library(dplyr)
library(psych)

fa_scores <- as.data.frame(fa_model_smote$scores)
colnames(fa_scores) <- c("F1", "F2")
fa_scores$label <- y_smote

fa_loadings <- as.data.frame(fa_model_smote$loadings[, 1:2])
fa_loadings$Variable <- rownames(fa_loadings)
colnames(fa_loadings) <- c("F1_loading", "F2_loading", "Variable")

scale_factor <- 3
fa_loadings <- fa_loadings %>%
  mutate(F1s = F1_loading * scale_factor,
         F2s = F2_loading * scale_factor)

# Plot
p_fa_biplot <- ggplot(fa_scores, aes(x = F1, y = F2, color = label)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_segment(data = fa_loadings, aes(x = 0, y = 0, xend = F1s, yend = F2s),
               arrow = arrow(length = unit(0.3, "cm")), color = "black") +
  geom_text(data = fa_loadings, aes(x = F1s, y = F2s, label = Variable),
            color = "black", vjust = -0.5) +
  # Tambah correlation circle
  annotate("path",
           x = scale_factor * cos(seq(0, 2 * pi, length.out = 100)),
           y = scale_factor * sin(seq(0, 2 * pi, length.out = 100)),
           color = "blue", linetype = "dashed", alpha = 0.5) +
  labs(title = "G1 - Factor Analysis Biplot (With SMOTE)",
       x = "Factor 1", y = "Factor 2") +
  theme_minimal() +
  theme(legend.title = element_blank())

print(p_fa_biplot)

Pemisahan Dataset

# --- LDA and MLR ---
# Define datasets
# D1: PCA + FA without SMOTE
fa_pca_model <- fa(X_pca, nfactors = 2, rotate = "varimax")
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
X_pca_fa <- fa_pca_model$scores

# H1: PCA + FA with SMOTE
fa_smote_pca_model <- fa(X_smote_pca, nfactors = 2, rotate = "varimax")
X_smote_pca_fa <- fa_smote_pca_model$scores


# Prepare datasets
datasets <- list(
  A1 = X_scaled,       # Scaled original data
  B1 = X_pca,          # PCA without SMOTE
  C1 = X_fa,           # FA without SMOTE
  D1 = X_pca_fa,       # PCA + FA without SMOTE
  E1 = X_smote_scaled, # Scaled SMOTE data
  F1 = X_smote_pca,    # PCA with SMOTE
  G1 = X_smote_fa,     # FA with SMOTE
  H1 = X_smote_pca_fa  # PCA + FA with SMOTE
)

targets <- list(
  A1 = data_winsorized$fetal_health,
  B1 = data_winsorized$fetal_health,
  C1 = data_winsorized$fetal_health,
  D1 = data_winsorized$fetal_health,
  E1 = y_smote,
  F1 = y_smote,
  G1 = y_smote,
  H1 = y_smote
)

labels_lda <- list(
  A1 = "AA1",
  B1 = "BB1",
  C1 = "CC1",
  D1 = "DD1",
  E1 = "EE1",
  F1 = "FF1",
  G1 = "GG1",
  H1 = "HH1"
)

labels_mlr <- list(
  A1 = "AB1",
  B1 = "BC1",
  C1 = "CD1",
  D1 = "DE1",
  E1 = "EF1",
  F1 = "FG1",
  G1 = "GH1",
  H1 = "HI1"
)

# Initialize lists to store accuracies for comparison
lda_accuracies <- list()
mlr_accuracies <- list()

Uji asumsi, pemodelan, uji signifikansi

# Mardia's Test for Multivariate Normality
# Fungsi untuk menghitung Mardia's Skewness dan Kurtosis dengan pengecekan error
# Fungsi untuk menghitung Mardia's Skewness dan Kurtosis tanpa tryCatch
mardia_test_manual <- function(X) {
  if (!is.matrix(X)) {
    stop("Input harus berupa matriks numerik.")
  }
  if (ncol(X) < 2) {
    stop("Matriks harus memiliki minimal 2 kolom (variabel).")
  }
  if (nrow(X) <= ncol(X)) {
    stop("Jumlah observasi harus lebih banyak dari jumlah variabel.")
  }
  
  n <- nrow(X)
  p <- ncol(X)
  
  # Hitung mean dan covariance matrix
  mean_vec <- colMeans(X)
  S <- cov(X)
  S_inv <- solve(S)
  
  # Hitung deviasi setiap observasi dari mean
  dev <- sweep(X, 2, mean_vec)
  
  # Hitung Mahalanobis distances untuk setiap observasi
  D <- apply(dev, 1, function(x) t(x) %*% S_inv %*% x)
  
  # Mardia's multivariate skewness
  skewness <- (1 / (n^2)) * sum(sapply(1:n, function(i) {
    sapply(1:n, function(j) {
      t(dev[i,]) %*% S_inv %*% dev[j,] %*% t(dev[j,]) %*% S_inv %*% dev[i,]
    }) %>% sum()
  }))
  
  # Alternatif perhitungan lebih efisien:
  # skewness <- (1 / n^2) * sum((t(dev) %*% S_inv %*% dev)^3)
  # Namun, untuk kejelasan, kita gunakan loop
  
  # Mardia's multivariate kurtosis
  kurtosis <- mean(D^2)
  
  # Statistik uji dan p-value untuk skewness (approximate)
  k <- p * (p + 1) * (p + 2) / 6
  skew_stat <- n * skewness / 6
  p_value_skew <- 1 - pchisq(skew_stat, df = k)
  
  # Statistik uji dan p-value untuk kurtosis (approximate)
  expected_kurtosis <- p * (p + 2)
  z_kurtosis <- (kurtosis - expected_kurtosis) / sqrt(8 * p * (p + 2) / n)
  p_value_kurt <- 2 * (1 - pnorm(abs(z_kurtosis)))
  
  # Return hasil
  list(
    skewness = skewness,
    skewness_stat = skew_stat,
    p_value_skewness = p_value_skew,
    kurtosis = kurtosis,
    kurtosis_z = z_kurtosis,
    p_value_kurtosis = p_value_kurt
  )
}

# Inisialisasi list untuk menyimpan akurasi LDA
lda_accuracies <- list()

# Loop analisis untuk setiap dataset
for (key in names(datasets)) {
  X_data <- datasets[[key]]
  y_data <- targets[[key]]
  label <- labels_lda[[key]]
  
  X_data <- as.matrix(X_data)
  
  # Cek jumlah observasi dan kelas
  if (nrow(X_data) < length(unique(y_data)) || any(table(y_data) < 2)) {
    cat(sprintf("\n%s - LDA Skipped: Insufficient observations per class\n", label))
    next
  }
  
  # Mardia's Test
  cat(sprintf("\n%s - Mardia's Test (Skewness, p-value, Kurtosis):\n", label))
  mardia_res <- tryCatch(mardia_test_manual(X_data), error = function(e) {
    cat("Mardia's test gagal: ", e$message, "\n")
    return(list(skewness = NA, p_value_skewness = NA, kurtosis = NA))
  })
  
  if (any(is.na(unlist(mardia_res)))) {
    cat("Mardia's test gagal dijalankan (data mungkin tidak memenuhi syarat).\n")
  } else {
    cat(sprintf("Skewness: %.4f\n", mardia_res$skewness))
    cat(sprintf("P-value Skewness: %.4f\n", mardia_res$p_value_skewness))
    cat(sprintf("Kurtosis: %.4f\n", mardia_res$kurtosis))
    if (mardia_res$p_value_skewness < 0.05) {
      cat("Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).\n")
    } else {
      cat("Kesimpulan: Data mengikuti distribusi normal multivariat (tidak signifikan).\n")
    }
  }
  
  # Box's M Test untuk kesamaan kovarians antar grup
  boxm_result <- tryCatch(biotools::boxM(X_data, y_data), error = function(e) {
    cat(sprintf("\n%s - Box's M Test Failed: %s\n", label, e$message))
    return(NULL)
  })
  if (!is.null(boxm_result)) {
    cat(sprintf("\n%s - Box's M Test:\n", label))
    print(boxm_result)
  }
  
  # Model LDA
  lda_model <- tryCatch(MASS::lda(X_data, grouping = y_data, tol = 1e-6), error = function(e) {
    cat(sprintf("\n%s - LDA Failed: %s\n", label, e$message))
    return(NULL)
  })
  
  if (!is.null(lda_model)) {
    # Wilks' Lambda (aproksimasi)
    svd_vals <- lda_model$svd^2
    wilks_lambda <- prod(1 / (1 + svd_vals))
    cat(sprintf("\n%s - Wilks' Lambda (Approximated):\n", label))
    print(wilks_lambda)
    
    # Koefisien diskriminan
    cat(sprintf("\n%s - LDA Discriminant Coefficients:\n", label))
    print(lda_model$scaling)
    
    # Uji signifikansi variabel dengan ANOVA
    cat(sprintf("\n%s - Significance Test for Variables (ANOVA):\n", label))
    anova_data <- as.data.frame(X_data)
    anova_data$Group <- y_data
    anova_results <- lapply(colnames(X_data), function(var) {
      formula <- as.formula(paste(var, "~ Group"))
      anova_model <- tryCatch(aov(formula, data = anova_data), error = function(e) NULL)
      if (!is.null(anova_model)) {
        summary_anova <- summary(anova_model)
        p_value <- summary_anova[[1]]$`Pr(>F)`[1]
        f_value <- summary_anova[[1]]$`F value`[1]
        return(data.frame(Variable = var, F_Value = f_value, P_Value = p_value))
      } else {
        return(data.frame(Variable = var, F_Value = NA, P_Value = NA))
      }
    })
    anova_results_df <- do.call(rbind, anova_results)
    print(anova_results_df)
    
    # Visualisasi koefisien diskriminan yang distandarisasi
    std_coefs <- lda_model$scaling
    var_sds <- apply(X_data, 2, sd)
    std_coefs <- sweep(std_coefs, 1, var_sds, "*")
    coefs_df <- as.data.frame(std_coefs)
    coefs_df$Variable <- rownames(coefs_df)
    coefs_df <- tidyr::pivot_longer(coefs_df, cols = starts_with("LD"), names_to = "Discriminant", values_to = "Coefficient")
    coefs_df <- merge(coefs_df, anova_results_df[, c("Variable", "P_Value")], by = "Variable", all.x = TRUE)
    coefs_df$Significance <- ifelse(coefs_df$P_Value < 0.05, "*", "")
    
    p_lda <- ggplot(coefs_df, aes(x = Variable, y = Coefficient, fill = Discriminant)) +
      geom_bar(stat = "identity", position = "dodge") +
      geom_text(aes(label = Significance), vjust = -0.5, position = position_dodge(width = 0.45)) +
      labs(title = paste("Standardized LDA Coefficients (", label, ")"), x = "Variable", y = "Standardized Coefficient") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    print(p_lda)
    
    # Evaluasi LDA dengan confusion matrix dan akurasi
    y_pred <- predict(lda_model, X_data)$class
    cm <- table(y_data, y_pred)
    cat(sprintf("\n%s - Confusion Matrix:\n", label))
    print(cm)
    
    cm_data <- as.data.frame.matrix(cm)
    cm_data$Actual <- rownames(cm_data)
    cm_data <- tidyr::pivot_longer(cm_data, cols = colnames(cm), names_to = "Predicted", values_to = "Count")
    p_cm <- ggplot(cm_data, aes(x = Predicted, y = Actual, fill = Count)) +
      geom_tile() +
      geom_text(aes(label = Count), color = "white") +
      scale_fill_gradient(low = "#f7f7f7", high = "#4e79a7") +
      labs(title = paste("Confusion Matrix for", label, "(LDA)"), x = "Predicted", y = "Actual") +
      theme_minimal()
    print(p_cm)
    
    acc <- mean(y_data == y_pred)
    cat(sprintf("\n%s - Accuracy:\n", label))
    print(acc)
    
    lda_accuracies[[label]] <- acc
  }
}
## 
## AA1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 14.9858
## P-value Skewness: 0.0000
## Kurtosis: 297.0291
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## AA1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 3493.5, df = 240, p-value < 2.2e-16
## 
## 
## AA1 - Wilks' Lambda (Approximated):
## [1] 2.364257e-06
## 
## AA1 - LDA Discriminant Coefficients:
##                                                                 LD1        LD2
## baseline.value                                          0.701426821  0.4162142
## accelerations                                          -0.416189367  0.2419189
## fetal_movement                                         -0.006382276 -0.1917747
## uterine_contractions                                   -0.257259460  0.0582607
## light_decelerations                                    -0.507157913 -0.2617604
## abnormal_short_term_variability                         0.502898143  0.1288263
## mean_value_of_short_term_variability                    0.094254326  0.2534899
## percentage_of_time_with_abnormal_long_term_variability  0.401801596 -0.5154852
## mean_value_of_long_term_variability                    -0.226844625 -0.2535886
## histogram_width                                        -0.293525982  0.2534092
## histogram_max                                           0.273431666 -0.4216371
## histogram_number_of_peaks                               0.052213461 -0.1150758
## histogram_mode                                         -1.013979563 -0.8182263
## histogram_variance                                      0.771339899  0.3846071
## histogram_tendency                                      0.069021985 -0.1346064
## 
## AA1 - Significance Test for Variables (ANOVA):
##                                                  Variable    F_Value
## 1                                          baseline.value 137.603512
## 2                                           accelerations 195.971542
## 3                                          fetal_movement  10.066204
## 4                                    uterine_contractions  92.535685
## 5                                     light_decelerations  62.601111
## 6                         abnormal_short_term_variability 334.558940
## 7                    mean_value_of_short_term_variability 153.668687
## 8  percentage_of_time_with_abnormal_long_term_variability 389.185482
## 9                     mean_value_of_long_term_variability  79.350509
## 10                                        histogram_width  53.943891
## 11                                          histogram_max   2.256021
## 12                              histogram_number_of_peaks  12.087430
## 13                                         histogram_mode 214.056236
## 14                                     histogram_variance 100.580770
## 15                                     histogram_tendency  44.820861
##          P_Value
## 1   6.827411e-57
## 2   8.856190e-79
## 3   4.457249e-05
## 4   3.026880e-39
## 5   3.891911e-27
## 6  6.703609e-127
## 7   5.068339e-63
## 8  1.471757e-144
## 9   5.959481e-34
## 10  1.421706e-23
## 11  1.050197e-01
## 12  6.031104e-06
## 13  2.360489e-85
## 14  1.907755e-42
## 15  8.658192e-20

## 
## AA1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1529   95   21
##      2   88  195    8
##      3   18   56   99

## 
## AA1 - Accuracy:
## [1] 0.8643907
## 
## BB1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 32.5453
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## BB1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 706.61, df = 30, p-value < 2.2e-16
## 
## 
## BB1 - Wilks' Lambda (Approximated):
## [1] 6.242841e-06
## 
## BB1 - LDA Discriminant Coefficients:
##            LD1        LD2
## PC1 0.27046361 -0.3008567
## PC2 0.10859520 -0.4557624
## PC3 0.84311202  0.4072280
## PC4 0.08814101 -0.1067632
## PC5 0.18607476 -0.2206565
## 
## BB1 - Significance Test for Variables (ANOVA):
##   Variable    F_Value       P_Value
## 1      PC1 179.958069  7.055170e-73
## 2      PC2  95.742432  1.594449e-40
## 3      PC3 475.030478 5.321832e-171
## 4      PC4   5.250309  5.314781e-03
## 5      PC5  21.533755  5.525210e-10

## 
## BB1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1552   41   52
##      2  114  173    4
##      3   40   54   79

## 
## BB1 - Accuracy:
## [1] 0.8553817
## 
## CC1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 7.1346
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## CC1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 226.36, df = 6, p-value < 2.2e-16
## 
## 
## CC1 - Wilks' Lambda (Approximated):
## [1] 0.0001647984
## 
## CC1 - LDA Discriminant Coefficients:
##            LD1       LD2
## MR1  0.7786038 0.7611211
## MR2 -0.8629648 0.6765822
## 
## CC1 - Significance Test for Variables (ANOVA):
##   Variable  F_Value      P_Value
## 1      MR1 117.0307 6.388077e-49
## 2      MR2 144.2239 1.987955e-59

## 
## CC1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1626   19    0
##      2  252   39    0
##      3  157   16    0

## 
## CC1 - Accuracy:
## [1] 0.7894737
## 
## DD1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 7.7626
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## DD1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 93.262, df = 6, p-value < 2.2e-16
## 
## 
## DD1 - LDA Failed: variables 1 2 appear to be constant within groups
## 
## EE1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 13.9867
## P-value Skewness: 0.0000
## Kurtosis: 258.0314
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## EE1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 8584.6, df = 210, p-value < 2.2e-16
## 
## 
## EE1 - Wilks' Lambda (Approximated):
## [1] 5.766711e-07
## 
## EE1 - LDA Discriminant Coefficients:
##                                                                LD1          LD2
## baseline.value                                          0.43111518  0.001588134
## accelerations                                          -0.66080101  0.527655774
## fetal_movement                                         -0.05063304 -0.145927114
## uterine_contractions                                   -0.25627467 -0.020740375
## light_decelerations                                    -0.46759642  0.021077773
## abnormal_short_term_variability                         0.77571867  0.061172600
## mean_value_of_short_term_variability                    0.45442796  0.405646249
## percentage_of_time_with_abnormal_long_term_variability  0.32422327 -0.404875593
## mean_value_of_long_term_variability                    -0.31085939 -0.163255805
## histogram_width                                        -0.26402935 -0.035580897
## histogram_max                                           0.20068766 -0.480803248
## histogram_mode                                         -0.88362263 -0.481210420
## histogram_variance                                      0.77604189  0.090063147
## histogram_tendency                                      0.01862363 -0.104061863
## 
## EE1 - Significance Test for Variables (ANOVA):
##                                                  Variable    F_Value
## 1                                          baseline.value 304.228800
## 2                                           accelerations 551.961113
## 3                                          fetal_movement  22.183989
## 4                                    uterine_contractions 124.711906
## 5                                     light_decelerations 194.941513
## 6                         abnormal_short_term_variability 561.352273
## 7                    mean_value_of_short_term_variability 274.366100
## 8  percentage_of_time_with_abnormal_long_term_variability 425.063606
## 9                     mean_value_of_long_term_variability 321.388461
## 10                                        histogram_width  93.642562
## 11                                          histogram_max   2.452833
## 12                                         histogram_mode 639.101591
## 13                                     histogram_variance 271.488996
## 14                                     histogram_tendency 158.487670
##          P_Value
## 1  8.706348e-117
## 2  1.904524e-193
## 3   2.922438e-10
## 4   6.520038e-52
## 5   2.125750e-78
## 6  4.093910e-196
## 7  1.297908e-106
## 8  8.737857e-156
## 9  1.566161e-122
## 10  1.096720e-39
## 11  8.629535e-02
## 12 1.273593e-217
## 13 1.274852e-105
## 14  7.659638e-65

## 
## EE1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 586  93  24
##      2  43 636  24
##      3   0 167 535

## 
## EE1 - Accuracy:
## [1] 0.8334915
## 
## FF1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 34.4231
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## FF1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 2117.2, df = 30, p-value < 2.2e-16
## 
## 
## FF1 - Wilks' Lambda (Approximated):
## [1] 1.052511e-06
## 
## FF1 - LDA Discriminant Coefficients:
##            LD1         LD2
## PC1 -0.0115421  0.44923823
## PC2  0.8763763  0.28876662
## PC3  0.6843459 -0.39018593
## PC4  0.1266701 -0.01265415
## PC5 -0.3615343  0.31421408
## 
## FF1 - Significance Test for Variables (ANOVA):
##   Variable   F_Value       P_Value
## 1      PC1 364.90299 8.750254e-137
## 2      PC2 795.97192 3.700256e-258
## 3      PC3 285.59847 1.819984e-110
## 4      PC4   4.89484  7.570521e-03
## 5      PC5  63.36136  1.901023e-27

## 
## FF1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 540  88  75
##      2  56 605  42
##      3  18 159 525

## 
## FF1 - Accuracy:
## [1] 0.7922201
## 
## GG1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 33.2906
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## GG1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 1604.4, df = 30, p-value < 2.2e-16
## 
## 
## GG1 - Wilks' Lambda (Approximated):
## [1] 1.356584e-06
## 
## GG1 - LDA Discriminant Coefficients:
##            LD1        LD2
## MR1  0.4499719  0.4740041
## MR2 -0.1359885 -0.9574111
## MR3 -1.2109584  0.4616812
## MR5 -0.8507201 -0.2687832
## MR4 -0.3372380  0.5033044
## 
## GG1 - Significance Test for Variables (ANOVA):
##   Variable   F_Value       P_Value
## 1      MR1 120.10987  4.024017e-50
## 2      MR2 283.01971 1.386185e-109
## 3      MR3 511.99030 6.476115e-182
## 4      MR5 225.16611  2.423701e-89
## 5      MR4  92.78798  2.404584e-39

## 
## GG1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 555  96  52
##      2  92 565  46
##      3  37 166 499

## 
## GG1 - Accuracy:
## [1] 0.7680266
## 
## HH1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 8.7146
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
## 
## HH1 - Box's M Test:
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X_data
## Chi-Sq (approx.) = 687.39, df = 6, p-value < 2.2e-16
## 
## 
## HH1 - LDA Failed: variables 1 2 appear to be constant within groups
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
# MLR Model
for (key in names(datasets)) {
  X_data <- as.data.frame(datasets[[key]])
  y_data <- targets[[key]]
  label <- labels_mlr[[key]]
  
  cat(sprintf("\n========== %s ==========\n", label))
  
  # Hitung VIF dengan membuat model linier dummy
  temp_data <- X_data
  temp_data$dummy_y <- rnorm(nrow(temp_data))  # dummy target
  vif_model <- lm(dummy_y ~ ., data = temp_data)
  vif_values <- vif(vif_model)
  cat(">>> VIF (Variance Inflation Factor):\n")
  print(round(vif_values, 3))
  
  # Cek cukupnya data
  if (nrow(X_data) < length(unique(y_data)) || any(table(y_data) < 2)) {
    cat("Skipped: Tidak cukup data untuk semua kelas.\n")
    next
  }
  
  data_mlr <- cbind(X_data, y = y_data)
  
  # MLR Model
  mlr_model <- tryCatch(nnet::multinom(y ~ ., data = data_mlr, trace = FALSE, maxit = 1000), error = function(e) {
    cat(sprintf("\n%s - MLR Failed: %s\n", label, e$message))
    return(NULL)
  })
  
  if (!is.null(mlr_model)) {
    # Likelihood Ratio Test
    null_model <- tryCatch(nnet::multinom(y ~ 1, data = data_mlr, trace = FALSE, maxit = 1000), error = function(e) NULL)
    if (!is.null(null_model)) {
      loglik_full <- logLik(mlr_model)
      loglik_null <- logLik(null_model)
      lrt_stat <- 2 * (loglik_full - loglik_null)
      df <- length(coef(mlr_model)) - length(coef(null_model))
      lrt_pvalue <- pchisq(lrt_stat, df, lower.tail = FALSE)
      
      cat(sprintf("\n%s - Likelihood Ratio Test (Manual Calculation):\n", label))
      cat(sprintf("LRT Statistic: %.3f, df: %d, p-value: %.3e\n", lrt_stat, df, lrt_pvalue))
    } else {
      cat(sprintf("\n%s - Likelihood Ratio Test Skipped: Null model failed\n", label))
    }
    
    # Wald Test
    cat(sprintf("\n%s - Wald Test (Coefficients and p-values):\n", label))
    print(summary(mlr_model))
    
    # Confusion Matrix
    y_pred <- predict(mlr_model, X_data, type = "class")
    cm <- table(y_data, y_pred)
    cat(sprintf("\n%s - Confusion Matrix:\n", label))
    print(cm)
    
    cm_data <- as.data.frame.matrix(cm)
    cm_data$Actual <- rownames(cm_data)
    cm_data <- tidyr::pivot_longer(cm_data, cols = c("1", "2", "3"), names_to = "Predicted", values_to = "Count")
    p_cm_mlr <- ggplot(cm_data, aes(x = Predicted, y = Actual, fill = Count)) +
      geom_tile() +
      geom_text(aes(label = Count), color = "white") +
      scale_fill_gradient(low = "#f7f7f7", high = "#f28e2b") +
      labs(title = paste("Confusion Matrix for", label, "(MLR)"), x = "Predicted", y = "Actual") +
      theme_minimal()
    print(p_cm_mlr)
    
    # Odds Ratios
    odds_ratios <- exp(coef(mlr_model))
    cat(sprintf("\n%s - Odds Ratios:\n", label))
    print(odds_ratios)
    
    acc <- mean(y_data == y_pred)
    cat(sprintf("\n%s - Accuracy:\n", label))
    print(acc)
    
    mlr_accuracies[[label]] <- acc
  }
}
## 
## ========== AB1 ==========
## >>> VIF (Variance Inflation Factor):
##                                         baseline.value 
##                                                  4.063 
##                                          accelerations 
##                                                  2.567 
##                                         fetal_movement 
##                                                  1.269 
##                                   uterine_contractions 
##                                                  1.395 
##                                    light_decelerations 
##                                                  3.045 
##                        abnormal_short_term_variability 
##                                                  1.859 
##                   mean_value_of_short_term_variability 
##                                                  3.117 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                  2.074 
##                    mean_value_of_long_term_variability 
##                                                  1.851 
##                                        histogram_width 
##                                                  9.006 
##                                          histogram_max 
##                                                  6.991 
##                              histogram_number_of_peaks 
##                                                  2.386 
##                                         histogram_mode 
##                                                  5.100 
##                                     histogram_variance 
##                                                  3.002 
##                                     histogram_tendency 
##                                                  2.714 
## 
## AB1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 1755.499, df: 30, p-value: 0.000e+00
## 
## AB1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2   -4.535360     -0.1546641     -3.327839     0.38686473           -0.5213949
## 3   -9.027453      1.3305182     -5.608926     0.04413364           -0.9672625
##   light_decelerations abnormal_short_term_variability
## 2          -0.9960602                       0.8520202
## 3          -1.1044701                       2.4437219
##   mean_value_of_short_term_variability
## 2                          -0.91154625
## 3                          -0.05280065
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                              0.4828328
## 3                                              0.2118075
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                         -0.01137033      -0.7768076      1.286825
## 3                         -0.88491386      -0.7132121      1.036181
##   histogram_number_of_peaks histogram_mode histogram_variance
## 2                 0.2994720      0.3915495           1.454514
## 3                -0.1510739     -2.8150510           2.266049
##   histogram_tendency
## 2          0.2791977
## 3          0.3425407
## 
## Std. Errors:
##   (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2   0.2996099      0.2577133     0.3854525      0.1104507            0.1212088
## 3   0.8989512      0.3122373     0.9893373      0.1649483            0.1864061
##   light_decelerations abnormal_short_term_variability
## 2           0.2717582                       0.1709663
## 3           0.2648024                       0.2864991
##   mean_value_of_short_term_variability
## 2                            0.2650570
## 3                            0.2981736
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                              0.1163989
## 3                                              0.2258749
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                           0.1944055       0.2667072     0.2709873
## 3                           0.3275101       0.4426077     0.3714317
##   histogram_number_of_peaks histogram_mode histogram_variance
## 2                 0.1499308      0.3339761          0.2505191
## 3                 0.2538637      0.3538485          0.2681127
##   histogram_tendency
## 2          0.1507782
## 3          0.2053751
## 
## Residual Deviance: 1079.949 
## AIC: 1143.949 
## 
## AB1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1575   53   17
##      2   83  193   15
##      3   20   35  118

## 
## AB1 - Odds Ratios:
##    (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 0.0107230482      0.8567029   0.035870535       1.472357            0.5936919
## 3 0.0001200679      3.7830032   0.003665004       1.045122            0.3801222
##   light_decelerations abnormal_short_term_variability
## 2           0.3693317                        2.344378
## 3           0.3313864                       11.515822
##   mean_value_of_short_term_variability
## 2                            0.4019023
## 3                            0.9485691
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                               1.620659
## 3                                               1.235910
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                           0.9886941       0.4598717      3.621272
## 3                           0.4127497       0.4900675      2.818432
##   histogram_number_of_peaks histogram_mode histogram_variance
## 2                 1.3491462     1.47927114           4.282403
## 3                 0.8597841     0.05990166           9.641231
##   histogram_tendency
## 2           1.322069
## 3           1.408522
## 
## AB1 - Accuracy:
## [1] 0.8942627
## 
## ========== BC1 ==========
## >>> VIF (Variance Inflation Factor):
## PC1 PC2 PC3 PC4 PC5 
##   1   1   1   1   1 
## 
## BC1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 1305.404, df: 10, p-value: 2.609e-274
## 
## BC1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept)       PC1       PC2      PC3        PC4       PC5
## 2   -3.008824 0.6864185  0.675013 1.027433 0.24137535 0.5220185
## 3   -4.687199 0.2899734 -0.308525 2.384946 0.05931237 0.3202953
## 
## Std. Errors:
##   (Intercept)        PC1        PC2        PC3        PC4        PC5
## 2   0.1376679 0.04900339 0.06917199 0.09464289 0.07002781 0.08670944
## 3   0.2830351 0.04617997 0.08537680 0.16078922 0.10040872 0.12508875
## 
## Residual Deviance: 1530.045 
## AIC: 1554.045 
## 
## BC1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1570   34   41
##      2  120  161   10
##      3   50   41   82

## 
## BC1 - Odds Ratios:
##   (Intercept)      PC1       PC2       PC3      PC4      PC5
## 2 0.049349661 1.986588 1.9640586  2.793885 1.272999 1.685426
## 3 0.009212457 1.336392 0.7345296 10.858480 1.061107 1.377534
## 
## BC1 - Accuracy:
## [1] 0.8596491
## 
## ========== CD1 ==========
## >>> VIF (Variance Inflation Factor):
##   MR1   MR2 
## 1.001 1.001 
## 
## CD1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 578.244, df: 4, p-value: 7.915e-124
## 
## CD1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept)        MR1        MR2
## 2   -2.730659 -1.4724532  1.2206809
## 3   -2.678121  0.1397596 -0.9330897
## 
## Std. Errors:
##   (Intercept)        MR1        MR2
## 2   0.1219215 0.10288603 0.09833167
## 3   0.1087513 0.08745025 0.09424308
## 
## Residual Deviance: 2257.204 
## AIC: 2269.204 
## 
## CD1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1607   38    0
##      2  206   85    0
##      3  150   23    0

## 
## CD1 - Odds Ratios:
##   (Intercept)       MR1       MR2
## 2  0.06517632 0.2293621 3.3894947
## 3  0.06869213 1.1499973 0.3933366
## 
## CD1 - Accuracy:
## [1] 0.802276
## 
## ========== DE1 ==========
## >>> VIF (Variance Inflation Factor):
##   MR1   MR2 
## 1.134 1.134 
## 
## DE1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 0.000, df: 4, p-value: 1.000e+00
## 
## DE1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept)          MR1          MR2
## 2   -1.732172 4.225877e-06 2.641869e-05
## 3   -2.252204 1.486263e-05 1.722302e-05
## 
## Std. Errors:
##   (Intercept)          MR1          MR2
## 2  0.06359502 0.000000e+00 1.099654e-21
## 3  0.07992653 2.597782e-21 1.770602e-21
## 
## Residual Deviance: 2835.448 
## AIC: 2847.448 
## 
## DE1 - Confusion Matrix:
##       y_pred
## y_data    1    2    3
##      1 1645    0    0
##      2  291    0    0
##      3  173    0    0

## 
## DE1 - Odds Ratios:
##   (Intercept)      MR1      MR2
## 2   0.1768998 1.000004 1.000026
## 3   0.1051672 1.000015 1.000017
## 
## DE1 - Accuracy:
## [1] 0.7799905
## 
## ========== EF1 ==========
## >>> VIF (Variance Inflation Factor):
##                                         baseline.value 
##                                                  4.233 
##                                          accelerations 
##                                                  2.054 
##                                         fetal_movement 
##                                                  1.183 
##                                   uterine_contractions 
##                                                  1.935 
##                                    light_decelerations 
##                                                  3.006 
##                        abnormal_short_term_variability 
##                                                  2.184 
##                   mean_value_of_short_term_variability 
##                                                  4.988 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                  3.189 
##                    mean_value_of_long_term_variability 
##                                                  2.071 
##                                        histogram_width 
##                                                  9.528 
##                                          histogram_max 
##                                                  6.759 
##                                         histogram_mode 
##                                                  7.002 
##                                     histogram_variance 
##                                                  3.530 
##                                     histogram_tendency 
##                                                  2.925 
## 
## EF1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 3088.537, df: 28, p-value: 0.000e+00
## 
## EF1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2  -0.2976572      0.1076334     -2.719055     0.22304836           -0.5192612
## 3  -1.8311926      1.5362856     -5.239806    -0.09119814           -1.2979018
##   light_decelerations abnormal_short_term_variability
## 2           -1.113091                       0.8353494
## 3           -2.138346                       2.6489640
##   mean_value_of_short_term_variability
## 2                           -1.3311877
## 3                            0.9814443
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                              0.5779501
## 3                                              0.1791453
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                          0.02941503      -0.7429038      1.522455
## 3                         -1.34000521      -1.0186417      1.074816
##   histogram_mode histogram_variance histogram_tendency
## 2     -0.1309325           2.296594          0.2873952
## 3     -3.2242140           3.396115          0.1205096
## 
## Std. Errors:
##   (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2   0.1755196      0.2353804     0.2574797      0.1064790            0.1364606
## 3   0.3274820      0.2944149     0.7099174      0.1322297            0.1833212
##   light_decelerations abnormal_short_term_variability
## 2           0.2523423                       0.1741852
## 3           0.2950798                       0.2252022
##   mean_value_of_short_term_variability
## 2                            0.3285242
## 3                            0.3285351
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                              0.1380616
## 3                                              0.2154641
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                           0.1773035       0.2999940     0.2946158
## 3                           0.2304425       0.3863953     0.3769490
##   histogram_mode histogram_variance histogram_tendency
## 2      0.3563585          0.2720778          0.1803078
## 3      0.4273933          0.3078605          0.2156736
## 
## Residual Deviance: 1543.211 
## AIC: 1603.211 
## 
## EF1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 611  74  18
##      2  39 601  63
##      3   9  93 600

## 
## EF1 - Odds Ratios:
##   (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2   0.7425559       1.113639   0.065937064      1.2498810            0.5949599
## 3   0.1602224       4.647296   0.005301285      0.9128368            0.2731042
##   light_decelerations abnormal_short_term_variability
## 2           0.3285419                         2.30562
## 3           0.1178496                        14.13938
##   mean_value_of_short_term_variability
## 2                            0.2641633
## 3                            2.6683072
##   percentage_of_time_with_abnormal_long_term_variability
## 2                                               1.782381
## 3                                               1.196195
##   mean_value_of_long_term_variability histogram_width histogram_max
## 2                           1.0298519       0.4757305      4.583463
## 3                           0.2618443       0.3610851      2.929453
##   histogram_mode histogram_variance histogram_tendency
## 2     0.87727702           9.940271           1.332951
## 3     0.03978704          29.847928           1.128072
## 
## EF1 - Accuracy:
## [1] 0.8595825
## 
## ========== FG1 ==========
## >>> VIF (Variance Inflation Factor):
## PC1 PC2 PC3 PC4 PC5 
##   1   1   1   1   1 
## 
## FG1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 2513.253, df: 10, p-value: 0.000e+00
## 
## FG1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept)        PC1       PC2      PC3         PC4        PC5
## 2  -0.0396679 -0.7903702 0.5405675 1.281506 -0.03577445 -0.9398919
## 3  -0.8478772 -0.1596825 2.5236663 1.881434  0.25415190 -1.4049281
## 
## Std. Errors:
##   (Intercept)        PC1        PC2        PC3        PC4        PC5
## 2  0.09978608 0.05074461 0.07356033 0.08803727 0.07019422 0.09143347
## 3  0.14135923 0.04418639 0.12317339 0.11631403 0.08519247 0.13834163
## 
## Residual Deviance: 2118.495 
## AIC: 2142.495 
## 
## FG1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 557  87  59
##      2  67 548  88
##      3  38 101 563

## 
## FG1 - Odds Ratios:
##   (Intercept)       PC1       PC2      PC3       PC4       PC5
## 2   0.9611086 0.4536768  1.716981 3.602060 0.9648579 0.3906700
## 3   0.4283232 0.8524144 12.474247 6.562912 1.2893676 0.2453847
## 
## FG1 - Accuracy:
## [1] 0.7912713
## 
## ========== GH1 ==========
## >>> VIF (Variance Inflation Factor):
##   MR1   MR2   MR3   MR5   MR4 
## 1.008 1.018 1.019 1.007 1.008 
## 
## GH1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 2302.203, df: 10, p-value: 0.000e+00
## 
## GH1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##   (Intercept)        MR1        MR2       MR3         MR5        MR4
## 2 -0.08495295 -0.7069997  1.3848634 -1.539444 -0.08338301 -1.2436374
## 3 -0.18678609  0.9869159 -0.2080233 -2.981898 -1.87820724 -0.9567872
## 
## Std. Errors:
##   (Intercept)       MR1        MR2       MR3       MR5       MR4
## 2   0.1080107 0.1082165 0.09592049 0.1008310 0.1103074 0.0952941
## 3   0.1073241 0.1051512 0.10271432 0.1475438 0.1261135 0.1028907
## 
## Residual Deviance: 2329.546 
## AIC: 2353.546 
## 
## GH1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 558  91  54
##      2  71 542  90
##      3  53 119 530

## 
## GH1 - Odds Ratios:
##   (Intercept)       MR1       MR2        MR3       MR5       MR4
## 2   0.9185555 0.4931215 3.9942803 0.21450029 0.9199987 0.2883335
## 3   0.8296212 2.6829473 0.8121881 0.05069652 0.1528639 0.3841250
## 
## GH1 - Accuracy:
## [1] 0.7732448
## 
## ========== HI1 ==========
## >>> VIF (Variance Inflation Factor):
##   MR1   MR2 
## 1.016 1.016 
## 
## HI1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 0.000, df: 4, p-value: 1.000e+00
## 
## HI1 - Wald Test (Coefficients and p-values):
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE, 
##     maxit = 1000)
## 
## Coefficients:
##     (Intercept)           MR1           MR2
## 2 -1.684779e-08 -5.866299e-06 -5.866196e-06
## 3 -1.423518e-03 -1.414946e-05  2.756714e-06
## 
## Std. Errors:
##   (Intercept)          MR1 MR2
## 2  0.05333807 0.000000e+00 NaN
## 3  0.05335707 2.068111e-22 NaN
## 
## Residual Deviance: 4631.748 
## AIC: 4643.748 
## 
## HI1 - Confusion Matrix:
##       y_pred
## y_data   1   2   3
##      1 355 348   0
##      2 352 351   0
##      3 347 355   0

## 
## HI1 - Odds Ratios:
##   (Intercept)       MR1       MR2
## 2   1.0000000 0.9999941 0.9999941
## 3   0.9985775 0.9999859 1.0000028
## 
## HI1 - Accuracy:
## [1] 0.3349146
# Visualization: Model Accuracy Comparison
accuracy_data <- data.frame(
  Model = c(names(lda_accuracies), names(mlr_accuracies)),
  Accuracy = c(unlist(lda_accuracies), unlist(mlr_accuracies)),
  Method = c(rep("LDA", length(lda_accuracies)), rep("MLR", length(mlr_accuracies)))
)

# Create bar chart for accuracy comparison
p_accuracy <- ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Method)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("LDA" = "#4e79a7", "MLR" = "#f28e2b")) +
  labs(title = "Model Accuracy Comparison (LDA vs MLR)", x = "Model", y = "Accuracy") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_accuracy)

print(accuracy_data)
##     Model  Accuracy Method
## AA1   AA1 0.8643907    LDA
## BB1   BB1 0.8553817    LDA
## CC1   CC1 0.7894737    LDA
## EE1   EE1 0.8334915    LDA
## FF1   FF1 0.7922201    LDA
## GG1   GG1 0.7680266    LDA
## AB1   AB1 0.8942627    MLR
## BC1   BC1 0.8596491    MLR
## CD1   CD1 0.8022760    MLR
## DE1   DE1 0.7799905    MLR
## EF1   EF1 0.8595825    MLR
## FG1   FG1 0.7912713    MLR
## GH1   GH1 0.7732448    MLR
## HI1   HI1 0.3349146    MLR
#