Kelompok 3
Nafila Hanum Al Hasaniy
Sheira En Nadia
Dafanov Dixie Einkinderen
# Set working directory
setwd("D:/analisis multivariat_smt4")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(UBL)
## Warning: package 'UBL' was built under R version 4.4.3
## Loading required package: MBA
## Warning: package 'MBA' was built under R version 4.4.3
## Loading required package: gstat
## Warning: package 'gstat' was built under R version 4.4.3
## Loading required package: automap
## Warning: package 'automap' was built under R version 4.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.4.3
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:UBL':
##
## phi
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(mvnormtest)
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
library(ggplot2)
library(tidyr)
# Set seed for reproducibility
set.seed(42)
data <- read.csv("fetal_health.csv")
summary(data)
## baseline.value accelerations fetal_movement uterine_contractions
## Min. :106.0 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:126.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.002000
## Median :133.0 Median :0.002000 Median :0.000000 Median :0.004000
## Mean :133.3 Mean :0.003178 Mean :0.009481 Mean :0.004366
## 3rd Qu.:140.0 3rd Qu.:0.006000 3rd Qu.:0.003000 3rd Qu.:0.007000
## Max. :160.0 Max. :0.019000 Max. :0.481000 Max. :0.015000
## light_decelerations severe_decelerations prolongued_decelerations
## Min. :0.000000 Min. :0.000e+00 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.000e+00 1st Qu.:0.0000000
## Median :0.000000 Median :0.000e+00 Median :0.0000000
## Mean :0.001889 Mean :3.293e-06 Mean :0.0001585
## 3rd Qu.:0.003000 3rd Qu.:0.000e+00 3rd Qu.:0.0000000
## Max. :0.015000 Max. :1.000e-03 Max. :0.0050000
## abnormal_short_term_variability mean_value_of_short_term_variability
## Min. :12.00 Min. :0.200
## 1st Qu.:32.00 1st Qu.:0.700
## Median :49.00 Median :1.200
## Mean :46.99 Mean :1.333
## 3rd Qu.:61.00 3rd Qu.:1.700
## Max. :87.00 Max. :7.000
## percentage_of_time_with_abnormal_long_term_variability
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 9.847
## 3rd Qu.:11.000
## Max. :91.000
## mean_value_of_long_term_variability histogram_width histogram_min
## Min. : 0.000 Min. : 3.00 Min. : 50.00
## 1st Qu.: 4.600 1st Qu.: 37.00 1st Qu.: 67.00
## Median : 7.400 Median : 67.50 Median : 93.00
## Mean : 8.188 Mean : 70.45 Mean : 93.58
## 3rd Qu.:10.800 3rd Qu.:100.00 3rd Qu.:120.00
## Max. :50.700 Max. :180.00 Max. :159.00
## histogram_max histogram_number_of_peaks histogram_number_of_zeroes
## Min. :122 Min. : 0.000 Min. : 0.0000
## 1st Qu.:152 1st Qu.: 2.000 1st Qu.: 0.0000
## Median :162 Median : 3.000 Median : 0.0000
## Mean :164 Mean : 4.068 Mean : 0.3236
## 3rd Qu.:174 3rd Qu.: 6.000 3rd Qu.: 0.0000
## Max. :238 Max. :18.000 Max. :10.0000
## histogram_mode histogram_mean histogram_median histogram_variance
## Min. : 60.0 Min. : 73.0 Min. : 77.0 Min. : 0.00
## 1st Qu.:129.0 1st Qu.:125.0 1st Qu.:129.0 1st Qu.: 2.00
## Median :139.0 Median :136.0 Median :139.0 Median : 7.00
## Mean :137.5 Mean :134.6 Mean :138.1 Mean : 18.81
## 3rd Qu.:148.0 3rd Qu.:145.0 3rd Qu.:148.0 3rd Qu.: 24.00
## Max. :187.0 Max. :182.0 Max. :186.0 Max. :269.00
## histogram_tendency fetal_health
## Min. :-1.0000 Min. :1.000
## 1st Qu.: 0.0000 1st Qu.:1.000
## Median : 0.0000 Median :1.000
## Mean : 0.3203 Mean :1.304
## 3rd Qu.: 1.0000 3rd Qu.:1.000
## Max. : 1.0000 Max. :3.000
# Mengambil hanya kolom numerik
numeric_data <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
# Visualisasi
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45,
addCoef.col = "black", number.cex = 0.2,
tl.cex = 0.4,
cl.cex = 0.7,
mar = c(0, 0, 2, 0),
title = "Correlation Matrix Before Preprocessing")
missing_values <- colSums(is.na(data))
cat("Missing Values:\n")
## Missing Values:
print(missing_values)
## baseline.value
## 0
## accelerations
## 0
## fetal_movement
## 0
## uterine_contractions
## 0
## light_decelerations
## 0
## severe_decelerations
## 0
## prolongued_decelerations
## 0
## abnormal_short_term_variability
## 0
## mean_value_of_short_term_variability
## 0
## percentage_of_time_with_abnormal_long_term_variability
## 0
## mean_value_of_long_term_variability
## 0
## histogram_width
## 0
## histogram_min
## 0
## histogram_max
## 0
## histogram_number_of_peaks
## 0
## histogram_number_of_zeroes
## 0
## histogram_mode
## 0
## histogram_mean
## 0
## histogram_median
## 0
## histogram_variance
## 0
## histogram_tendency
## 0
## fetal_health
## 0
# Bar chart untuk missing values
missing_data <- data.frame(
Variable = names(missing_values),
Missing_Count = missing_values
)
p_missing <- ggplot(missing_data, aes(x = Variable, y = Missing_Count, fill = Variable)) +
geom_bar(stat = "identity") +
labs(title = "Missing Values per Variable", x = "Variable", y = "Count of Missing Values") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_missing
detect_outliers <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
return(outliers)
}
# Hitung outliers untuk tiap kolom (tanpa target)
outliers_report <- sapply(data[, -ncol(data)], detect_outliers)
cat("Outliers Count per Feature:\n")
## Outliers Count per Feature:
print(outliers_report)
## baseline.value
## 0
## accelerations
## 14
## fetal_movement
## 307
## uterine_contractions
## 1
## light_decelerations
## 150
## severe_decelerations
## 7
## prolongued_decelerations
## 178
## abnormal_short_term_variability
## 0
## mean_value_of_short_term_variability
## 70
## percentage_of_time_with_abnormal_long_term_variability
## 309
## mean_value_of_long_term_variability
## 71
## histogram_width
## 0
## histogram_min
## 0
## histogram_max
## 24
## histogram_number_of_peaks
## 19
## histogram_number_of_zeroes
## 502
## histogram_mode
## 73
## histogram_mean
## 45
## histogram_median
## 28
## histogram_variance
## 184
## histogram_tendency
## 0
data_long <- pivot_longer(data[, -ncol(data)], cols = everything(), names_to = "Variable", values_to = "Value")
# Boxplot
p_box_before <- ggplot(data_long, aes(x = Variable, y = Value, fill = Variable)) +
geom_boxplot(show.legend = FALSE) +
theme_minimal() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
)
p_box_before
winsorize <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
# Terapkan winsorizing ke semua fitur kecuali target
data_winsorized <- data
data_winsorized[, -ncol(data)] <- lapply(data[, -ncol(data)], winsorize)
# Cek outlier setelah winsorizing
outliers_report_winsorized <- sapply(data_winsorized[, -ncol(data_winsorized)], detect_outliers)
cat("Outliers Count per Feature After Winsorizing:\n")
## Outliers Count per Feature After Winsorizing:
print(outliers_report_winsorized)
## baseline.value
## 0
## accelerations
## 0
## fetal_movement
## 0
## uterine_contractions
## 0
## light_decelerations
## 0
## severe_decelerations
## 0
## prolongued_decelerations
## 0
## abnormal_short_term_variability
## 0
## mean_value_of_short_term_variability
## 0
## percentage_of_time_with_abnormal_long_term_variability
## 0
## mean_value_of_long_term_variability
## 0
## histogram_width
## 0
## histogram_min
## 0
## histogram_max
## 0
## histogram_number_of_peaks
## 0
## histogram_number_of_zeroes
## 0
## histogram_mode
## 0
## histogram_mean
## 0
## histogram_median
## 0
## histogram_variance
## 0
## histogram_tendency
## 0
data_winsorized_long <- pivot_longer(data_winsorized[, -ncol(data_winsorized)],
cols = everything(),
names_to = "Variable", values_to = "Value")
# Boxplot setelah winsorizing
p_box_after <- ggplot(data_winsorized_long, aes(x = Variable, y = Value, fill = Variable)) +
geom_boxplot() +
labs(title = "Box Plot of Features After Winsorizing", x = "Variable", y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_box_after
# Hitung varians tiap kolom numerik
variances <- apply(data_winsorized[, -ncol(data_winsorized)], 2, var)
# Deteksi kolom dengan varians nol atau NA
zero_variance_cols <- names(variances[variances == 0 | is.na(variances)])
if (length(zero_variance_cols) > 0) {
cat("Removing constant or zero-variance columns:", zero_variance_cols, "\n")
data_winsorized <- data_winsorized[, !names(data_winsorized) %in% zero_variance_cols]
} else {
cat("No constant or zero-variance columns detected.\n")
}
## Removing constant or zero-variance columns: severe_decelerations prolongued_decelerations histogram_number_of_zeroes
# Jumlah baris duplikat
duplicates <- sum(duplicated(data_winsorized))
cat("Number of Duplicate Rows:\n")
## Number of Duplicate Rows:
print(duplicates)
## [1] 17
# Hapus baris duplikat
data_winsorized <- data_winsorized[!duplicated(data_winsorized), ]
cat("Dataset Dimensions after Removing Duplicates:\n")
## Dataset Dimensions after Removing Duplicates:
print(dim(data_winsorized))
## [1] 2109 19
# Fungsi hapus fitur yang sangat berkorelasi
remove_correlated <- function(X, threshold = 0.8) {
cor_matrix <- cor(X, use = "complete.obs")
high_cor <- which(abs(cor_matrix) > threshold & upper.tri(cor_matrix), arr.ind = TRUE)
if (nrow(high_cor) > 0) {
to_remove <- unique(high_cor[, 2])
cat("Removing highly correlated variables:", colnames(X)[to_remove], "\n")
X <- X[, -to_remove, drop = FALSE]
} else {
cat("No highly correlated variables detected.\n")
}
return(X)
}
# Terapkan pada data winsorized tanpa target
numerical_features <- data_winsorized[, -which(names(data_winsorized) == "fetal_health")]
numerical_features <- remove_correlated(numerical_features, threshold = 0.8)
## Removing highly correlated variables: histogram_min histogram_mean histogram_median
# Gabungkan kembali dengan target
data_winsorized <- cbind(numerical_features, fetal_health = data_winsorized$fetal_health)
# Tampilkan matriks korelasi setelah penghapusan
cor_matrix <- cor(numerical_features, use = "complete.obs")
cat("Correlation Matrix After Removing Highly Correlated Columns:\n")
## Correlation Matrix After Removing Highly Correlated Columns:
print(round(cor_matrix, 3))
## baseline.value
## baseline.value 1.000
## accelerations -0.081
## fetal_movement -0.024
## uterine_contractions -0.148
## light_decelerations -0.162
## abnormal_short_term_variability 0.303
## mean_value_of_short_term_variability -0.312
## percentage_of_time_with_abnormal_long_term_variability 0.332
## mean_value_of_long_term_variability -0.046
## histogram_width -0.146
## histogram_max 0.294
## histogram_number_of_peaks -0.111
## histogram_mode 0.765
## histogram_variance -0.175
## histogram_tendency 0.295
## accelerations
## baseline.value -0.081
## accelerations 1.000
## fetal_movement 0.088
## uterine_contractions 0.088
## light_decelerations -0.098
## abnormal_short_term_variability -0.280
## mean_value_of_short_term_variability 0.232
## percentage_of_time_with_abnormal_long_term_variability -0.425
## mean_value_of_long_term_variability -0.157
## histogram_width 0.299
## histogram_max 0.411
## histogram_number_of_peaks 0.194
## histogram_mode 0.247
## histogram_variance 0.204
## histogram_tendency 0.030
## fetal_movement
## baseline.value -0.024
## accelerations 0.088
## fetal_movement 1.000
## uterine_contractions -0.317
## light_decelerations -0.018
## abnormal_short_term_variability 0.125
## mean_value_of_short_term_variability 0.094
## percentage_of_time_with_abnormal_long_term_variability -0.032
## mean_value_of_long_term_variability -0.029
## histogram_width 0.182
## histogram_max 0.127
## histogram_number_of_peaks 0.200
## histogram_mode 0.014
## histogram_variance 0.098
## histogram_tendency 0.010
## uterine_contractions
## baseline.value -0.148
## accelerations 0.088
## fetal_movement -0.317
## uterine_contractions 1.000
## light_decelerations 0.291
## abnormal_short_term_variability -0.234
## mean_value_of_short_term_variability 0.309
## percentage_of_time_with_abnormal_long_term_variability -0.296
## mean_value_of_long_term_variability -0.067
## histogram_width 0.140
## histogram_max 0.112
## histogram_number_of_peaks 0.084
## histogram_mode -0.105
## histogram_variance 0.252
## histogram_tendency -0.069
## light_decelerations
## baseline.value -0.162
## accelerations -0.098
## fetal_movement -0.018
## uterine_contractions 0.291
## light_decelerations 1.000
## abnormal_short_term_variability -0.135
## mean_value_of_short_term_variability 0.586
## percentage_of_time_with_abnormal_long_term_variability -0.314
## mean_value_of_long_term_variability -0.246
## histogram_width 0.546
## histogram_max 0.220
## histogram_number_of_peaks 0.426
## histogram_mode -0.305
## histogram_variance 0.676
## histogram_tendency 0.032
## abnormal_short_term_variability
## baseline.value 0.303
## accelerations -0.280
## fetal_movement 0.125
## uterine_contractions -0.234
## light_decelerations -0.135
## abnormal_short_term_variability 1.000
## mean_value_of_short_term_variability -0.452
## percentage_of_time_with_abnormal_long_term_variability 0.456
## mean_value_of_long_term_variability -0.333
## histogram_width -0.261
## histogram_max -0.125
## histogram_number_of_peaks -0.170
## histogram_mode 0.084
## histogram_variance -0.211
## histogram_tendency -0.008
## mean_value_of_short_term_variability
## baseline.value -0.312
## accelerations 0.232
## fetal_movement 0.094
## uterine_contractions 0.309
## light_decelerations 0.586
## abnormal_short_term_variability -0.452
## mean_value_of_short_term_variability 1.000
## percentage_of_time_with_abnormal_long_term_variability -0.565
## mean_value_of_long_term_variability 0.016
## histogram_width 0.685
## histogram_max 0.401
## histogram_number_of_peaks 0.522
## histogram_mode -0.312
## histogram_variance 0.655
## histogram_tendency -0.062
## percentage_of_time_with_abnormal_long_term_variability
## baseline.value 0.332
## accelerations -0.425
## fetal_movement -0.032
## uterine_contractions -0.296
## light_decelerations -0.314
## abnormal_short_term_variability 0.456
## mean_value_of_short_term_variability -0.565
## percentage_of_time_with_abnormal_long_term_variability 1.000
## mean_value_of_long_term_variability -0.130
## histogram_width -0.462
## histogram_max -0.272
## histogram_number_of_peaks -0.289
## histogram_mode 0.218
## histogram_variance -0.400
## histogram_tendency 0.049
## mean_value_of_long_term_variability
## baseline.value -0.046
## accelerations -0.157
## fetal_movement -0.029
## uterine_contractions -0.067
## light_decelerations -0.246
## abnormal_short_term_variability -0.333
## mean_value_of_short_term_variability 0.016
## percentage_of_time_with_abnormal_long_term_variability -0.130
## mean_value_of_long_term_variability 1.000
## histogram_width 0.080
## histogram_max -0.013
## histogram_number_of_peaks 0.044
## histogram_mode 0.041
## histogram_variance -0.159
## histogram_tendency 0.148
## histogram_width
## baseline.value -0.146
## accelerations 0.299
## fetal_movement 0.182
## uterine_contractions 0.140
## light_decelerations 0.546
## abnormal_short_term_variability -0.261
## mean_value_of_short_term_variability 0.685
## percentage_of_time_with_abnormal_long_term_variability -0.462
## mean_value_of_long_term_variability 0.080
## histogram_width 1.000
## histogram_max 0.689
## histogram_number_of_peaks 0.752
## histogram_mode -0.134
## histogram_variance 0.721
## histogram_tendency 0.117
## histogram_max
## baseline.value 0.294
## accelerations 0.411
## fetal_movement 0.127
## uterine_contractions 0.112
## light_decelerations 0.220
## abnormal_short_term_variability -0.125
## mean_value_of_short_term_variability 0.401
## percentage_of_time_with_abnormal_long_term_variability -0.272
## mean_value_of_long_term_variability -0.013
## histogram_width 0.689
## histogram_max 1.000
## histogram_number_of_peaks 0.524
## histogram_mode 0.312
## histogram_variance 0.484
## histogram_tendency -0.130
## histogram_number_of_peaks
## baseline.value -0.111
## accelerations 0.194
## fetal_movement 0.200
## uterine_contractions 0.084
## light_decelerations 0.426
## abnormal_short_term_variability -0.170
## mean_value_of_short_term_variability 0.522
## percentage_of_time_with_abnormal_long_term_variability -0.289
## mean_value_of_long_term_variability 0.044
## histogram_width 0.752
## histogram_max 0.524
## histogram_number_of_peaks 1.000
## histogram_mode -0.083
## histogram_variance 0.548
## histogram_tendency 0.112
## histogram_mode
## baseline.value 0.765
## accelerations 0.247
## fetal_movement 0.014
## uterine_contractions -0.105
## light_decelerations -0.305
## abnormal_short_term_variability 0.084
## mean_value_of_short_term_variability -0.312
## percentage_of_time_with_abnormal_long_term_variability 0.218
## mean_value_of_long_term_variability 0.041
## histogram_width -0.134
## histogram_max 0.312
## histogram_number_of_peaks -0.083
## histogram_mode 1.000
## histogram_variance -0.191
## histogram_tendency 0.405
## histogram_variance
## baseline.value -0.175
## accelerations 0.204
## fetal_movement 0.098
## uterine_contractions 0.252
## light_decelerations 0.676
## abnormal_short_term_variability -0.211
## mean_value_of_short_term_variability 0.655
## percentage_of_time_with_abnormal_long_term_variability -0.400
## mean_value_of_long_term_variability -0.159
## histogram_width 0.721
## histogram_max 0.484
## histogram_number_of_peaks 0.548
## histogram_mode -0.191
## histogram_variance 1.000
## histogram_tendency -0.006
## histogram_tendency
## baseline.value 0.295
## accelerations 0.030
## fetal_movement 0.010
## uterine_contractions -0.069
## light_decelerations 0.032
## abnormal_short_term_variability -0.008
## mean_value_of_short_term_variability -0.062
## percentage_of_time_with_abnormal_long_term_variability 0.049
## mean_value_of_long_term_variability 0.148
## histogram_width 0.117
## histogram_max -0.130
## histogram_number_of_peaks 0.112
## histogram_mode 0.405
## histogram_variance -0.006
## histogram_tendency 1.000
# Visualisasi korelasi
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45,
addCoef.col = "black", number.cex = 0.2,
tl.cex = 0.4,
cl.cex = 0.7,
mar = c(0, 0, 2, 0),
title = "Correlation Matrix After Removing Highly Correlated Columns",
)
# Pastikan target sebagai faktor
data_winsorized$fetal_health <- as.factor(data_winsorized$fetal_health)
# Distribusi target sebelum SMOTE
cat("Distribution Without SMOTE:\n")
## Distribution Without SMOTE:
print(table(data_winsorized$fetal_health))
##
## 1 2 3
## 1645 291 173
# Terapkan SMOTE dengan package UBL
data_smote <- SmoteClassif(fetal_health ~ ., data_winsorized, C.perc = "balance")
# Pisah fitur dan target
X_smote <- data_smote[, -which(names(data_smote) == "fetal_health")]
y_smote <- as.factor(data_smote$fetal_health)
# Hapus fitur sangat berkorelasi di data SMOTE
X_smote <- remove_correlated(X_smote, threshold = 0.8)
## Removing highly correlated variables: histogram_number_of_peaks
# Cek kolom konstan di data SMOTE
variances_smote <- apply(X_smote, 2, var)
zero_variance_smote_cols <- names(variances_smote[variances_smote == 0 | is.na(variances_smote)])
if (length(zero_variance_smote_cols) > 0) {
cat("Removing constant or zero-variance columns in SMOTE data:", zero_variance_smote_cols, "\n")
X_smote <- X_smote[, !names(X_smote) %in% zero_variance_smote_cols]
} else {
cat("No constant or zero-variance columns detected in SMOTE data.\n")
}
## No constant or zero-variance columns detected in SMOTE data.
# Distribution of target variable after SMOTE
cat("\nDistribution With SMOTE:\n")
##
## Distribution With SMOTE:
print(table(y_smote))
## y_smote
## 1 2 3
## 703 703 702
dist_before <- as.data.frame(table(data_winsorized$fetal_health))
dist_after <- as.data.frame(table(y_smote))
dist_before$Dataset <- "Before SMOTE"
dist_after$Dataset <- "After SMOTE"
colnames(dist_before) <- c("Class", "Count", "Dataset")
colnames(dist_after) <- c("Class", "Count", "Dataset")
dist_data <- rbind(dist_before, dist_after)
p_dist <- ggplot(dist_data, aes(x = Class, y = Count, fill = Dataset)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Class Distribution Before and After SMOTE", x = "Fetal Health Class", y = "Count") +
theme_minimal()
print(p_dist)
## PCA
# --- Dimensionality Reduction (PCA and FA) ---
# Standardize the features, handling potential NA/NaN issues
X_scaled <- scale(numerical_features, center = TRUE, scale = TRUE)
X_scaled[is.na(X_scaled)] <- 0 # Replace NA/NaN with 0 if any remain
X_smote_scaled <- scale(X_smote, center = TRUE, scale = TRUE)
X_smote_scaled[is.na(X_smote_scaled)] <- 0 # Replace NA/NaN with 0 if any remain
# E1: PCA without SMOTE
cat("\nE1 - PCA without SMOTE (Kaiser Rule)\n")
##
## E1 - PCA without SMOTE (Kaiser Rule)
# Jalankan PCA
pca <- prcomp(X_scaled, scale. = FALSE)
# Hitung eigenvalues dari komponen PCA
eigenvalues <- pca$sdev^2
# Gunakan Kaiser Rule: pilih komponen dengan eigenvalue > 1
num_components <- sum(eigenvalues > 1)
cat("Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule):", num_components, "\n")
## Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule): 5
# Transformasi data menggunakan komponen tersebut
X_pca <- predict(pca)[, 1:num_components]
# Tampilkan proporsi variansi yang dijelaskan
cat("\nE1 - PCA Explained Variance Ratio (Without SMOTE):\n")
##
## E1 - PCA Explained Variance Ratio (Without SMOTE):
print(summary(pca)$importance[2, 1:num_components])
## PC1 PC2 PC3 PC4 PC5
## 0.30936 0.15458 0.10820 0.09122 0.08444
# Jalankan PCA pada data hasil SMOTE yang telah diskalakan
pca_smote <- prcomp(X_smote_scaled, scale. = FALSE)
# Hitung eigenvalue
eigenvalues_smote <- pca_smote$sdev^2
# Tentukan jumlah komponen dengan eigenvalue > 1 (Kaiser Rule)
num_components_smote <- sum(eigenvalues_smote > 1)
cat("Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule):", num_components_smote, "\n")
## Jumlah komponen dengan eigenvalue > 1 (Kaiser Rule): 5
# Transformasi data ke ruang PCA
X_smote_pca <- predict(pca_smote)[, 1:num_components_smote]
# Tampilkan variansi yang dijelaskan oleh komponen-komponen terpilih
cat("\nF1 - PCA Explained Variance Ratio (With SMOTE):\n")
##
## F1 - PCA Explained Variance Ratio (With SMOTE):
print(summary(pca_smote)$importance[2, 1:num_components_smote])
## PC1 PC2 PC3 PC4 PC5
## 0.38888 0.14965 0.10227 0.08428 0.07405
# E2: Factor Analysis without SMOTE
cat("\nE2 - Factor Analysis without SMOTE\n")
##
## E2 - Factor Analysis without SMOTE
fa_model <- fa(numerical_features, nfactors = 2, rotate = "varimax")
X_fa <- fa_model$scores
cat("\nE2 - Factor Loadings (Without SMOTE):\n")
##
## E2 - Factor Loadings (Without SMOTE):
print(fa_model$loadings)
##
## Loadings:
## MR1 MR2
## baseline.value -0.201 0.787
## accelerations 0.351 0.182
## fetal_movement 0.118
## uterine_contractions 0.276 -0.170
## light_decelerations 0.594 -0.203
## abnormal_short_term_variability -0.369 0.175
## mean_value_of_short_term_variability 0.819 -0.242
## percentage_of_time_with_abnormal_long_term_variability -0.580 0.209
## mean_value_of_long_term_variability
## histogram_width 0.921
## histogram_max 0.654 0.461
## histogram_number_of_peaks 0.699
## histogram_mode -0.144 0.914
## histogram_variance 0.794
## histogram_tendency 0.311
##
## MR1 MR2
## SS loadings 4.165 2.031
## Proportion Var 0.278 0.135
## Cumulative Var 0.278 0.413
# Visualization: Factor Loadings Plot
fa_loadings <- as.data.frame(fa_model$loadings[, 1:2])
fa_loadings$Variable <- rownames(fa_loadings)
fa_loadings_long <- pivot_longer(fa_loadings, cols = c("MR1", "MR2"), names_to = "Factor", values_to = "Loading")
p_fa <- ggplot(fa_loadings_long, aes(x = Variable, y = Loading, fill = Factor)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "E2 - Factor Loadings (Without SMOTE)", x = "Variable", y = "Loading") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_fa)
# Hitung eigenvalue dari matriks korelasi
eigen_vals <- eigen(cor(X_smote))$values
num_factors <- sum(eigen_vals > 1)
cat("Jumlah faktor dengan eigenvalue > 1 (Kaiser Rule):", num_factors, "\n")
## Jumlah faktor dengan eigenvalue > 1 (Kaiser Rule): 5
# Jalankan FA dengan jumlah faktor optimal
fa_model_smote <- fa(X_smote, nfactors = num_factors, rotate = "varimax")
X_smote_fa <- fa_model_smote$scores
cat("\nF2 - Factor Loadings (With SMOTE):\n")
##
## F2 - Factor Loadings (With SMOTE):
print(fa_model_smote$loadings)
##
## Loadings:
## MR1 MR2 MR3
## baseline.value -0.187 0.810 -0.177
## accelerations 0.545
## fetal_movement
## uterine_contractions 0.398 0.247
## light_decelerations 0.746 -0.253 -0.126
## abnormal_short_term_variability -0.234 0.166 -0.674
## mean_value_of_short_term_variability 0.779 -0.266 0.270
## percentage_of_time_with_abnormal_long_term_variability -0.531 0.335 -0.515
## mean_value_of_long_term_variability -0.122 0.317
## histogram_width 0.923 0.246
## histogram_max 0.641 0.506 0.465
## histogram_mode -0.432 0.788 0.115
## histogram_variance 0.799 -0.196
## histogram_tendency -0.152 0.249 -0.128
## MR5 MR4
## baseline.value 0.150
## accelerations
## fetal_movement -0.410
## uterine_contractions 0.696
## light_decelerations -0.164 0.276
## abnormal_short_term_variability -0.323 -0.194
## mean_value_of_short_term_variability -0.108 0.241
## percentage_of_time_with_abnormal_long_term_variability -0.246
## mean_value_of_long_term_variability 0.538 -0.165
## histogram_width
## histogram_max -0.311 -0.121
## histogram_mode 0.376
## histogram_variance -0.202 0.131
## histogram_tendency 0.652
##
## MR1 MR2 MR3 MR5 MR4
## SS loadings 3.822 1.918 1.612 1.170 0.960
## Proportion Var 0.273 0.137 0.115 0.084 0.069
## Cumulative Var 0.273 0.410 0.525 0.609 0.677
# Visualization: Factor Loadings Plot
fa_loadings_smote <- as.data.frame(fa_model_smote$loadings[, 1:2])
fa_loadings_smote$Variable <- rownames(fa_loadings_smote)
fa_loadings_smote_long <- pivot_longer(fa_loadings_smote, cols = c("MR1", "MR2"), names_to = "Factor", values_to = "Loading")
p_fa_smote <- ggplot(fa_loadings_smote_long, aes(x = Variable, y = Loading, fill = Factor)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "F2 - Factor Loadings (With SMOTE)", x = "Variable", y = "Loading") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_fa_smote)
library(ggplot2)
library(dplyr)
library(psych)
fa_scores <- as.data.frame(fa_model_smote$scores)
colnames(fa_scores) <- c("F1", "F2")
fa_scores$label <- y_smote
fa_loadings <- as.data.frame(fa_model_smote$loadings[, 1:2])
fa_loadings$Variable <- rownames(fa_loadings)
colnames(fa_loadings) <- c("F1_loading", "F2_loading", "Variable")
scale_factor <- 3
fa_loadings <- fa_loadings %>%
mutate(F1s = F1_loading * scale_factor,
F2s = F2_loading * scale_factor)
# Plot
p_fa_biplot <- ggplot(fa_scores, aes(x = F1, y = F2, color = label)) +
geom_point(alpha = 0.6, size = 2) +
geom_segment(data = fa_loadings, aes(x = 0, y = 0, xend = F1s, yend = F2s),
arrow = arrow(length = unit(0.3, "cm")), color = "black") +
geom_text(data = fa_loadings, aes(x = F1s, y = F2s, label = Variable),
color = "black", vjust = -0.5) +
# Tambah correlation circle
annotate("path",
x = scale_factor * cos(seq(0, 2 * pi, length.out = 100)),
y = scale_factor * sin(seq(0, 2 * pi, length.out = 100)),
color = "blue", linetype = "dashed", alpha = 0.5) +
labs(title = "G1 - Factor Analysis Biplot (With SMOTE)",
x = "Factor 1", y = "Factor 2") +
theme_minimal() +
theme(legend.title = element_blank())
print(p_fa_biplot)
# --- LDA and MLR ---
# Define datasets
# D1: PCA + FA without SMOTE
fa_pca_model <- fa(X_pca, nfactors = 2, rotate = "varimax")
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
X_pca_fa <- fa_pca_model$scores
# H1: PCA + FA with SMOTE
fa_smote_pca_model <- fa(X_smote_pca, nfactors = 2, rotate = "varimax")
X_smote_pca_fa <- fa_smote_pca_model$scores
# Prepare datasets
datasets <- list(
A1 = X_scaled, # Scaled original data
B1 = X_pca, # PCA without SMOTE
C1 = X_fa, # FA without SMOTE
D1 = X_pca_fa, # PCA + FA without SMOTE
E1 = X_smote_scaled, # Scaled SMOTE data
F1 = X_smote_pca, # PCA with SMOTE
G1 = X_smote_fa, # FA with SMOTE
H1 = X_smote_pca_fa # PCA + FA with SMOTE
)
targets <- list(
A1 = data_winsorized$fetal_health,
B1 = data_winsorized$fetal_health,
C1 = data_winsorized$fetal_health,
D1 = data_winsorized$fetal_health,
E1 = y_smote,
F1 = y_smote,
G1 = y_smote,
H1 = y_smote
)
labels_lda <- list(
A1 = "AA1",
B1 = "BB1",
C1 = "CC1",
D1 = "DD1",
E1 = "EE1",
F1 = "FF1",
G1 = "GG1",
H1 = "HH1"
)
labels_mlr <- list(
A1 = "AB1",
B1 = "BC1",
C1 = "CD1",
D1 = "DE1",
E1 = "EF1",
F1 = "FG1",
G1 = "GH1",
H1 = "HI1"
)
# Initialize lists to store accuracies for comparison
lda_accuracies <- list()
mlr_accuracies <- list()
# Mardia's Test for Multivariate Normality
# Fungsi untuk menghitung Mardia's Skewness dan Kurtosis dengan pengecekan error
# Fungsi untuk menghitung Mardia's Skewness dan Kurtosis tanpa tryCatch
mardia_test_manual <- function(X) {
if (!is.matrix(X)) {
stop("Input harus berupa matriks numerik.")
}
if (ncol(X) < 2) {
stop("Matriks harus memiliki minimal 2 kolom (variabel).")
}
if (nrow(X) <= ncol(X)) {
stop("Jumlah observasi harus lebih banyak dari jumlah variabel.")
}
n <- nrow(X)
p <- ncol(X)
# Hitung mean dan covariance matrix
mean_vec <- colMeans(X)
S <- cov(X)
S_inv <- solve(S)
# Hitung deviasi setiap observasi dari mean
dev <- sweep(X, 2, mean_vec)
# Hitung Mahalanobis distances untuk setiap observasi
D <- apply(dev, 1, function(x) t(x) %*% S_inv %*% x)
# Mardia's multivariate skewness
skewness <- (1 / (n^2)) * sum(sapply(1:n, function(i) {
sapply(1:n, function(j) {
t(dev[i,]) %*% S_inv %*% dev[j,] %*% t(dev[j,]) %*% S_inv %*% dev[i,]
}) %>% sum()
}))
# Alternatif perhitungan lebih efisien:
# skewness <- (1 / n^2) * sum((t(dev) %*% S_inv %*% dev)^3)
# Namun, untuk kejelasan, kita gunakan loop
# Mardia's multivariate kurtosis
kurtosis <- mean(D^2)
# Statistik uji dan p-value untuk skewness (approximate)
k <- p * (p + 1) * (p + 2) / 6
skew_stat <- n * skewness / 6
p_value_skew <- 1 - pchisq(skew_stat, df = k)
# Statistik uji dan p-value untuk kurtosis (approximate)
expected_kurtosis <- p * (p + 2)
z_kurtosis <- (kurtosis - expected_kurtosis) / sqrt(8 * p * (p + 2) / n)
p_value_kurt <- 2 * (1 - pnorm(abs(z_kurtosis)))
# Return hasil
list(
skewness = skewness,
skewness_stat = skew_stat,
p_value_skewness = p_value_skew,
kurtosis = kurtosis,
kurtosis_z = z_kurtosis,
p_value_kurtosis = p_value_kurt
)
}
# Inisialisasi list untuk menyimpan akurasi LDA
lda_accuracies <- list()
# Loop analisis untuk setiap dataset
for (key in names(datasets)) {
X_data <- datasets[[key]]
y_data <- targets[[key]]
label <- labels_lda[[key]]
X_data <- as.matrix(X_data)
# Cek jumlah observasi dan kelas
if (nrow(X_data) < length(unique(y_data)) || any(table(y_data) < 2)) {
cat(sprintf("\n%s - LDA Skipped: Insufficient observations per class\n", label))
next
}
# Mardia's Test
cat(sprintf("\n%s - Mardia's Test (Skewness, p-value, Kurtosis):\n", label))
mardia_res <- tryCatch(mardia_test_manual(X_data), error = function(e) {
cat("Mardia's test gagal: ", e$message, "\n")
return(list(skewness = NA, p_value_skewness = NA, kurtosis = NA))
})
if (any(is.na(unlist(mardia_res)))) {
cat("Mardia's test gagal dijalankan (data mungkin tidak memenuhi syarat).\n")
} else {
cat(sprintf("Skewness: %.4f\n", mardia_res$skewness))
cat(sprintf("P-value Skewness: %.4f\n", mardia_res$p_value_skewness))
cat(sprintf("Kurtosis: %.4f\n", mardia_res$kurtosis))
if (mardia_res$p_value_skewness < 0.05) {
cat("Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).\n")
} else {
cat("Kesimpulan: Data mengikuti distribusi normal multivariat (tidak signifikan).\n")
}
}
# Box's M Test untuk kesamaan kovarians antar grup
boxm_result <- tryCatch(biotools::boxM(X_data, y_data), error = function(e) {
cat(sprintf("\n%s - Box's M Test Failed: %s\n", label, e$message))
return(NULL)
})
if (!is.null(boxm_result)) {
cat(sprintf("\n%s - Box's M Test:\n", label))
print(boxm_result)
}
# Model LDA
lda_model <- tryCatch(MASS::lda(X_data, grouping = y_data, tol = 1e-6), error = function(e) {
cat(sprintf("\n%s - LDA Failed: %s\n", label, e$message))
return(NULL)
})
if (!is.null(lda_model)) {
# Wilks' Lambda (aproksimasi)
svd_vals <- lda_model$svd^2
wilks_lambda <- prod(1 / (1 + svd_vals))
cat(sprintf("\n%s - Wilks' Lambda (Approximated):\n", label))
print(wilks_lambda)
# Koefisien diskriminan
cat(sprintf("\n%s - LDA Discriminant Coefficients:\n", label))
print(lda_model$scaling)
# Uji signifikansi variabel dengan ANOVA
cat(sprintf("\n%s - Significance Test for Variables (ANOVA):\n", label))
anova_data <- as.data.frame(X_data)
anova_data$Group <- y_data
anova_results <- lapply(colnames(X_data), function(var) {
formula <- as.formula(paste(var, "~ Group"))
anova_model <- tryCatch(aov(formula, data = anova_data), error = function(e) NULL)
if (!is.null(anova_model)) {
summary_anova <- summary(anova_model)
p_value <- summary_anova[[1]]$`Pr(>F)`[1]
f_value <- summary_anova[[1]]$`F value`[1]
return(data.frame(Variable = var, F_Value = f_value, P_Value = p_value))
} else {
return(data.frame(Variable = var, F_Value = NA, P_Value = NA))
}
})
anova_results_df <- do.call(rbind, anova_results)
print(anova_results_df)
# Visualisasi koefisien diskriminan yang distandarisasi
std_coefs <- lda_model$scaling
var_sds <- apply(X_data, 2, sd)
std_coefs <- sweep(std_coefs, 1, var_sds, "*")
coefs_df <- as.data.frame(std_coefs)
coefs_df$Variable <- rownames(coefs_df)
coefs_df <- tidyr::pivot_longer(coefs_df, cols = starts_with("LD"), names_to = "Discriminant", values_to = "Coefficient")
coefs_df <- merge(coefs_df, anova_results_df[, c("Variable", "P_Value")], by = "Variable", all.x = TRUE)
coefs_df$Significance <- ifelse(coefs_df$P_Value < 0.05, "*", "")
p_lda <- ggplot(coefs_df, aes(x = Variable, y = Coefficient, fill = Discriminant)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = Significance), vjust = -0.5, position = position_dodge(width = 0.45)) +
labs(title = paste("Standardized LDA Coefficients (", label, ")"), x = "Variable", y = "Standardized Coefficient") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_lda)
# Evaluasi LDA dengan confusion matrix dan akurasi
y_pred <- predict(lda_model, X_data)$class
cm <- table(y_data, y_pred)
cat(sprintf("\n%s - Confusion Matrix:\n", label))
print(cm)
cm_data <- as.data.frame.matrix(cm)
cm_data$Actual <- rownames(cm_data)
cm_data <- tidyr::pivot_longer(cm_data, cols = colnames(cm), names_to = "Predicted", values_to = "Count")
p_cm <- ggplot(cm_data, aes(x = Predicted, y = Actual, fill = Count)) +
geom_tile() +
geom_text(aes(label = Count), color = "white") +
scale_fill_gradient(low = "#f7f7f7", high = "#4e79a7") +
labs(title = paste("Confusion Matrix for", label, "(LDA)"), x = "Predicted", y = "Actual") +
theme_minimal()
print(p_cm)
acc <- mean(y_data == y_pred)
cat(sprintf("\n%s - Accuracy:\n", label))
print(acc)
lda_accuracies[[label]] <- acc
}
}
##
## AA1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 14.9858
## P-value Skewness: 0.0000
## Kurtosis: 297.0291
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## AA1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 3493.5, df = 240, p-value < 2.2e-16
##
##
## AA1 - Wilks' Lambda (Approximated):
## [1] 2.364257e-06
##
## AA1 - LDA Discriminant Coefficients:
## LD1 LD2
## baseline.value 0.701426821 0.4162142
## accelerations -0.416189367 0.2419189
## fetal_movement -0.006382276 -0.1917747
## uterine_contractions -0.257259460 0.0582607
## light_decelerations -0.507157913 -0.2617604
## abnormal_short_term_variability 0.502898143 0.1288263
## mean_value_of_short_term_variability 0.094254326 0.2534899
## percentage_of_time_with_abnormal_long_term_variability 0.401801596 -0.5154852
## mean_value_of_long_term_variability -0.226844625 -0.2535886
## histogram_width -0.293525982 0.2534092
## histogram_max 0.273431666 -0.4216371
## histogram_number_of_peaks 0.052213461 -0.1150758
## histogram_mode -1.013979563 -0.8182263
## histogram_variance 0.771339899 0.3846071
## histogram_tendency 0.069021985 -0.1346064
##
## AA1 - Significance Test for Variables (ANOVA):
## Variable F_Value
## 1 baseline.value 137.603512
## 2 accelerations 195.971542
## 3 fetal_movement 10.066204
## 4 uterine_contractions 92.535685
## 5 light_decelerations 62.601111
## 6 abnormal_short_term_variability 334.558940
## 7 mean_value_of_short_term_variability 153.668687
## 8 percentage_of_time_with_abnormal_long_term_variability 389.185482
## 9 mean_value_of_long_term_variability 79.350509
## 10 histogram_width 53.943891
## 11 histogram_max 2.256021
## 12 histogram_number_of_peaks 12.087430
## 13 histogram_mode 214.056236
## 14 histogram_variance 100.580770
## 15 histogram_tendency 44.820861
## P_Value
## 1 6.827411e-57
## 2 8.856190e-79
## 3 4.457249e-05
## 4 3.026880e-39
## 5 3.891911e-27
## 6 6.703609e-127
## 7 5.068339e-63
## 8 1.471757e-144
## 9 5.959481e-34
## 10 1.421706e-23
## 11 1.050197e-01
## 12 6.031104e-06
## 13 2.360489e-85
## 14 1.907755e-42
## 15 8.658192e-20
##
## AA1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1529 95 21
## 2 88 195 8
## 3 18 56 99
##
## AA1 - Accuracy:
## [1] 0.8643907
##
## BB1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 32.5453
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## BB1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 706.61, df = 30, p-value < 2.2e-16
##
##
## BB1 - Wilks' Lambda (Approximated):
## [1] 6.242841e-06
##
## BB1 - LDA Discriminant Coefficients:
## LD1 LD2
## PC1 0.27046361 -0.3008567
## PC2 0.10859520 -0.4557624
## PC3 0.84311202 0.4072280
## PC4 0.08814101 -0.1067632
## PC5 0.18607476 -0.2206565
##
## BB1 - Significance Test for Variables (ANOVA):
## Variable F_Value P_Value
## 1 PC1 179.958069 7.055170e-73
## 2 PC2 95.742432 1.594449e-40
## 3 PC3 475.030478 5.321832e-171
## 4 PC4 5.250309 5.314781e-03
## 5 PC5 21.533755 5.525210e-10
##
## BB1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1552 41 52
## 2 114 173 4
## 3 40 54 79
##
## BB1 - Accuracy:
## [1] 0.8553817
##
## CC1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 7.1346
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## CC1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 226.36, df = 6, p-value < 2.2e-16
##
##
## CC1 - Wilks' Lambda (Approximated):
## [1] 0.0001647984
##
## CC1 - LDA Discriminant Coefficients:
## LD1 LD2
## MR1 0.7786038 0.7611211
## MR2 -0.8629648 0.6765822
##
## CC1 - Significance Test for Variables (ANOVA):
## Variable F_Value P_Value
## 1 MR1 117.0307 6.388077e-49
## 2 MR2 144.2239 1.987955e-59
##
## CC1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1626 19 0
## 2 252 39 0
## 3 157 16 0
##
## CC1 - Accuracy:
## [1] 0.7894737
##
## DD1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 7.7626
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## DD1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 93.262, df = 6, p-value < 2.2e-16
##
##
## DD1 - LDA Failed: variables 1 2 appear to be constant within groups
##
## EE1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 13.9867
## P-value Skewness: 0.0000
## Kurtosis: 258.0314
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## EE1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 8584.6, df = 210, p-value < 2.2e-16
##
##
## EE1 - Wilks' Lambda (Approximated):
## [1] 5.766711e-07
##
## EE1 - LDA Discriminant Coefficients:
## LD1 LD2
## baseline.value 0.43111518 0.001588134
## accelerations -0.66080101 0.527655774
## fetal_movement -0.05063304 -0.145927114
## uterine_contractions -0.25627467 -0.020740375
## light_decelerations -0.46759642 0.021077773
## abnormal_short_term_variability 0.77571867 0.061172600
## mean_value_of_short_term_variability 0.45442796 0.405646249
## percentage_of_time_with_abnormal_long_term_variability 0.32422327 -0.404875593
## mean_value_of_long_term_variability -0.31085939 -0.163255805
## histogram_width -0.26402935 -0.035580897
## histogram_max 0.20068766 -0.480803248
## histogram_mode -0.88362263 -0.481210420
## histogram_variance 0.77604189 0.090063147
## histogram_tendency 0.01862363 -0.104061863
##
## EE1 - Significance Test for Variables (ANOVA):
## Variable F_Value
## 1 baseline.value 304.228800
## 2 accelerations 551.961113
## 3 fetal_movement 22.183989
## 4 uterine_contractions 124.711906
## 5 light_decelerations 194.941513
## 6 abnormal_short_term_variability 561.352273
## 7 mean_value_of_short_term_variability 274.366100
## 8 percentage_of_time_with_abnormal_long_term_variability 425.063606
## 9 mean_value_of_long_term_variability 321.388461
## 10 histogram_width 93.642562
## 11 histogram_max 2.452833
## 12 histogram_mode 639.101591
## 13 histogram_variance 271.488996
## 14 histogram_tendency 158.487670
## P_Value
## 1 8.706348e-117
## 2 1.904524e-193
## 3 2.922438e-10
## 4 6.520038e-52
## 5 2.125750e-78
## 6 4.093910e-196
## 7 1.297908e-106
## 8 8.737857e-156
## 9 1.566161e-122
## 10 1.096720e-39
## 11 8.629535e-02
## 12 1.273593e-217
## 13 1.274852e-105
## 14 7.659638e-65
##
## EE1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 586 93 24
## 2 43 636 24
## 3 0 167 535
##
## EE1 - Accuracy:
## [1] 0.8334915
##
## FF1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 34.4231
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## FF1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 2117.2, df = 30, p-value < 2.2e-16
##
##
## FF1 - Wilks' Lambda (Approximated):
## [1] 1.052511e-06
##
## FF1 - LDA Discriminant Coefficients:
## LD1 LD2
## PC1 -0.0115421 0.44923823
## PC2 0.8763763 0.28876662
## PC3 0.6843459 -0.39018593
## PC4 0.1266701 -0.01265415
## PC5 -0.3615343 0.31421408
##
## FF1 - Significance Test for Variables (ANOVA):
## Variable F_Value P_Value
## 1 PC1 364.90299 8.750254e-137
## 2 PC2 795.97192 3.700256e-258
## 3 PC3 285.59847 1.819984e-110
## 4 PC4 4.89484 7.570521e-03
## 5 PC5 63.36136 1.901023e-27
##
## FF1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 540 88 75
## 2 56 605 42
## 3 18 159 525
##
## FF1 - Accuracy:
## [1] 0.7922201
##
## GG1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 4.9953
## P-value Skewness: 0.0000
## Kurtosis: 33.2906
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## GG1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 1604.4, df = 30, p-value < 2.2e-16
##
##
## GG1 - Wilks' Lambda (Approximated):
## [1] 1.356584e-06
##
## GG1 - LDA Discriminant Coefficients:
## LD1 LD2
## MR1 0.4499719 0.4740041
## MR2 -0.1359885 -0.9574111
## MR3 -1.2109584 0.4616812
## MR5 -0.8507201 -0.2687832
## MR4 -0.3372380 0.5033044
##
## GG1 - Significance Test for Variables (ANOVA):
## Variable F_Value P_Value
## 1 MR1 120.10987 4.024017e-50
## 2 MR2 283.01971 1.386185e-109
## 3 MR3 511.99030 6.476115e-182
## 4 MR5 225.16611 2.423701e-89
## 5 MR4 92.78798 2.404584e-39
##
## GG1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 555 96 52
## 2 92 565 46
## 3 37 166 499
##
## GG1 - Accuracy:
## [1] 0.7680266
##
## HH1 - Mardia's Test (Skewness, p-value, Kurtosis):
## Skewness: 1.9981
## P-value Skewness: 0.0000
## Kurtosis: 8.7146
## Kesimpulan: Data tidak mengikuti distribusi normal multivariat (signifikan).
##
## HH1 - Box's M Test:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X_data
## Chi-Sq (approx.) = 687.39, df = 6, p-value < 2.2e-16
##
##
## HH1 - LDA Failed: variables 1 2 appear to be constant within groups
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# MLR Model
for (key in names(datasets)) {
X_data <- as.data.frame(datasets[[key]])
y_data <- targets[[key]]
label <- labels_mlr[[key]]
cat(sprintf("\n========== %s ==========\n", label))
# Hitung VIF dengan membuat model linier dummy
temp_data <- X_data
temp_data$dummy_y <- rnorm(nrow(temp_data)) # dummy target
vif_model <- lm(dummy_y ~ ., data = temp_data)
vif_values <- vif(vif_model)
cat(">>> VIF (Variance Inflation Factor):\n")
print(round(vif_values, 3))
# Cek cukupnya data
if (nrow(X_data) < length(unique(y_data)) || any(table(y_data) < 2)) {
cat("Skipped: Tidak cukup data untuk semua kelas.\n")
next
}
data_mlr <- cbind(X_data, y = y_data)
# MLR Model
mlr_model <- tryCatch(nnet::multinom(y ~ ., data = data_mlr, trace = FALSE, maxit = 1000), error = function(e) {
cat(sprintf("\n%s - MLR Failed: %s\n", label, e$message))
return(NULL)
})
if (!is.null(mlr_model)) {
# Likelihood Ratio Test
null_model <- tryCatch(nnet::multinom(y ~ 1, data = data_mlr, trace = FALSE, maxit = 1000), error = function(e) NULL)
if (!is.null(null_model)) {
loglik_full <- logLik(mlr_model)
loglik_null <- logLik(null_model)
lrt_stat <- 2 * (loglik_full - loglik_null)
df <- length(coef(mlr_model)) - length(coef(null_model))
lrt_pvalue <- pchisq(lrt_stat, df, lower.tail = FALSE)
cat(sprintf("\n%s - Likelihood Ratio Test (Manual Calculation):\n", label))
cat(sprintf("LRT Statistic: %.3f, df: %d, p-value: %.3e\n", lrt_stat, df, lrt_pvalue))
} else {
cat(sprintf("\n%s - Likelihood Ratio Test Skipped: Null model failed\n", label))
}
# Wald Test
cat(sprintf("\n%s - Wald Test (Coefficients and p-values):\n", label))
print(summary(mlr_model))
# Confusion Matrix
y_pred <- predict(mlr_model, X_data, type = "class")
cm <- table(y_data, y_pred)
cat(sprintf("\n%s - Confusion Matrix:\n", label))
print(cm)
cm_data <- as.data.frame.matrix(cm)
cm_data$Actual <- rownames(cm_data)
cm_data <- tidyr::pivot_longer(cm_data, cols = c("1", "2", "3"), names_to = "Predicted", values_to = "Count")
p_cm_mlr <- ggplot(cm_data, aes(x = Predicted, y = Actual, fill = Count)) +
geom_tile() +
geom_text(aes(label = Count), color = "white") +
scale_fill_gradient(low = "#f7f7f7", high = "#f28e2b") +
labs(title = paste("Confusion Matrix for", label, "(MLR)"), x = "Predicted", y = "Actual") +
theme_minimal()
print(p_cm_mlr)
# Odds Ratios
odds_ratios <- exp(coef(mlr_model))
cat(sprintf("\n%s - Odds Ratios:\n", label))
print(odds_ratios)
acc <- mean(y_data == y_pred)
cat(sprintf("\n%s - Accuracy:\n", label))
print(acc)
mlr_accuracies[[label]] <- acc
}
}
##
## ========== AB1 ==========
## >>> VIF (Variance Inflation Factor):
## baseline.value
## 4.063
## accelerations
## 2.567
## fetal_movement
## 1.269
## uterine_contractions
## 1.395
## light_decelerations
## 3.045
## abnormal_short_term_variability
## 1.859
## mean_value_of_short_term_variability
## 3.117
## percentage_of_time_with_abnormal_long_term_variability
## 2.074
## mean_value_of_long_term_variability
## 1.851
## histogram_width
## 9.006
## histogram_max
## 6.991
## histogram_number_of_peaks
## 2.386
## histogram_mode
## 5.100
## histogram_variance
## 3.002
## histogram_tendency
## 2.714
##
## AB1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 1755.499, df: 30, p-value: 0.000e+00
##
## AB1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 -4.535360 -0.1546641 -3.327839 0.38686473 -0.5213949
## 3 -9.027453 1.3305182 -5.608926 0.04413364 -0.9672625
## light_decelerations abnormal_short_term_variability
## 2 -0.9960602 0.8520202
## 3 -1.1044701 2.4437219
## mean_value_of_short_term_variability
## 2 -0.91154625
## 3 -0.05280065
## percentage_of_time_with_abnormal_long_term_variability
## 2 0.4828328
## 3 0.2118075
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 -0.01137033 -0.7768076 1.286825
## 3 -0.88491386 -0.7132121 1.036181
## histogram_number_of_peaks histogram_mode histogram_variance
## 2 0.2994720 0.3915495 1.454514
## 3 -0.1510739 -2.8150510 2.266049
## histogram_tendency
## 2 0.2791977
## 3 0.3425407
##
## Std. Errors:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 0.2996099 0.2577133 0.3854525 0.1104507 0.1212088
## 3 0.8989512 0.3122373 0.9893373 0.1649483 0.1864061
## light_decelerations abnormal_short_term_variability
## 2 0.2717582 0.1709663
## 3 0.2648024 0.2864991
## mean_value_of_short_term_variability
## 2 0.2650570
## 3 0.2981736
## percentage_of_time_with_abnormal_long_term_variability
## 2 0.1163989
## 3 0.2258749
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 0.1944055 0.2667072 0.2709873
## 3 0.3275101 0.4426077 0.3714317
## histogram_number_of_peaks histogram_mode histogram_variance
## 2 0.1499308 0.3339761 0.2505191
## 3 0.2538637 0.3538485 0.2681127
## histogram_tendency
## 2 0.1507782
## 3 0.2053751
##
## Residual Deviance: 1079.949
## AIC: 1143.949
##
## AB1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1575 53 17
## 2 83 193 15
## 3 20 35 118
##
## AB1 - Odds Ratios:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 0.0107230482 0.8567029 0.035870535 1.472357 0.5936919
## 3 0.0001200679 3.7830032 0.003665004 1.045122 0.3801222
## light_decelerations abnormal_short_term_variability
## 2 0.3693317 2.344378
## 3 0.3313864 11.515822
## mean_value_of_short_term_variability
## 2 0.4019023
## 3 0.9485691
## percentage_of_time_with_abnormal_long_term_variability
## 2 1.620659
## 3 1.235910
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 0.9886941 0.4598717 3.621272
## 3 0.4127497 0.4900675 2.818432
## histogram_number_of_peaks histogram_mode histogram_variance
## 2 1.3491462 1.47927114 4.282403
## 3 0.8597841 0.05990166 9.641231
## histogram_tendency
## 2 1.322069
## 3 1.408522
##
## AB1 - Accuracy:
## [1] 0.8942627
##
## ========== BC1 ==========
## >>> VIF (Variance Inflation Factor):
## PC1 PC2 PC3 PC4 PC5
## 1 1 1 1 1
##
## BC1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 1305.404, df: 10, p-value: 2.609e-274
##
## BC1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 -3.008824 0.6864185 0.675013 1.027433 0.24137535 0.5220185
## 3 -4.687199 0.2899734 -0.308525 2.384946 0.05931237 0.3202953
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.1376679 0.04900339 0.06917199 0.09464289 0.07002781 0.08670944
## 3 0.2830351 0.04617997 0.08537680 0.16078922 0.10040872 0.12508875
##
## Residual Deviance: 1530.045
## AIC: 1554.045
##
## BC1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1570 34 41
## 2 120 161 10
## 3 50 41 82
##
## BC1 - Odds Ratios:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.049349661 1.986588 1.9640586 2.793885 1.272999 1.685426
## 3 0.009212457 1.336392 0.7345296 10.858480 1.061107 1.377534
##
## BC1 - Accuracy:
## [1] 0.8596491
##
## ========== CD1 ==========
## >>> VIF (Variance Inflation Factor):
## MR1 MR2
## 1.001 1.001
##
## CD1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 578.244, df: 4, p-value: 7.915e-124
##
## CD1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) MR1 MR2
## 2 -2.730659 -1.4724532 1.2206809
## 3 -2.678121 0.1397596 -0.9330897
##
## Std. Errors:
## (Intercept) MR1 MR2
## 2 0.1219215 0.10288603 0.09833167
## 3 0.1087513 0.08745025 0.09424308
##
## Residual Deviance: 2257.204
## AIC: 2269.204
##
## CD1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1607 38 0
## 2 206 85 0
## 3 150 23 0
##
## CD1 - Odds Ratios:
## (Intercept) MR1 MR2
## 2 0.06517632 0.2293621 3.3894947
## 3 0.06869213 1.1499973 0.3933366
##
## CD1 - Accuracy:
## [1] 0.802276
##
## ========== DE1 ==========
## >>> VIF (Variance Inflation Factor):
## MR1 MR2
## 1.134 1.134
##
## DE1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 0.000, df: 4, p-value: 1.000e+00
##
## DE1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) MR1 MR2
## 2 -1.732172 4.225877e-06 2.641869e-05
## 3 -2.252204 1.486263e-05 1.722302e-05
##
## Std. Errors:
## (Intercept) MR1 MR2
## 2 0.06359502 0.000000e+00 1.099654e-21
## 3 0.07992653 2.597782e-21 1.770602e-21
##
## Residual Deviance: 2835.448
## AIC: 2847.448
##
## DE1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 1645 0 0
## 2 291 0 0
## 3 173 0 0
##
## DE1 - Odds Ratios:
## (Intercept) MR1 MR2
## 2 0.1768998 1.000004 1.000026
## 3 0.1051672 1.000015 1.000017
##
## DE1 - Accuracy:
## [1] 0.7799905
##
## ========== EF1 ==========
## >>> VIF (Variance Inflation Factor):
## baseline.value
## 4.233
## accelerations
## 2.054
## fetal_movement
## 1.183
## uterine_contractions
## 1.935
## light_decelerations
## 3.006
## abnormal_short_term_variability
## 2.184
## mean_value_of_short_term_variability
## 4.988
## percentage_of_time_with_abnormal_long_term_variability
## 3.189
## mean_value_of_long_term_variability
## 2.071
## histogram_width
## 9.528
## histogram_max
## 6.759
## histogram_mode
## 7.002
## histogram_variance
## 3.530
## histogram_tendency
## 2.925
##
## EF1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 3088.537, df: 28, p-value: 0.000e+00
##
## EF1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 -0.2976572 0.1076334 -2.719055 0.22304836 -0.5192612
## 3 -1.8311926 1.5362856 -5.239806 -0.09119814 -1.2979018
## light_decelerations abnormal_short_term_variability
## 2 -1.113091 0.8353494
## 3 -2.138346 2.6489640
## mean_value_of_short_term_variability
## 2 -1.3311877
## 3 0.9814443
## percentage_of_time_with_abnormal_long_term_variability
## 2 0.5779501
## 3 0.1791453
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 0.02941503 -0.7429038 1.522455
## 3 -1.34000521 -1.0186417 1.074816
## histogram_mode histogram_variance histogram_tendency
## 2 -0.1309325 2.296594 0.2873952
## 3 -3.2242140 3.396115 0.1205096
##
## Std. Errors:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 0.1755196 0.2353804 0.2574797 0.1064790 0.1364606
## 3 0.3274820 0.2944149 0.7099174 0.1322297 0.1833212
## light_decelerations abnormal_short_term_variability
## 2 0.2523423 0.1741852
## 3 0.2950798 0.2252022
## mean_value_of_short_term_variability
## 2 0.3285242
## 3 0.3285351
## percentage_of_time_with_abnormal_long_term_variability
## 2 0.1380616
## 3 0.2154641
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 0.1773035 0.2999940 0.2946158
## 3 0.2304425 0.3863953 0.3769490
## histogram_mode histogram_variance histogram_tendency
## 2 0.3563585 0.2720778 0.1803078
## 3 0.4273933 0.3078605 0.2156736
##
## Residual Deviance: 1543.211
## AIC: 1603.211
##
## EF1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 611 74 18
## 2 39 601 63
## 3 9 93 600
##
## EF1 - Odds Ratios:
## (Intercept) baseline.value accelerations fetal_movement uterine_contractions
## 2 0.7425559 1.113639 0.065937064 1.2498810 0.5949599
## 3 0.1602224 4.647296 0.005301285 0.9128368 0.2731042
## light_decelerations abnormal_short_term_variability
## 2 0.3285419 2.30562
## 3 0.1178496 14.13938
## mean_value_of_short_term_variability
## 2 0.2641633
## 3 2.6683072
## percentage_of_time_with_abnormal_long_term_variability
## 2 1.782381
## 3 1.196195
## mean_value_of_long_term_variability histogram_width histogram_max
## 2 1.0298519 0.4757305 4.583463
## 3 0.2618443 0.3610851 2.929453
## histogram_mode histogram_variance histogram_tendency
## 2 0.87727702 9.940271 1.332951
## 3 0.03978704 29.847928 1.128072
##
## EF1 - Accuracy:
## [1] 0.8595825
##
## ========== FG1 ==========
## >>> VIF (Variance Inflation Factor):
## PC1 PC2 PC3 PC4 PC5
## 1 1 1 1 1
##
## FG1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 2513.253, df: 10, p-value: 0.000e+00
##
## FG1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 -0.0396679 -0.7903702 0.5405675 1.281506 -0.03577445 -0.9398919
## 3 -0.8478772 -0.1596825 2.5236663 1.881434 0.25415190 -1.4049281
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.09978608 0.05074461 0.07356033 0.08803727 0.07019422 0.09143347
## 3 0.14135923 0.04418639 0.12317339 0.11631403 0.08519247 0.13834163
##
## Residual Deviance: 2118.495
## AIC: 2142.495
##
## FG1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 557 87 59
## 2 67 548 88
## 3 38 101 563
##
## FG1 - Odds Ratios:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 2 0.9611086 0.4536768 1.716981 3.602060 0.9648579 0.3906700
## 3 0.4283232 0.8524144 12.474247 6.562912 1.2893676 0.2453847
##
## FG1 - Accuracy:
## [1] 0.7912713
##
## ========== GH1 ==========
## >>> VIF (Variance Inflation Factor):
## MR1 MR2 MR3 MR5 MR4
## 1.008 1.018 1.019 1.007 1.008
##
## GH1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 2302.203, df: 10, p-value: 0.000e+00
##
## GH1 - Wald Test (Coefficients and p-values):
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) MR1 MR2 MR3 MR5 MR4
## 2 -0.08495295 -0.7069997 1.3848634 -1.539444 -0.08338301 -1.2436374
## 3 -0.18678609 0.9869159 -0.2080233 -2.981898 -1.87820724 -0.9567872
##
## Std. Errors:
## (Intercept) MR1 MR2 MR3 MR5 MR4
## 2 0.1080107 0.1082165 0.09592049 0.1008310 0.1103074 0.0952941
## 3 0.1073241 0.1051512 0.10271432 0.1475438 0.1261135 0.1028907
##
## Residual Deviance: 2329.546
## AIC: 2353.546
##
## GH1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 558 91 54
## 2 71 542 90
## 3 53 119 530
##
## GH1 - Odds Ratios:
## (Intercept) MR1 MR2 MR3 MR5 MR4
## 2 0.9185555 0.4931215 3.9942803 0.21450029 0.9199987 0.2883335
## 3 0.8296212 2.6829473 0.8121881 0.05069652 0.1528639 0.3841250
##
## GH1 - Accuracy:
## [1] 0.7732448
##
## ========== HI1 ==========
## >>> VIF (Variance Inflation Factor):
## MR1 MR2
## 1.016 1.016
##
## HI1 - Likelihood Ratio Test (Manual Calculation):
## LRT Statistic: 0.000, df: 4, p-value: 1.000e+00
##
## HI1 - Wald Test (Coefficients and p-values):
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## nnet::multinom(formula = y ~ ., data = data_mlr, trace = FALSE,
## maxit = 1000)
##
## Coefficients:
## (Intercept) MR1 MR2
## 2 -1.684779e-08 -5.866299e-06 -5.866196e-06
## 3 -1.423518e-03 -1.414946e-05 2.756714e-06
##
## Std. Errors:
## (Intercept) MR1 MR2
## 2 0.05333807 0.000000e+00 NaN
## 3 0.05335707 2.068111e-22 NaN
##
## Residual Deviance: 4631.748
## AIC: 4643.748
##
## HI1 - Confusion Matrix:
## y_pred
## y_data 1 2 3
## 1 355 348 0
## 2 352 351 0
## 3 347 355 0
##
## HI1 - Odds Ratios:
## (Intercept) MR1 MR2
## 2 1.0000000 0.9999941 0.9999941
## 3 0.9985775 0.9999859 1.0000028
##
## HI1 - Accuracy:
## [1] 0.3349146
# Visualization: Model Accuracy Comparison
accuracy_data <- data.frame(
Model = c(names(lda_accuracies), names(mlr_accuracies)),
Accuracy = c(unlist(lda_accuracies), unlist(mlr_accuracies)),
Method = c(rep("LDA", length(lda_accuracies)), rep("MLR", length(mlr_accuracies)))
)
# Create bar chart for accuracy comparison
p_accuracy <- ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Method)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("LDA" = "#4e79a7", "MLR" = "#f28e2b")) +
labs(title = "Model Accuracy Comparison (LDA vs MLR)", x = "Model", y = "Accuracy") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_accuracy)
print(accuracy_data)
## Model Accuracy Method
## AA1 AA1 0.8643907 LDA
## BB1 BB1 0.8553817 LDA
## CC1 CC1 0.7894737 LDA
## EE1 EE1 0.8334915 LDA
## FF1 FF1 0.7922201 LDA
## GG1 GG1 0.7680266 LDA
## AB1 AB1 0.8942627 MLR
## BC1 BC1 0.8596491 MLR
## CD1 CD1 0.8022760 MLR
## DE1 DE1 0.7799905 MLR
## EF1 EF1 0.8595825 MLR
## FG1 FG1 0.7912713 MLR
## GH1 GH1 0.7732448 MLR
## HI1 HI1 0.3349146 MLR
#