#===================================Library===============================================#
packages <- c("nnet", "car", "ggplot2", "dplyr", "factoextra", "psych")
lapply(packages, function(x) if(!require(x, character.only=TRUE)) install.packages(x))
## Loading required package: nnet
## Warning: package 'nnet' was built under R version 4.4.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
#============================== 1. prepro ================================#
#---Read Csv
data <- read.csv('fetal_health.csv', header = TRUE)

#---mising
missing_data <- sapply(data, function(x) sum(is.na(x)))
print("Missing Values Count:")
## [1] "Missing Values Count:"
print(missing_data)
##                                         baseline.value 
##                                                      0 
##                                          accelerations 
##                                                      0 
##                                         fetal_movement 
##                                                      0 
##                                   uterine_contractions 
##                                                      0 
##                                    light_decelerations 
##                                                      0 
##                                   severe_decelerations 
##                                                      0 
##                               prolongued_decelerations 
##                                                      0 
##                        abnormal_short_term_variability 
##                                                      0 
##                   mean_value_of_short_term_variability 
##                                                      0 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                      0 
##                    mean_value_of_long_term_variability 
##                                                      0 
##                                        histogram_width 
##                                                      0 
##                                          histogram_min 
##                                                      0 
##                                          histogram_max 
##                                                      0 
##                              histogram_number_of_peaks 
##                                                      0 
##                             histogram_number_of_zeroes 
##                                                      0 
##                                         histogram_mode 
##                                                      0 
##                                         histogram_mean 
##                                                      0 
##                                       histogram_median 
##                                                      0 
##                                     histogram_variance 
##                                                      0 
##                                     histogram_tendency 
##                                                      0 
##                                           fetal_health 
##                                                      0
#---cek_duplikat
duplicate_rows <- sum(duplicated(data))
cat("Jumlah duplikat dalam dataset: ", duplicate_rows, "\n")
## Jumlah duplikat dalam dataset:  13
#---Hapus duplikat
data <- data[!duplicated(data), ]
cat("Dataset setelah menghapus duplikat: ", nrow(data), "baris\n")
## Dataset setelah menghapus duplikat:  2113 baris
#---target to faktor
data$fetal_health <- as.factor(data$fetal_health)
colnames(data)
##  [1] "baseline.value"                                        
##  [2] "accelerations"                                         
##  [3] "fetal_movement"                                        
##  [4] "uterine_contractions"                                  
##  [5] "light_decelerations"                                   
##  [6] "severe_decelerations"                                  
##  [7] "prolongued_decelerations"                              
##  [8] "abnormal_short_term_variability"                       
##  [9] "mean_value_of_short_term_variability"                  
## [10] "percentage_of_time_with_abnormal_long_term_variability"
## [11] "mean_value_of_long_term_variability"                   
## [12] "histogram_width"                                       
## [13] "histogram_min"                                         
## [14] "histogram_max"                                         
## [15] "histogram_number_of_peaks"                             
## [16] "histogram_number_of_zeroes"                            
## [17] "histogram_mode"                                        
## [18] "histogram_mean"                                        
## [19] "histogram_median"                                      
## [20] "histogram_variance"                                    
## [21] "histogram_tendency"                                    
## [22] "fetal_health"
#---hapus multikonielaritas data krn jika tdk dihapus,bisa terjadi redudansi yang menyebabkan regresi tidak bisa mengestimasi koefisien untuk semua variabel
data$histogram_max <- NULL

#============================= 2. uji asumsi (VIF) ========================#
library(car)
library(dplyr)

#---Buat model VIF hanya dari prediktor, tanpa pake target 
predictors_only <- dplyr::select(data, -fetal_health)

model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)

print("VIF antar prediktor:")
## [1] "VIF antar prediktor:"
print(vif_values)
##                                         baseline.value 
##                                               6.681407 
##                                          accelerations 
##                                               2.885972 
##                                         fetal_movement 
##                                               1.138681 
##                                   uterine_contractions 
##                                               1.281532 
##                                    light_decelerations 
##                                               3.449187 
##                                   severe_decelerations 
##                                               1.134936 
##                               prolongued_decelerations 
##                                               2.736979 
##                        abnormal_short_term_variability 
##                                               1.916973 
##                   mean_value_of_short_term_variability 
##                                               2.972435 
## percentage_of_time_with_abnormal_long_term_variability 
##                                               1.875265 
##                    mean_value_of_long_term_variability 
##                                               2.078607 
##                                        histogram_width 
##                                              17.795686 
##                                          histogram_min 
##                                              19.707950 
##                              histogram_number_of_peaks 
##                                               2.366522 
##                             histogram_number_of_zeroes 
##                                               1.184267 
##                                         histogram_mode 
##                                               8.982942 
##                                         histogram_mean 
##                                              20.070464 
##                                       histogram_median 
##                                              26.780804 
##                                     histogram_variance 
##                                               2.590868 
##                                     histogram_tendency 
##                                               2.880995
#---Jika ada VIF > 5 atau 10, hapus
predictors_only <- predictors_only %>% dplyr::select(- histogram_mean) #isi dgn var yng nilainya >10


#---cek vif lagi (stlh dihapus)
model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)
print("VIF antar prediktor (setelah penghapusan):")
## [1] "VIF antar prediktor (setelah penghapusan):"
print(vif_values)
##                                         baseline.value 
##                                               6.227538 
##                                          accelerations 
##                                               2.642784 
##                                         fetal_movement 
##                                               1.138046 
##                                   uterine_contractions 
##                                               1.274577 
##                                    light_decelerations 
##                                               3.211755 
##                                   severe_decelerations 
##                                               1.133418 
##                               prolongued_decelerations 
##                                               2.635461 
##                        abnormal_short_term_variability 
##                                               1.766872 
##                   mean_value_of_short_term_variability 
##                                               2.664290 
## percentage_of_time_with_abnormal_long_term_variability 
##                                               1.859580 
##                    mean_value_of_long_term_variability 
##                                               1.988585 
##                                        histogram_width 
##                                              17.763757 
##                                          histogram_min 
##                                              19.542255 
##                              histogram_number_of_peaks 
##                                               2.366111 
##                             histogram_number_of_zeroes 
##                                               1.180517 
##                                         histogram_mode 
##                                               8.820754 
##                                       histogram_median 
##                                              20.308901 
##                                     histogram_variance 
##                                               2.590333 
##                                     histogram_tendency 
##                                               2.880981
#================================== 3. EDA ========================================#
cat("Statistika Deskriptif:\n")
## Statistika Deskriptif:
summary(data)
##  baseline.value  accelerations      fetal_movement     uterine_contractions
##  Min.   :106.0   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000    
##  1st Qu.:126.0   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.002000    
##  Median :133.0   Median :0.002000   Median :0.000000   Median :0.005000    
##  Mean   :133.3   Mean   :0.003188   Mean   :0.009517   Mean   :0.004387    
##  3rd Qu.:140.0   3rd Qu.:0.006000   3rd Qu.:0.003000   3rd Qu.:0.007000    
##  Max.   :160.0   Max.   :0.019000   Max.   :0.481000   Max.   :0.015000    
##  light_decelerations severe_decelerations prolongued_decelerations
##  Min.   :0.000000    Min.   :0.000e+00    Min.   :0.0000000       
##  1st Qu.:0.000000    1st Qu.:0.000e+00    1st Qu.:0.0000000       
##  Median :0.000000    Median :0.000e+00    Median :0.0000000       
##  Mean   :0.001901    Mean   :3.313e-06    Mean   :0.0001595       
##  3rd Qu.:0.003000    3rd Qu.:0.000e+00    3rd Qu.:0.0000000       
##  Max.   :0.015000    Max.   :1.000e-03    Max.   :0.0050000       
##  abnormal_short_term_variability mean_value_of_short_term_variability
##  Min.   :12.00                   Min.   :0.200                       
##  1st Qu.:32.00                   1st Qu.:0.700                       
##  Median :49.00                   Median :1.200                       
##  Mean   :46.99                   Mean   :1.335                       
##  3rd Qu.:61.00                   3rd Qu.:1.700                       
##  Max.   :87.00                   Max.   :7.000                       
##  percentage_of_time_with_abnormal_long_term_variability
##  Min.   : 0.000                                        
##  1st Qu.: 0.000                                        
##  Median : 0.000                                        
##  Mean   : 9.795                                        
##  3rd Qu.:11.000                                        
##  Max.   :91.000                                        
##  mean_value_of_long_term_variability histogram_width  histogram_min   
##  Min.   : 0.000                      Min.   :  3.00   Min.   : 50.00  
##  1st Qu.: 4.600                      1st Qu.: 37.00   1st Qu.: 67.00  
##  Median : 7.400                      Median : 68.00   Median : 93.00  
##  Mean   : 8.167                      Mean   : 70.54   Mean   : 93.56  
##  3rd Qu.:10.800                      3rd Qu.:100.00   3rd Qu.:120.00  
##  Max.   :50.700                      Max.   :180.00   Max.   :159.00  
##  histogram_number_of_peaks histogram_number_of_zeroes histogram_mode 
##  Min.   : 0.000            Min.   : 0.0000            Min.   : 60.0  
##  1st Qu.: 2.000            1st Qu.: 0.0000            1st Qu.:129.0  
##  Median : 4.000            Median : 0.0000            Median :139.0  
##  Mean   : 4.077            Mean   : 0.3256            Mean   :137.5  
##  3rd Qu.: 6.000            3rd Qu.: 0.0000            3rd Qu.:148.0  
##  Max.   :18.000            Max.   :10.0000            Max.   :187.0  
##  histogram_mean  histogram_median histogram_variance histogram_tendency
##  Min.   : 73.0   Min.   : 77.0    Min.   :  0.00     Min.   :-1.0000   
##  1st Qu.:125.0   1st Qu.:129.0    1st Qu.:  2.00     1st Qu.: 0.0000   
##  Median :136.0   Median :139.0    Median :  7.00     Median : 0.0000   
##  Mean   :134.6   Mean   :138.1    Mean   : 18.91     Mean   : 0.3185   
##  3rd Qu.:145.0   3rd Qu.:148.0    3rd Qu.: 24.00     3rd Qu.: 1.0000   
##  Max.   :182.0   Max.   :186.0    Max.   :269.00     Max.   : 1.0000   
##  fetal_health
##  1:1646      
##  2: 292      
##  3: 175      
##              
##              
## 
#--Distribusi variabel target (fetal_health)
cat("\nDistribusi Target (fetal_health):\n")
## 
## Distribusi Target (fetal_health):
print(table(data$fetal_health))
## 
##    1    2    3 
## 1646  292  175
library(ggplot2)
ggplot(data, aes(x = factor(fetal_health))) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribusi Kesehatan Janin (fetal_health)", x = "Kategori Kesehatan", y = "Jumlah") +
  theme_minimal()

#--korelasi
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_cols <- data[, sapply(data, is.numeric)]
corrplot(cor(numeric_cols), method = "color",
         tl.cex = 0.3, number.cex = 0.2,
         col = colorRampPalette(c("blue", "white", "red"))(200),
         type = "full", addCoef.col = "black")

#--Boxplot semua variabel numerik terhadap fetal_health
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))
for (col in num_vars) {
  boxplot(data[[col]] ~ data$fetal_health,
          main = paste("Boxplot:", col), xlab = "Fetal Health", ylab = col)
}

#--distribusi
par(mfrow = c(3, 3))

for (col in num_vars) {
  plot(density(data[[col]]), main = paste("Density:", col),
       xlab = col, col = "blue", lwd = 2)
}

#--Fungsi untuk winsorizing menggunakan IQR
winsorize_iqr <- function(dataset) {
  for (col in names(dataset)) {
    if (is.numeric(dataset[[col]])) {
      Q1 <- quantile(dataset[[col]], 0.25, na.rm = TRUE)
      Q3 <- quantile(dataset[[col]], 0.75, na.rm = TRUE)
      IQR_value <- Q3 - Q1
      lower_bound <- Q1 - 1.5 * IQR_value
      upper_bound <- Q3 + 1.5 * IQR_value
      dataset[[col]][dataset[[col]] < lower_bound] <- lower_bound
      dataset[[col]][dataset[[col]] > upper_bound] <- upper_bound
    }
  }
  return(dataset)
}

#--Terapkan winsorization pada dataset
data <- winsorize_iqr(data)


#--visualisasi setelah penanganan outliers
par(mfrow = c(3, 3))

for (col in num_vars) {
  boxplot(data[[col]],
          main = paste("Sesudah -", col),
          col = "lightblue", border = "black")
  
  
}

# =========== 4. Klasifikasi: Multinomial Logistic Regression ========= #
library(nnet)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
set.seed(123)

#--Split data: 80% train, 20% test
splitIndex <- createDataPartition(data$fetal_health, p = 0.8, list = FALSE)
train_data <- data[splitIndex, ]
test_data <- data[-splitIndex, ]

#--Pastikan target adalah faktor
train_data$fetal_health <- as.factor(train_data$fetal_health)
test_data$fetal_health <- as.factor(test_data$fetal_health)

#--Model multinomial logistic regression
model_multi <- multinom(fetal_health ~ ., data = train_data)
## # weights:  66 (42 variable)
## initial  value 1857.753380 
## iter  10 value 671.586762
## iter  20 value 624.952705
## iter  30 value 523.283409
## iter  40 value 506.165191
## iter  50 value 455.069285
## iter  60 value 446.665918
## iter  70 value 416.834690
## iter  80 value 410.271349
## iter  90 value 410.007417
## iter 100 value 410.005957
## final  value 410.005957 
## stopped after 100 iterations
#--Prediksi
prediksi <- predict(model_multi, newdata = test_data)

#--Confusion Matrix
confusion_matrix <- table(Predicted = prediksi, Actual = test_data$fetal_health)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##          Actual
## Predicted   1   2   3
##         1 313  15   2
##         2  15  41   5
##         3   1   2  28
#--visualisasi cf
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
#--Buat dataframe dari confusion matrix
cm_df <- as.data.frame(confusion_matrix)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")

#--Visualisasi heatmap confusion matrix
ggplot(data = cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", color = "black") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix", x = "Actual Label", y = "Predicted Label") +
  theme_minimal()


#--Akurasi
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Akurasi:", round(accuracy * 100, 2), "%\n")
## Akurasi: 90.52 %
#--Precision, Recall, F1 per kelas
precision <- diag(confusion_matrix) / colSums(confusion_matrix)
recall <- diag(confusion_matrix) / rowSums(confusion_matrix)
f1 <- 2 * precision * recall / (precision + recall)

#--macro
macro_precision <- mean(precision, na.rm = TRUE)
macro_recall <- mean(recall, na.rm = TRUE)
macro_f1 <- mean(f1, na.rm = TRUE)

cat("Macro Precision:", round(macro_precision, 4), "\n")
## Macro Precision: 0.8194
cat("Macro Recall:", round(macro_recall, 4), "\n")
## Macro Recall: 0.8413
cat("Macro F1 Score:", round(macro_f1, 4), "\n")
## Macro F1 Score: 0.8292
#===================================== 5. perceptual mapping=======================#
#--Ambil data numerik
numeric_features <- data[, sapply(data, is.numeric)]

#--Hitung jarak antar data
distance_matrix <- dist(scale(numeric_features))  # penting: dist perlu data diskalakan

#--Jalankan MDS
mds_result <- cmdscale(distance_matrix, k = 2)

#--Buat dataframe untuk visualisasi
mds_df <- data.frame(
  Dim1 = mds_result[,1],
  Dim2 = mds_result[,2],
  Label = data$fetal_health
)

#--Visualisasi MDS
ggplot(mds_df, aes(x = Dim1, y = Dim2, color = Label)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(title = "Perceptual Mapping (MDS)", x = "Dimensi 1", y = "Dimensi 2") +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")