# ================================== 1. PREPROCESSING =====================================#
data <- read.csv("fetal_health.csv")

#---Cek missing value
missing_data <- sapply(data, function(x) sum(is.na(x)))
print("Missing Values Count:")
## [1] "Missing Values Count:"
print(missing_data)
##                                         baseline.value 
##                                                      0 
##                                          accelerations 
##                                                      0 
##                                         fetal_movement 
##                                                      0 
##                                   uterine_contractions 
##                                                      0 
##                                    light_decelerations 
##                                                      0 
##                                   severe_decelerations 
##                                                      0 
##                               prolongued_decelerations 
##                                                      0 
##                        abnormal_short_term_variability 
##                                                      0 
##                   mean_value_of_short_term_variability 
##                                                      0 
## percentage_of_time_with_abnormal_long_term_variability 
##                                                      0 
##                    mean_value_of_long_term_variability 
##                                                      0 
##                                        histogram_width 
##                                                      0 
##                                          histogram_min 
##                                                      0 
##                                          histogram_max 
##                                                      0 
##                              histogram_number_of_peaks 
##                                                      0 
##                             histogram_number_of_zeroes 
##                                                      0 
##                                         histogram_mode 
##                                                      0 
##                                         histogram_mean 
##                                                      0 
##                                       histogram_median 
##                                                      0 
##                                     histogram_variance 
##                                                      0 
##                                     histogram_tendency 
##                                                      0 
##                                           fetal_health 
##                                                      0
#---Cek dan hapus duplikat
duplicate_rows <- sum(duplicated(data))
cat("Jumlah duplikat dalam dataset: ", duplicate_rows, "\n")
## Jumlah duplikat dalam dataset:  13
data <- data[!duplicated(data), ]
cat("Dataset setelah menghapus duplikat: ", nrow(data), "baris\n")
## Dataset setelah menghapus duplikat:  2113 baris
#---Ubah target menjadi faktor
data$fetal_health <- as.factor(data$fetal_health)

#---Handling Outliers
num_vars <- names(data)[sapply(data, is.numeric)]

winsorize_iqr <- function(dataset) {
  for (col in names(dataset)) {
    if (is.numeric(dataset[[col]])) {
      Q1 <- quantile(dataset[[col]], 0.25, na.rm = TRUE)
      Q3 <- quantile(dataset[[col]], 0.75, na.rm = TRUE)
      IQR_value <- Q3 - Q1
      lower_bound <- Q1 - 1.5 * IQR_value
      upper_bound <- Q3 + 1.5 * IQR_value
      dataset[[col]][dataset[[col]] < lower_bound] <- lower_bound
      dataset[[col]][dataset[[col]] > upper_bound] <- upper_bound
    }
  }
  return(dataset)
}

#---Terapkan winsorization pada dataset
data <- winsorize_iqr(data)

#================================== 2. EDA ========================================#

cat("Statistika Deskriptif:\n")
## Statistika Deskriptif:
summary(data)
##  baseline.value  accelerations      fetal_movement     uterine_contractions
##  Min.   :106.0   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000    
##  1st Qu.:126.0   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.002000    
##  Median :133.0   Median :0.002000   Median :0.000000   Median :0.005000    
##  Mean   :133.3   Mean   :0.003177   Mean   :0.001747   Mean   :0.004387    
##  3rd Qu.:140.0   3rd Qu.:0.006000   3rd Qu.:0.003000   3rd Qu.:0.007000    
##  Max.   :160.0   Max.   :0.015000   Max.   :0.007500   Max.   :0.014500    
##  light_decelerations severe_decelerations prolongued_decelerations
##  Min.   :0.000000    Min.   :0            Min.   :0               
##  1st Qu.:0.000000    1st Qu.:0            1st Qu.:0               
##  Median :0.000000    Median :0            Median :0               
##  Mean   :0.001744    Mean   :0            Mean   :0               
##  3rd Qu.:0.003000    3rd Qu.:0            3rd Qu.:0               
##  Max.   :0.007500    Max.   :0            Max.   :0               
##  abnormal_short_term_variability mean_value_of_short_term_variability
##  Min.   :12.00                   Min.   :0.200                       
##  1st Qu.:32.00                   1st Qu.:0.700                       
##  Median :49.00                   Median :1.200                       
##  Mean   :46.99                   Mean   :1.302                       
##  3rd Qu.:61.00                   3rd Qu.:1.700                       
##  Max.   :87.00                   Max.   :3.200                       
##  percentage_of_time_with_abnormal_long_term_variability
##  Min.   : 0.000                                        
##  1st Qu.: 0.000                                        
##  Median : 0.000                                        
##  Mean   : 6.631                                        
##  3rd Qu.:11.000                                        
##  Max.   :27.500                                        
##  mean_value_of_long_term_variability histogram_width  histogram_min   
##  Min.   : 0.00                       Min.   :  3.00   Min.   : 50.00  
##  1st Qu.: 4.60                       1st Qu.: 37.00   1st Qu.: 67.00  
##  Median : 7.40                       Median : 68.00   Median : 93.00  
##  Mean   : 7.98                       Mean   : 70.54   Mean   : 93.56  
##  3rd Qu.:10.80                       3rd Qu.:100.00   3rd Qu.:120.00  
##  Max.   :20.10                       Max.   :180.00   Max.   :159.00  
##  histogram_max   histogram_number_of_peaks histogram_number_of_zeroes
##  Min.   :122.0   Min.   : 0.00             Min.   :0                 
##  1st Qu.:152.0   1st Qu.: 2.00             1st Qu.:0                 
##  Median :162.0   Median : 4.00             Median :0                 
##  Mean   :163.9   Mean   : 4.06             Mean   :0                 
##  3rd Qu.:174.0   3rd Qu.: 6.00             3rd Qu.:0                 
##  Max.   :207.0   Max.   :12.00             Max.   :0                 
##  histogram_mode  histogram_mean  histogram_median histogram_variance
##  Min.   :100.5   Min.   : 95.0   Min.   :100.5    Min.   : 0.00     
##  1st Qu.:129.0   1st Qu.:125.0   1st Qu.:129.0    1st Qu.: 2.00     
##  Median :139.0   Median :136.0   Median :139.0    Median : 7.00     
##  Mean   :137.9   Mean   :134.8   Mean   :138.2    Mean   :15.66     
##  3rd Qu.:148.0   3rd Qu.:145.0   3rd Qu.:148.0    3rd Qu.:24.00     
##  Max.   :176.5   Max.   :175.0   Max.   :176.5    Max.   :57.00     
##  histogram_tendency fetal_health
##  Min.   :-1.0000    1:1646      
##  1st Qu.: 0.0000    2: 292      
##  Median : 0.0000    3: 175      
##  Mean   : 0.3185                
##  3rd Qu.: 1.0000                
##  Max.   : 1.0000
#---Distribusi variabel target (fetal_health)
cat("\nDistribusi Target (fetal_health):\n")
## 
## Distribusi Target (fetal_health):
print(table(data$fetal_health))
## 
##    1    2    3 
## 1646  292  175
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
ggplot(data, aes(x = factor(fetal_health))) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribusi Kesehatan Janin (fetal_health)", x = "Kategori Kesehatan", y = "Jumlah") +
  theme_minimal()

#---korelasi
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
numeric_cols <- data[, sapply(data, is.numeric)]
corrplot(cor(numeric_cols), method = "color",
         tl.cex = 0.3, number.cex = 0.2,
         col = colorRampPalette(c("blue", "white", "red"))(200),
         type = "full", addCoef.col = "black")
## Warning in cor(numeric_cols): the standard deviation is zero

#---Boxplot semua variabel numerik terhadap fetal_health
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))
for (col in num_vars) {
  boxplot(data[[col]] ~ data$fetal_health,
          main = paste("Boxplot:", col), xlab = "Fetal Health", ylab = col)
}

#--distribusi
par(mfrow = c(3, 3))

for (col in num_vars) {
  plot(density(data[[col]]), main = paste("Density:", col),
       xlab = col, col = "blue", lwd = 2)
}

#---visualisasi setelah penanganan outliers
num_vars <- names(data)[sapply(data, is.numeric)]
par(mfrow = c(3, 3))

for (col in num_vars) {
  boxplot(data[[col]],
          main = paste("Sesudah -", col),
          col = "lightblue", border = "black")
  
  
}

# ================================== 3. FA (Factor Analysis) ===================================== #
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#---Ambil hanya prediktor numerik (exclude target dan faktor)
numeric_predictors <- data %>% dplyr::select_if(is.numeric)
non_constant_predictors <- numeric_predictors[, apply(numeric_predictors, 2, var) != 0]

#---Standardisasi data
scale_data <- scale(non_constant_predictors)

#---Matriks kovarians dan eigen decomposition (PCA manual)
varcov <- cov(scale_data)
pc <- eigen(varcov)

#---Nilai eigen dan vektor eigen
pc$values
##  [1] 5.939993932 3.557440003 1.704753117 1.457227786 1.324010955 0.936299532
##  [7] 0.642214015 0.569947642 0.479736326 0.343847479 0.284374283 0.238523068
## [13] 0.204055256 0.132892799 0.111883040 0.050850244 0.020599614 0.001350909
pc$vectors
##               [,1]         [,2]        [,3]         [,4]          [,5]
##  [1,]  0.234431416  0.315676136 -0.20201148 -0.115138988  0.1991823629
##  [2,] -0.070824259  0.272717161  0.28700292 -0.155436045 -0.4352342682
##  [3,] -0.040203863  0.074308236 -0.30855477  0.332281463 -0.4809264528
##  [4,] -0.133631531  0.002588429  0.28165534 -0.447501040  0.3185520525
##  [5,] -0.289490223  0.025848559 -0.23887107 -0.199333019  0.3099524669
##  [6,]  0.164511467 -0.074296498 -0.53788417 -0.123192736 -0.0422251407
##  [7,] -0.341174683  0.102206106  0.07625616 -0.044899480  0.0131125960
##  [8,]  0.253530603 -0.094052534 -0.35143759  0.020656490  0.1302886700
##  [9,]  0.006577825  0.040958471  0.35119314  0.561052485  0.1653114830
## [10,] -0.326709318  0.260749930 -0.11357313  0.109283693  0.0002954748
## [11,]  0.346493749 -0.112573322  0.10867903 -0.256824452 -0.1247929485
## [12,] -0.135554623  0.394492714 -0.06021548 -0.181069184 -0.2058000159
## [13,] -0.262221752  0.224234661 -0.17860874  0.142520224 -0.0031618569
## [14,]  0.239382487  0.398902853  0.02336186 -0.046540512  0.0782915146
## [15,]  0.296389066  0.344171483  0.11332132  0.003603282 -0.0198242978
## [16,]  0.261550799  0.396796488  0.03182560 -0.042918369  0.0355921834
## [17,] -0.314269787  0.156878531 -0.16078401 -0.154624485  0.0622390556
## [18,]  0.052012640  0.216986283 -0.05250994  0.347638234  0.4756083660
##               [,6]        [,7]        [,8]          [,9]        [,10]
##  [1,] -0.184512237  0.21346024  0.10384131 -0.1823259868  0.146739862
##  [2,]  0.422557805 -0.28828834 -0.02014202  0.0173361320 -0.153516571
##  [3,]  0.135974612  0.62755349 -0.28741949  0.1987439864 -0.052592552
##  [4,] -0.012583172  0.19404714 -0.69612278  0.2333600872 -0.091078294
##  [5,]  0.056203434  0.26761677  0.26944058 -0.0785377391  0.113397957
##  [6,]  0.020641519 -0.21342686 -0.45139706 -0.4998063612 -0.094856541
##  [7,] -0.035486401  0.25203085  0.14649780  0.0559898916  0.096097935
##  [8,] -0.165379778 -0.22772769  0.07203104  0.6855710159 -0.346729120
##  [9,] -0.457781193  0.05543522 -0.17694017 -0.1778053335 -0.232080793
## [10,] -0.103162212 -0.19722616 -0.05774138 -0.0617307381 -0.150482236
## [11,] -0.116740095  0.19551129  0.05594176  0.0432026399  0.152885263
## [12,] -0.409608346 -0.09430314 -0.02027137 -0.0546977529 -0.081387464
## [13,] -0.129354696 -0.30427392 -0.20921095  0.3106494666  0.627628635
## [14,]  0.045195060  0.12144374  0.02863137  0.0738071301 -0.007002607
## [15,] -0.011773862 -0.01517565  0.01612687  0.0333113458 -0.029803115
## [16,]  0.018993248  0.05518327  0.01608572  0.0035119445  0.019281301
## [17,]  0.004905361  0.08088933  0.16670468  0.0006084543 -0.538659582
## [18,]  0.561823888 -0.03254123 -0.08675391 -0.0051829996 -0.025317734
##              [,11]        [,12]         [,13]       [,14]         [,15]
##  [1,] -0.180363160 -0.023600013  0.2542384320  0.68202755 -0.0828008956
##  [2,] -0.027222732  0.054733932 -0.3487539142  0.37746708 -0.1750950498
##  [3,]  0.057835517 -0.105690030 -0.0001919315  0.04366578 -0.0455384282
##  [4,] -0.009197949 -0.090906464  0.0256227293  0.07723994  0.0197313618
##  [5,]  0.268715703 -0.267365890 -0.6263906917  0.02275780 -0.1322662648
##  [6,] -0.118214248  0.248508443 -0.2564736833 -0.07773428  0.0366081184
##  [7,] -0.708046998  0.449565804 -0.1710234642 -0.10857260  0.0591105485
##  [8,] -0.204274792 -0.007602557 -0.2254714613  0.09756041 -0.0811208709
##  [9,]  0.106800352  0.165492570 -0.3462697064  0.13804217 -0.1179314599
## [10,] -0.142953789 -0.341984966  0.0847897897 -0.03775593  0.1371303483
## [11,]  0.130000653  0.292146323 -0.0468321569 -0.19306649 -0.4767971297
## [12,] -0.088276694 -0.241501779  0.1128732875 -0.35625009 -0.4638679451
## [13,]  0.282368933  0.310969945 -0.0090699899  0.08997137 -0.0174873128
## [14,]  0.113266835  0.095894782 -0.1101830578 -0.33427267  0.4838202151
## [15,]  0.084137422 -0.035838338 -0.0865875355  0.01189619  0.0679971757
## [16,]  0.001445312  0.023779671 -0.1253595523 -0.11117907  0.1324633991
## [17,]  0.409015896  0.494719583  0.2713377773  0.06075635 -0.0005140677
## [18,] -0.102884356  0.033093202  0.1541482565 -0.18952860 -0.4464680441
##               [,16]         [,17]         [,18]
##  [1,]  1.890667e-01 -0.0302745362 -0.0088031031
##  [2,]  2.003681e-01  0.0073362787 -0.0018662891
##  [3,] -1.609119e-02 -0.0023420877 -0.0026175892
##  [4,] -2.368642e-02 -0.0002429731  0.0005551159
##  [5,] -2.722990e-02 -0.0380646660 -0.0001182228
##  [6,] -3.300720e-02 -0.0510703388  0.0025338796
##  [7,] -1.120473e-01 -0.0771994378 -0.0074377618
##  [8,]  5.030988e-02  0.0091938249  0.0009956713
##  [9,]  1.053868e-01  0.0254230637  0.0010024972
## [10,] -1.344246e-06  0.0368812724 -0.7449356274
## [11,]  9.397018e-02  0.0118552108 -0.5618534911
## [12,]  1.038270e-01 -0.0526394973  0.3550228804
## [13,] -1.749595e-02 -0.0015716126 -0.0020459091
## [14,]  5.704821e-01 -0.2148775540 -0.0039844973
## [15,] -6.595388e-01 -0.5659045067 -0.0442671001
## [16,] -3.214611e-01  0.7844143632  0.0337668745
## [17,] -9.216216e-02  0.0235826920 -0.0083858401
## [18,]  5.544875e-02 -0.0414717440  0.0028755687
#---Scree Plot manual
plot(pc$values, type="b", main="Scree Plot", xlab="Number of Factors", ylab="Eigenvalues")

#---Proporsi kumulatif
cumprop <- cumsum(pc$values) / sum(pc$values)
cumprop
##  [1] 0.3299997 0.5276352 0.6223437 0.7033008 0.7768570 0.8288736 0.8645522
##  [8] 0.8962159 0.9228680 0.9419706 0.9577692 0.9710205 0.9823569 0.9897398
## [15] 0.9959555 0.9987805 0.9999249 1.0000000
#---Jumlah faktor berdasarkan threshold kumulatif >= 0.8
n_factors <- which(cumprop >= 0.80)[1]
cat("Jumlah faktor berdasarkan threshold kumulatif >= 0.80:", n_factors, "\n")
## Jumlah faktor berdasarkan threshold kumulatif >= 0.80: 6
#---Menghitung loading matrix manual (tanpa rotasi)
L <- matrix(nrow = nrow(pc$vectors), ncol = n_factors)
for (i in 1:n_factors) {
  L[, i] <- sqrt(pc$values[i]) * pc$vectors[, i]
}
colnames(L) <- paste0("F", 1:n_factors)
rownames(L) <- colnames(scale_data)
print(L)
##                                                                 F1           F2
## baseline.value                                          0.57135866  0.595402349
## accelerations                                          -0.17261361  0.514376666
## fetal_movement                                         -0.09798527  0.140154080
## uterine_contractions                                   -0.32568814  0.004882082
## light_decelerations                                    -0.70554855  0.048753424
## abnormal_short_term_variability                         0.40094904 -0.140131940
## mean_value_of_short_term_variability                   -0.83151444  0.192772746
## percentage_of_time_with_abnormal_long_term_variability  0.61790739 -0.177394149
## mean_value_of_long_term_variability                     0.01603154  0.077252497
## histogram_width                                        -0.79625931  0.491805061
## histogram_min                                           0.84447813 -0.212326535
## histogram_max                                          -0.33037512  0.744059693
## histogram_number_of_peaks                              -0.63908955  0.422932964
## histogram_mode                                          0.58342546  0.752377734
## histogram_mean                                          0.72236248  0.649147928
## histogram_median                                        0.63745430  0.748404882
## histogram_variance                                     -0.76594155  0.295891375
## histogram_tendency                                      0.12676574  0.409261671
##                                                                 F3           F4
## baseline.value                                         -0.26375870 -0.138990823
## accelerations                                           0.37472880 -0.187635695
## fetal_movement                                         -0.40286823  0.401115861
## uterine_contractions                                    0.36774666 -0.540203969
## light_decelerations                                    -0.31188487 -0.240626229
## abnormal_short_term_variability                        -0.70229489 -0.148712961
## mean_value_of_short_term_variability                    0.09956477 -0.054200717
## percentage_of_time_with_abnormal_long_term_variability -0.45885869  0.024935624
## mean_value_of_long_term_variability                     0.45853952  0.677278380
## histogram_width                                        -0.14828811  0.131922565
## histogram_min                                           0.14189807 -0.310027410
## histogram_max                                          -0.07862107 -0.218578915
## histogram_number_of_peaks                              -0.23320263  0.172044272
## histogram_mode                                          0.03050269 -0.056181700
## histogram_mean                                          0.14795933  0.004349727
## histogram_median                                        0.04155348 -0.051809205
## histogram_variance                                     -0.20992956 -0.186656015
## histogram_tendency                                     -0.06856024  0.419653894
##                                                                  F5
## baseline.value                                          0.229190530
## accelerations                                          -0.500805248
## fetal_movement                                         -0.553381269
## uterine_contractions                                    0.366544069
## light_decelerations                                     0.356648898
## abnormal_short_term_variability                        -0.048586643
## mean_value_of_short_term_variability                    0.015088097
## percentage_of_time_with_abnormal_long_term_variability  0.149917538
## mean_value_of_long_term_variability                     0.190216774
## histogram_width                                         0.000339990
## histogram_min                                          -0.143593848
## histogram_max                                          -0.236805177
## histogram_number_of_peaks                              -0.003638212
## histogram_mode                                          0.090086660
## histogram_mean                                         -0.022810962
## histogram_median                                        0.040954386
## histogram_variance                                      0.071615789
## histogram_tendency                                      0.547261976
##                                                                  F6
## baseline.value                                         -0.178538786
## accelerations                                           0.408877800
## fetal_movement                                          0.131572532
## uterine_contractions                                   -0.012175801
## light_decelerations                                     0.054383888
## abnormal_short_term_variability                         0.019973264
## mean_value_of_short_term_variability                   -0.034337554
## percentage_of_time_with_abnormal_long_term_variability -0.160025726
## mean_value_of_long_term_variability                    -0.442960856
## histogram_width                                        -0.099822409
## histogram_min                                          -0.112960719
## histogram_max                                          -0.396347570
## histogram_number_of_peaks                              -0.125166931
## histogram_mode                                          0.043731902
## histogram_mean                                         -0.011392692
## histogram_median                                        0.018378355
## histogram_variance                                      0.004746554
## histogram_tendency                                      0.543635243
#---FA tanpa rotasi menggunakan `psych::fa`
fa_result <- fa(r = scale_data, covar = TRUE, nfactors = n_factors, rotate = "varimax", scores = "regression")

#---Ambil factor loadings
load_no_rotation <- fa_result$loadings
print(load_no_rotation)
## 
## Loadings:
##                                                        MR1    MR2    MR3   
## baseline.value                                                 0.838 -0.267
## accelerations                                           0.169  0.205  0.955
## fetal_movement                                          0.166              
## uterine_contractions                                    0.187         0.100
## light_decelerations                                     0.681 -0.246 -0.166
## abnormal_short_term_variability                        -0.222  0.123 -0.360
## mean_value_of_short_term_variability                    0.719 -0.270  0.210
## percentage_of_time_with_abnormal_long_term_variability -0.424  0.214 -0.439
## mean_value_of_long_term_variability                                        
## histogram_width                                         0.954         0.152
## histogram_min                                          -0.848  0.320       
## histogram_max                                           0.681  0.475  0.182
## histogram_number_of_peaks                               0.740              
## histogram_mode                                         -0.106  0.930       
## histogram_mean                                         -0.310  0.911  0.150
## histogram_median                                       -0.157  0.970  0.110
## histogram_variance                                      0.786 -0.135       
## histogram_tendency                                      0.123  0.342       
##                                                        MR4    MR5    MR6   
## baseline.value                                          0.127              
## accelerations                                           0.102              
## fetal_movement                                                 0.466       
## uterine_contractions                                          -0.593       
## light_decelerations                                     0.273 -0.319  0.104
## abnormal_short_term_variability                         0.442  0.355       
## mean_value_of_short_term_variability                          -0.247       
## percentage_of_time_with_abnormal_long_term_variability  0.198  0.255       
## mean_value_of_long_term_variability                    -0.855              
## histogram_width                                        -0.122  0.116       
## histogram_min                                           0.146 -0.117 -0.301
## histogram_max                                                        -0.508
## histogram_number_of_peaks                                      0.148       
## histogram_mode                                                        0.135
## histogram_mean                                         -0.120              
## histogram_median                                                           
## histogram_variance                                      0.163 -0.150       
## histogram_tendency                                                    0.744
## 
##                  MR1   MR2   MR3   MR4   MR5   MR6
## SS loadings    4.710 4.048 1.519 1.170 1.018 0.959
## Proportion Var 0.262 0.225 0.084 0.065 0.057 0.053
## Cumulative Var 0.262 0.487 0.571 0.636 0.692 0.746
#---Visualisasi loading faktor tanpa rotasi
plot(load_no_rotation[, c(1,2)], type = "n", main = "Plot Faktor Tanpa Rotasi")
text(load_no_rotation[, c(1,2)], labels = rownames(load_no_rotation), cex = 0.7)

#---Diagram faktor (tanpa rotasi)
fa.diagram(fa_result)

#---Ambil skor faktornya
fa_scores <- as.data.frame(fa_result$scores)
colnames(fa_scores) <- paste0("F", 1:n_factors)

if (nrow(fa_scores) != nrow(data)) {
  stop("Jumlah baris fa_scores dan data$fetal_health berbeda! Cek missing values di scale_data.")
}
#---Gabungkan kembali dengan target
data_fa <- cbind(fa_scores, fetal_health = data$fetal_health)

#---cek kelas
print(table(data_fa$fetal_health))
## 
##    1    2    3 
## 1646  292  175
#================================= 4. Klasifikasi LDA =======================================#
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## ---
## biotools version 4.3
#---Uji Normalitas Multivariat per kelas (Mardia)
cat("\n=== Uji Normalitas Multivariat (Mardia) ===\n")
## 
## === Uji Normalitas Multivariat (Mardia) ===
for (kelas in unique(data_fa$fetal_health)) {
  cat("\nKelas:", kelas, "\n")
  hasil_mvn <- mvn(data_fa[data_fa$fetal_health == kelas, -which(names(data_fa) == "fetal_health")],
                   mvnTest = "mardia", multivariatePlot = "none")
  print(hasil_mvn$multivariateNormality)
}
## 
## Kelas: 2 
##              Test        Statistic               p value Result
## 1 Mardia Skewness 1349.72566924878 1.90693331347686e-245     NO
## 2 Mardia Kurtosis 33.2641935630372                     0     NO
## 3             MVN             <NA>                  <NA>     NO
## 
## Kelas: 1 
##              Test        Statistic              p value Result
## 1 Mardia Skewness 1745.93628624914                    0     NO
## 2 Mardia Kurtosis 4.20548023971821 2.60528352138412e-05     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## Kelas: 3 
##              Test        Statistic              p value Result
## 1 Mardia Skewness 506.064097862599  1.0165114564989e-73     NO
## 2 Mardia Kurtosis 4.59851389839783 4.25515332391768e-06     NO
## 3             MVN             <NA>                 <NA>     NO
#---Uji Homogenitas Varians-Kovarians (Box's M Test)
cat("\n=== Uji Homogenitas Varians-Kovarians (Box's M Test) ===\n")
## 
## === Uji Homogenitas Varians-Kovarians (Box's M Test) ===
boxm_result <- boxM(data_fa[, -which(names(data_fa) == "fetal_health")], data_fa$fetal_health)
print(boxm_result)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_fa[, -which(names(data_fa) == "fetal_health")]
## Chi-Sq (approx.) = 1715.4, df = 42, p-value < 2.2e-16
#---Pastikan labelnya bertipe faktor
data_fa$fetal_health <- as.factor(data_fa$fetal_health)


#---lda model
lda_model <- lda(fetal_health ~ ., data = data_fa)

#---uji signifikansi pakai wilks lambda
manova_lda <- manova(as.matrix(data_fa[, -which(names(data_fa) == "fetal_health")]) ~ data_fa$fetal_health)
summary_manova <- summary(manova_lda, test = "Wilks")
cat("\n=== Uji Signifikansi Model LDA (Wilks' Lambda) ===\n")
## 
## === Uji Signifikansi Model LDA (Wilks' Lambda) ===
print(summary_manova)
##                        Df   Wilks approx F num Df den Df    Pr(>F)    
## data_fa$fetal_health    2 0.45808   167.53     12   4210 < 2.2e-16 ***
## Residuals            2110                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#---Uji signifikansi Variabel (Koefisien Diskriminan)
print(round(lda_model$scaling, 4))
##        LD1     LD2
## F1 -0.0962  0.4722
## F2 -0.1004 -0.9002
## F3 -0.9254  0.2128
## F4  0.6719  0.4047
## F5  0.7186 -0.2573
## F6 -0.3361 -0.1965
#---prediksi pada data yang sama
lda_pred <- predict(lda_model, newdata = data_fa)

#---confusion
confusion_matrix <- table(Predicted = lda_pred$class, Actual = data_fa$fetal_health)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
##          Actual
## Predicted    1    2    3
##         1 1551   97   27
##         2   57  192   49
##         3   38    3   99
#---heatmap
conf_mat_table <- table(Predicted = lda_pred$class, Actual = data_fa$fetal_health)
conf_mat_df <- as.data.frame(conf_mat_table)

ggplot(conf_mat_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 5, color = "black") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(title = "Heatmap Confusion Matrix (LDA - In Sample)",
       x = "Actual Class", y = "Predicted Class") +
  theme_minimal()

accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix) * 100
cat("Akurasi:", round(accuracy, 2), "%\n")
## Akurasi: 87.17 %
#===================================== 5. klasifikasi mlr =================================#
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(caret)
library(ggplot2)
library(reshape2)


#---VIF
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(dplyr)

#---Buat model VIF hanya dari prediktor, tanpa pake target 
predictors_only <- dplyr::select(data_fa, -fetal_health)

model_vif <- lm(rep(1, nrow(predictors_only)) ~ ., data = predictors_only)
vif_values <- car::vif(model_vif)

print("VIF antar prediktor:")
## [1] "VIF antar prediktor:"
print(vif_values)
##       F1       F2       F3       F4       F5       F6 
## 1.000918 1.001370 1.000896 1.002136 1.002308 1.002018
#---Klasikasi Multinomial Regresion
set.seed(123)
data_fa$fetal_health <- as.factor(data_fa$fetal_health)

#--Latih model multinomial logistic regression pada seluruh data
model_multi <- multinom(fetal_health ~ ., data = data_fa)
## # weights:  24 (14 variable)
## initial  value 2321.367766 
## iter  10 value 707.367031
## iter  20 value 636.637277
## final  value 633.481883 
## converged
#--Prediksi pada data yang sama (in-sample)
prediksi <- predict(model_multi, newdata = data_fa)

# --- Uji Serentak ---
model_null <- multinom(fetal_health ~ 1, data = data_fa)
## # weights:  6 (2 variable)
## initial  value 2321.367766 
## iter  10 value 1424.944851
## iter  10 value 1424.944848
## iter  10 value 1424.944848
## final  value 1424.944848 
## converged
lrt_stat <- 2 * (logLik(model_multi) - logLik(model_null))
df_diff <- attr(logLik(model_multi), "df") - attr(logLik(model_null), "df")
p_value <- pchisq(lrt_stat, df = df_diff, lower.tail = FALSE)

cat("Uji Serentak (Likelihood Ratio Test):\n")
## Uji Serentak (Likelihood Ratio Test):
cat("Statistik LRT =", round(lrt_stat, 3), "\n")
## Statistik LRT = 1582.926
cat("Derajat kebebasan =", df_diff, "\n")
## Derajat kebebasan = 12
cat("P-value =", p_value, "\n")
## P-value = 0
# --- Uji Parsial ---
summary_model <- summary(model_multi)

coefs <- summary_model$coefficients
std_err <- summary_model$standard.errors

z_values <- coefs / std_err
p_values <- 2 * (1 - pnorm(abs(z_values)))

cat("\nUji Parsial (Wald Test) untuk tiap koefisien:\n")
## 
## Uji Parsial (Wald Test) untuk tiap koefisien:
print(round(p_values, 4))
##   (Intercept) F1 F2 F3     F4 F5 F6
## 2           0  0  0  0 0.0025  0  0
## 3           0  0  0  0 0.0000  0  0
# Confusion Matrix
confusion_matrix <- table(Predicted = prediksi, Actual = data_fa$fetal_health)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
##          Actual
## Predicted    1    2    3
##         1 1559   81   29
##         2   53  196   35
##         3   34   15  111
#--Visualisasi confusion matrix sebagai heatmap
cm_df <- as.data.frame(confusion_matrix)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")

ggplot(data = cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", color = "black") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix (Multinomial Logistic Regression - In Sample)",
       x = "Actual Label", y = "Predicted Label") +
  theme_minimal()

#--Hitung akurasi
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Akurasi:", round(accuracy * 100, 2), "%\n")
## Akurasi: 88.31 %
#---Interpretasi menggunakan odds ratio
odds_ratios <- exp(coefs)
cat("\n=== Odds Ratio ===\n")
## 
## === Odds Ratio ===
print(round(odds_ratios, 3))
##   (Intercept)    F1    F2    F3    F4    F5    F6
## 2       0.015 0.418 1.827 0.038 1.495 4.328 0.658
## 3       0.001 0.522 0.168 0.005 7.054 3.798 0.466