Script Metode Discriminant Analysis Principal Component Analysis

Import library

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(ggpubr)
library(tidyr)
library(ggcorrplot)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.4     ✔ tibble    3.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(corrplot)

## corrplot 0.95 loaded

library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Membaca dataset

df <- read.csv("C:/Users/FIRA/Downloads/alzheimers_disease_data (1).csv")

str(df)

## 'data.frame':    2149 obs. of  35 variables:
##  $ PatientID                : int  4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 ...
##  $ Age                      : int  73 89 73 74 89 86 68 75 72 87 ...
##  $ Gender                   : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Ethnicity                : int  0 0 3 0 0 1 3 0 1 0 ...
##  $ EducationLevel           : int  2 0 1 1 0 1 2 1 0 0 ...
##  $ BMI                      : num  22.9 26.8 17.8 33.8 20.7 ...
##  $ Smoking                  : int  0 0 0 1 0 0 1 0 0 1 ...
##  $ AlcoholConsumption       : num  13.3 4.54 19.56 12.21 18.45 ...
##  $ PhysicalActivity         : num  6.33 7.62 7.84 8.43 6.31 ...
##  $ DietQuality              : num  1.347 0.519 1.826 7.436 0.795 ...
##  $ SleepQuality             : num  9.03 7.15 9.67 8.39 5.6 ...
##  $ FamilyHistoryAlzheimers  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CardiovascularDisease    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Diabetes                 : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ Depression               : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ HeadInjury               : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Hypertension             : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SystolicBP               : int  142 115 99 118 94 168 143 117 117 130 ...
##  $ DiastolicBP              : int  72 64 116 115 117 62 88 63 119 78 ...
##  $ CholesterolTotal         : num  242 231 284 160 238 ...
##  $ CholesterolLDL           : num  56.2 193.4 153.3 65.4 92.9 ...
##  $ CholesterolHDL           : num  33.7 79 69.8 68.5 56.9 ...
##  $ CholesterolTriglycerides : num  162.2 294.6 83.6 277.6 291.2 ...
##  $ MMSE                     : num  21.46 20.61 7.36 13.99 13.52 ...
##  $ FunctionalAssessment     : num  6.52 7.12 5.9 8.97 6.05 ...
##  $ MemoryComplaints         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BehavioralProblems       : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ ADL                      : num  1.7259 2.5924 7.1195 6.4812 0.0147 ...
##  $ Confusion                : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Disorientation           : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PersonalityChanges       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ DifficultyCompletingTasks: int  1 0 1 0 1 0 0 0 0 0 ...
##  $ Forgetfulness            : int  0 1 0 0 0 0 1 1 0 0 ...
##  $ Diagnosis                : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ DoctorInCharge           : chr  "XXXConfid" "XXXConfid" "XXXConfid" "XXXConfid" ...

Cek duplikasi dan missing value

cat("Jumlah duplikasi:", sum(duplicated(df)), "\n")

## Jumlah duplikasi: 0

print(colSums(is.na(df)))

##                 PatientID                       Age                    Gender 
##                         0                         0                         0 
##                 Ethnicity            EducationLevel                       BMI 
##                         0                         0                         0 
##                   Smoking        AlcoholConsumption          PhysicalActivity 
##                         0                         0                         0 
##               DietQuality              SleepQuality   FamilyHistoryAlzheimers 
##                         0                         0                         0 
##     CardiovascularDisease                  Diabetes                Depression 
##                         0                         0                         0 
##                HeadInjury              Hypertension                SystolicBP 
##                         0                         0                         0 
##               DiastolicBP          CholesterolTotal            CholesterolLDL 
##                         0                         0                         0 
##            CholesterolHDL  CholesterolTriglycerides                      MMSE 
##                         0                         0                         0 
##      FunctionalAssessment          MemoryComplaints        BehavioralProblems 
##                         0                         0                         0 
##                       ADL                 Confusion            Disorientation 
##                         0                         0                         0 
##        PersonalityChanges DifficultyCompletingTasks             Forgetfulness 
##                         0                         0                         0 
##                 Diagnosis            DoctorInCharge 
##                         0                         0

Drop kolom yang tidak perlu

df <- df %>% dplyr::select(-PatientID, -DoctorInCharge)
str(df)

## 'data.frame':    2149 obs. of  33 variables:
##  $ Age                      : int  73 89 73 74 89 86 68 75 72 87 ...
##  $ Gender                   : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Ethnicity                : int  0 0 3 0 0 1 3 0 1 0 ...
##  $ EducationLevel           : int  2 0 1 1 0 1 2 1 0 0 ...
##  $ BMI                      : num  22.9 26.8 17.8 33.8 20.7 ...
##  $ Smoking                  : int  0 0 0 1 0 0 1 0 0 1 ...
##  $ AlcoholConsumption       : num  13.3 4.54 19.56 12.21 18.45 ...
##  $ PhysicalActivity         : num  6.33 7.62 7.84 8.43 6.31 ...
##  $ DietQuality              : num  1.347 0.519 1.826 7.436 0.795 ...
##  $ SleepQuality             : num  9.03 7.15 9.67 8.39 5.6 ...
##  $ FamilyHistoryAlzheimers  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CardiovascularDisease    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Diabetes                 : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ Depression               : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ HeadInjury               : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Hypertension             : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SystolicBP               : int  142 115 99 118 94 168 143 117 117 130 ...
##  $ DiastolicBP              : int  72 64 116 115 117 62 88 63 119 78 ...
##  $ CholesterolTotal         : num  242 231 284 160 238 ...
##  $ CholesterolLDL           : num  56.2 193.4 153.3 65.4 92.9 ...
##  $ CholesterolHDL           : num  33.7 79 69.8 68.5 56.9 ...
##  $ CholesterolTriglycerides : num  162.2 294.6 83.6 277.6 291.2 ...
##  $ MMSE                     : num  21.46 20.61 7.36 13.99 13.52 ...
##  $ FunctionalAssessment     : num  6.52 7.12 5.9 8.97 6.05 ...
##  $ MemoryComplaints         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BehavioralProblems       : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ ADL                      : num  1.7259 2.5924 7.1195 6.4812 0.0147 ...
##  $ Confusion                : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Disorientation           : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PersonalityChanges       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ DifficultyCompletingTasks: int  1 0 1 0 1 0 0 0 0 0 ...
##  $ Forgetfulness            : int  0 1 0 0 0 0 1 1 0 0 ...
##  $ Diagnosis                : int  0 0 0 0 0 0 0 1 0 0 ...

Konversi Diagnosis jadi faktor

df$Diagnosis <- as.factor(df$Diagnosis)
str(df)

## 'data.frame':    2149 obs. of  33 variables:
##  $ Age                      : int  73 89 73 74 89 86 68 75 72 87 ...
##  $ Gender                   : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Ethnicity                : int  0 0 3 0 0 1 3 0 1 0 ...
##  $ EducationLevel           : int  2 0 1 1 0 1 2 1 0 0 ...
##  $ BMI                      : num  22.9 26.8 17.8 33.8 20.7 ...
##  $ Smoking                  : int  0 0 0 1 0 0 1 0 0 1 ...
##  $ AlcoholConsumption       : num  13.3 4.54 19.56 12.21 18.45 ...
##  $ PhysicalActivity         : num  6.33 7.62 7.84 8.43 6.31 ...
##  $ DietQuality              : num  1.347 0.519 1.826 7.436 0.795 ...
##  $ SleepQuality             : num  9.03 7.15 9.67 8.39 5.6 ...
##  $ FamilyHistoryAlzheimers  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CardiovascularDisease    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Diabetes                 : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ Depression               : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ HeadInjury               : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Hypertension             : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SystolicBP               : int  142 115 99 118 94 168 143 117 117 130 ...
##  $ DiastolicBP              : int  72 64 116 115 117 62 88 63 119 78 ...
##  $ CholesterolTotal         : num  242 231 284 160 238 ...
##  $ CholesterolLDL           : num  56.2 193.4 153.3 65.4 92.9 ...
##  $ CholesterolHDL           : num  33.7 79 69.8 68.5 56.9 ...
##  $ CholesterolTriglycerides : num  162.2 294.6 83.6 277.6 291.2 ...
##  $ MMSE                     : num  21.46 20.61 7.36 13.99 13.52 ...
##  $ FunctionalAssessment     : num  6.52 7.12 5.9 8.97 6.05 ...
##  $ MemoryComplaints         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BehavioralProblems       : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ ADL                      : num  1.7259 2.5924 7.1195 6.4812 0.0147 ...
##  $ Confusion                : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Disorientation           : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PersonalityChanges       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ DifficultyCompletingTasks: int  1 0 1 0 1 0 0 0 0 0 ...
##  $ Forgetfulness            : int  0 1 0 0 0 0 1 1 0 0 ...
##  $ Diagnosis                : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...

head(df, 10)

##    Age Gender Ethnicity EducationLevel      BMI Smoking AlcoholConsumption
## 1   73      0         0              2 22.92775       0         13.2972177
## 2   89      0         0              0 26.82768       0          4.5425238
## 3   73      0         3              1 17.79588       0         19.5550845
## 4   74      1         0              1 33.80082       1         12.2092655
## 5   89      0         0              0 20.71697       0         18.4543561
## 6   86      1         1              1 30.62689       0          4.1401438
## 7   68      0         3              2 38.38762       1          0.6460473
## 8   75      0         0              1 18.77601       0         13.7238257
## 9   72      1         1              0 27.83319       0         12.1678476
## 10  87      0         0              0 35.45630       1         16.0286882
##    PhysicalActivity DietQuality SleepQuality FamilyHistoryAlzheimers
## 1         6.3271125   1.3472143     9.025679                       0
## 2         7.6198845   0.5187671     7.151293                       0
## 3         7.8449878   1.8263347     9.673574                       1
## 4         8.4280014   7.4356041     8.392554                       0
## 5         6.3104607   0.7954975     5.597238                       0
## 6         0.2110616   1.5849220     7.261953                       0
## 7         9.2576949   5.8973879     5.477686                       0
## 8         4.6494507   8.3419032     4.213210                       0
## 9         1.5313598   6.7368820     5.748224                       0
## 10        6.4407727   8.0860191     7.551773                       0
##    CardiovascularDisease Diabetes Depression HeadInjury Hypertension SystolicBP
## 1                      0        1          1          0            0        142
## 2                      0        0          0          0            0        115
## 3                      0        0          0          0            0         99
## 4                      0        0          0          0            0        118
## 5                      0        0          0          0            0         94
## 6                      0        1          0          0            0        168
## 7                      0        0          0          1            0        143
## 8                      0        0          0          0            0        117
## 9                      0        0          0          0            1        117
## 10                     1        0          0          0            0        130
##    DiastolicBP CholesterolTotal CholesterolLDL CholesterolHDL
## 1           72         242.3668       56.15090       33.68256
## 2           64         231.1626      193.40800       79.02848
## 3          116         284.1819      153.32276       69.77229
## 4          115         159.5822       65.36664       68.45749
## 5          117         237.6022       92.86970       56.87430
## 6           62         280.7125      198.33463       79.08050
## 7           88         263.7341       52.47067       66.53337
## 8           63         151.3831       69.62351       77.34682
## 9          119         233.6058      144.04574       43.07589
## 10          78         281.6301      130.49758       74.29125
##    CholesterolTriglycerides      MMSE FunctionalAssessment MemoryComplaints
## 1                 162.18914 21.463532             6.518877                0
## 2                 294.63091 20.613267             7.118696                0
## 3                  83.63832  7.356249             5.895077                0
## 4                 277.57736 13.991127             8.965106                0
## 5                 291.19878 13.517609             6.045039                0
## 6                 263.94365 27.517529             5.510144                0
## 7                 216.48917  1.964413             6.062124                0
## 8                 210.57087 10.139568             3.401374                0
## 9                 151.16419 25.820732             7.396061                0
## 10                144.17597 28.388409             1.148904                0
##    BehavioralProblems        ADL Confusion Disorientation PersonalityChanges
## 1                   0 1.72588346         0              0                  0
## 2                   0 2.59242413         0              0                  0
## 3                   0 7.11954774         0              1                  0
## 4                   1 6.48122586         0              0                  0
## 5                   0 0.01469122         0              0                  1
## 6                   0 9.01568628         1              0                  0
## 7                   0 9.23632828         0              0                  0
## 8                   0 4.51724827         1              0                  0
## 9                   1 0.75623181         0              0                  1
## 10                  1 4.55439387         0              0                  0
##    DifficultyCompletingTasks Forgetfulness Diagnosis
## 1                          1             0         0
## 2                          0             1         0
## 3                          1             0         0
## 4                          0             0         0
## 5                          1             0         0
## 6                          0             0         0
## 7                          0             1         0
## 8                          0             1         1
## 9                          0             0         0
## 10                         0             0         0

df$Diagnosis <- factor(df$Diagnosis, levels = c(0,1), labels = c("Tidak Alzheimer", "Alzheimer"))

# Standardize (z-score) every numeric column except Gender.
# Diagnosis is explicitly masked out as well so the outcome is never scaled.
# NOTE(review): the scaling statistics are computed from all rows, before the
# train/test split done later with createDataPartition(), so the held-out rows
# influence the training features. Binary 0/1 indicator columns are also
# z-scored here. Confirm both are intended.
num_cols <- vapply(df, is.numeric, logical(1))
num_cols[c("Diagnosis", "Gender")] <- FALSE
df[num_cols] <- scale(df[num_cols])

ggplot(df, aes(x = Diagnosis)) +
  geom_bar(fill = "lightblue") +
  ggtitle("Distribusi Diagnosis Alzheimer") +
  xlab("Status Diagnosis") + ylab("Jumlah Pasien")

cat("Jumlah tiap kategori:\n")

## Jumlah tiap kategori:

print(table(df$Diagnosis))

## 
## Tidak Alzheimer       Alzheimer 
##            1389             760

label_percentages <- prop.table(table(df$Diagnosis)) * 100
cat("\nPersentase Tiap Label Diagnosis:\n")

## 
## Persentase Tiap Label Diagnosis:

print(round(label_percentages, 2))

## 
## Tidak Alzheimer       Alzheimer 
##           64.63           35.37

# Univariate feature screening: one-way ANOVA of each numeric predictor
# against Diagnosis, keeping predictors with p < 0.05.
# NOTE(review): the screening uses every row of df, i.e. it runs before the
# train/test split further down, so the test rows take part in feature
# selection. Consider screening on the training rows only.

# Candidate predictors: numeric columns that are not constant.
valid_vars <- names(df)[vapply(
  df,
  function(x) is.numeric(x) && length(unique(x)) > 1,
  logical(1)
)]

# p-value of the Diagnosis effect for each candidate. vapply() guarantees a
# named numeric vector even when valid_vars is empty; a model that fails to
# fit (or yields no p-value) is recorded as NA instead of aborting the loop.
anova_pvals <- vapply(valid_vars, function(var) {
  model_formula <- as.formula(paste0("`", var, "` ~ Diagnosis"))
  tryCatch({
    p_value <- summary(aov(model_formula, data = df))[[1]][["Pr(>F)"]][1]
    if (is.null(p_value)) NA_real_ else as.numeric(p_value)
  }, error = function(e) NA_real_)
}, numeric(1))

# Drop NA p-values explicitly: subsetting with an NA condition would otherwise
# produce an NA feature name and break the later column selection.
selected_features <- names(anova_pvals)[!is.na(anova_pvals) & anova_pvals < 0.05]
cat("Fitur terpilih (p < 0.05):\n")

## Fitur terpilih (p < 0.05):

print(selected_features)

## [1] "EducationLevel"       "SleepQuality"         "CholesterolHDL"      
## [4] "MMSE"                 "FunctionalAssessment" "MemoryComplaints"    
## [7] "BehavioralProblems"   "ADL"

final_data <- df[, c("Diagnosis", selected_features)]
head(final_data)

##         Diagnosis EducationLevel SleepQuality CholesterolHDL        MMSE
## 1 Tidak Alzheimer      0.7886499   1.11965745     -1.1141698  0.77885552
## 2 Tidak Alzheimer     -1.4224508   0.05682309      0.8455334  0.68013845
## 3 Tidak Alzheimer     -0.3169004   1.48703408      0.4455111 -0.85902164
## 4 Tidak Alzheimer     -0.3169004   0.76065615      0.3886897 -0.08870211
## 5 Tidak Alzheimer     -1.4224508  -0.82437383     -0.1118981 -0.14367832
## 6 Tidak Alzheimer     -0.3169004   0.11957058      0.8477818  1.48173376
##   FunctionalAssessment MemoryComplaints BehavioralProblems        ADL
## 1            0.4973901       -0.5123573         -0.4311563 -1.1041775
## 2            0.7047429       -0.5123573         -0.4311563 -0.8104125
## 3            0.2817472       -0.5123573         -0.4311563  0.7243229
## 4            1.3430335       -0.5123573          2.3182650  0.5079260
## 5            0.3335878       -0.5123573         -0.4311563 -1.6842869
## 6            0.1486786       -0.5123573         -0.4311563  1.3671308

set.seed(123)
train_idx <- createDataPartition(final_data$Diagnosis, p = 0.7, list = FALSE)
train_data <- final_data[train_idx, ]
test_data  <- final_data[-train_idx, ]

write.csv(train_data, "C:/Users/FIRA/Downloads/train_data(3).csv", row.names = FALSE)
write.csv(test_data, "C:/Users/FIRA/Downloads/test_data(3).csv", row.names = FALSE)

cat("Distribusi sebelum oversampling:\n")

## Distribusi sebelum oversampling:

print(table(train_data$Diagnosis))

## 
## Tidak Alzheimer       Alzheimer 
##             973             532

library(ROSE)

## Loaded ROSE 0.0-4

set.seed(123)
data_balanced <- ovun.sample(Diagnosis ~ ., data = train_data, method = "over",
                             N = max(table(train_data$Diagnosis)) * 2)$data

cat("Distribusi setelah oversampling:\n")

## Distribusi setelah oversampling:

print(table(data_balanced$Diagnosis))

## 
## Tidak Alzheimer       Alzheimer 
##             973             973

data_balanced$Diagnosis <- factor(data_balanced$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))

write.csv(data_balanced, "C:/Users/FIRA/Downloads/train_data_oversampled(3).csv", row.names = FALSE)

— PCA —

# Principal component analysis on the selected predictors (Diagnosis removed).
# NOTE(review): pca_input is taken from final_data, i.e. all rows, and the
# train/test split of the PCA scores only happens later, so the held-out rows
# contribute to the rotation. Consider fitting prcomp() on the training rows
# and projecting the test rows with predict().
# NOTE(review): these columns were already standardized with scale() earlier
# in the script, so center/scale. here are presumably redundant.
pca_input <- final_data %>% dplyr::select(-Diagnosis)
pca_result <- prcomp(pca_input, center = TRUE, scale. = TRUE)

# Component variances are the squared standard deviations; dividing by their
# sum gives the proportion of variance explained by each component.
pca_var <- pca_result$sdev^2
pca_var_exp <- pca_var / sum(pca_var)
cat("\nProporsi Varians oleh Komponen PCA:\n")

## 
## Proporsi Varians oleh Komponen PCA:

print(round(pca_var_exp, 4))

## [1] 0.1385 0.1309 0.1293 0.1264 0.1233 0.1219 0.1162 0.1134

fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50))

pca_scores <- as.data.frame(pca_result$x)
pca_data <- cbind(Diagnosis = final_data$Diagnosis, pca_scores)

write.csv(pca_data, "C:/Users/FIRA/Downloads/pca_data(3).csv", row.names = FALSE)

— Modeling Analisis Diskriminan —

library(caret)     
library(MASS)       
library(dplyr)      
library(MVN)        
library(biotools)

## ---
## biotools version 4.3

best_features <- selected_features

data_sel <- df %>%
  dplyr::select(all_of(best_features), Diagnosis)

Uji Asumsi LDA

mvn_res <- mvn(data_sel %>% dplyr::select(-Diagnosis),
               mvnTest = "hz", 
               univariatePlot = FALSE, 
               multivariatePlot = FALSE)
print(mvn_res$multivariateNormality)

##            Test       HZ p value MVN
## 1 Henze-Zirkler 4.208633       0  NO

boxm_res <- boxM(data_sel %>% dplyr::select(-Diagnosis),
                 data_sel$Diagnosis)
print(boxm_res)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_sel %>% dplyr::select(-Diagnosis)
## Chi-Sq (approx.) = 505.15, df = 36, p-value < 2.2e-16

# Bartlett's test of sphericity.
#
# Tests whether the correlation matrix of the columns of `df` is an identity
# matrix (i.e. the variables are uncorrelated).
#
# Args:
#   df: data frame (or matrix) of numeric variables; rows containing missing
#       values are dropped before the correlation matrix is computed.
#
# Returns:
#   A one-row data.frame with columns `statistic` (chi-square statistic),
#   `df` (degrees of freedom, p * (p - 1) / 2) and `p.value` (upper tail).
bartlett_test <- function(df) {
  # drop = FALSE keeps a single-column input as a data frame.
  x <- df[complete.cases(df), , drop = FALSE]
  n <- nrow(x)
  p <- ncol(x)
  # chi^2 = -(n - 1 - (2p + 5) / 6) * log|R|. Since 0 < |R| <= 1 the log is
  # non-positive, so the leading minus sign is what makes the statistic
  # non-negative; without it the upper-tail p-value is always 1.
  chisq <- -(n - 1 - (2 * p + 5) / 6) * log(det(cor(x)))
  df_chi <- p * (p - 1) / 2
  pval <- pchisq(chisq, df_chi, lower.tail = FALSE)
  data.frame(statistic = chisq, df = df_chi, p.value = pval)
}
bart_res <- bartlett_test(data_sel %>% dplyr::select(-Diagnosis))
print(bart_res)

##   statistic df p.value
## 1 -31.54353 28       1

# LDA on the ANOVA-selected features.
# The model is fitted on the oversampled training set (data_balanced) and
# scored on the held-out test set (test_data), both created earlier in the
# script. Fitting and predicting on the same full data set reports
# resubstitution accuracy, which is optimistic and not comparable with the
# test-set evaluation used for the LDA + PCA model below.
lda_model_rfe <- lda(Diagnosis ~ ., data = data_balanced)

pred_rfe <- predict(lda_model_rfe, newdata = test_data)$class
conf_rfe <- confusionMatrix(pred_rfe, test_data$Diagnosis)
print(conf_rfe)

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Tidak Alzheimer Alzheimer
##   Tidak Alzheimer            1257       193
##   Alzheimer                   132       567
##                                           
##                Accuracy : 0.8488          
##                  95% CI : (0.8329, 0.8637)
##     No Information Rate : 0.6463          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6631          
##                                           
##  Mcnemar's Test P-Value : 0.0008741       
##                                           
##             Sensitivity : 0.9050          
##             Specificity : 0.7461          
##          Pos Pred Value : 0.8669          
##          Neg Pred Value : 0.8112          
##              Prevalence : 0.6463          
##          Detection Rate : 0.5849          
##    Detection Prevalence : 0.6747          
##       Balanced Accuracy : 0.8255          
##                                           
##        'Positive' Class : Tidak Alzheimer 
##

— LDA PCA —

cum_var <- cumsum(pca_var_exp)
num_pc <- which(cum_var >= 0.9)[1]
cat("Jumlah komponen utama yang digunakan:", num_pc, "\n")

## Jumlah komponen utama yang digunakan: 8

# Keep the Diagnosis column (column 1) plus the first num_pc component scores.
pca_data_subset <- pca_data[, seq_len(num_pc + 1)]

# Stratified 70/30 split of the PCA scores; seed fixed for reproducibility.
set.seed(123)
train_idx_pca <- createDataPartition(
  y = pca_data_subset[["Diagnosis"]],
  p = 0.7,
  list = FALSE
)
train_data_pca <- pca_data_subset[train_idx_pca, ]
test_data_pca  <- pca_data_subset[-train_idx_pca, ]

# Random oversampling of the training rows only, up to twice the size of the
# larger class so that both classes end up equally represented.
set.seed(123)
data_balanced_pca <- ovun.sample(
  Diagnosis ~ .,
  data = train_data_pca,
  method = "over",
  N = 2 * max(table(train_data_pca[["Diagnosis"]]))
)[["data"]]

# Fit LDA on the balanced component scores and classify the untouched test rows.
lda_pca_model <- lda(Diagnosis ~ ., data = data_balanced_pca)

pred_test_pca  <- predict(lda_pca_model, newdata = test_data_pca)$class

cat("\nEvaluasi LDA dengan PCA - Data Uji:\n")

## 
## Evaluasi LDA dengan PCA - Data Uji:

confusion_test_pca <- confusionMatrix(pred_test_pca, test_data_pca$Diagnosis)
print(confusion_test_pca)

## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Tidak Alzheimer Alzheimer
##   Tidak Alzheimer             352        43
##   Alzheimer                    64       185
##                                           
##                Accuracy : 0.8339          
##                  95% CI : (0.8028, 0.8618)
##     No Information Rate : 0.646           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6442          
##                                           
##  Mcnemar's Test P-Value : 0.05318         
##                                           
##             Sensitivity : 0.8462          
##             Specificity : 0.8114          
##          Pos Pred Value : 0.8911          
##          Neg Pred Value : 0.7430          
##              Prevalence : 0.6460          
##          Detection Rate : 0.5466          
##    Detection Prevalence : 0.6134          
##       Balanced Accuracy : 0.8288          
##                                           
##        'Positive' Class : Tidak Alzheimer 
##

Script Metode Discriminant Analysis Principal Component Analysis

2025-05-22

Import library

Membaca dataset

Cek duplikasi

Drop kolom yang tidak perlu

Konversi Diagnosis jadi faktor

— PCA —

— Modeling Analisis Diskriminan —

Uji Asumsi LDA

— LDA PCA —