KELOMPOK 11

1. Raden Roro Azahra Tzitziliani Foulin (23031554003)
2. Fardaniyah Hazhiratul Dzauq (23031554045)
3. Novanna Zahrah Zahrani (23031554141)

Import Library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(corrplot)

## corrplot 0.95 loaded

library(ggpubr)
library(nortest)   
library(biotools)

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.3

library(MASS)      
library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(FactoMineR) 
library(ggfortify)

Load dataset

data <- read.csv("C:\\Users\\LENOVO\\Downloads\\WA_Fn-UseC_-HR-Employee-Attrition.csv")

original_data <- data

PreProcessing

Drop kolom tidak berguna

drop_cols <- c("EmployeeCount", "EmployeeNumber", "StandardHours", "Over18")
data <- data[, !(names(data) %in% drop_cols)]

Cek missing values

missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)

##                      Age                Attrition           BusinessTravel 
##                        0                        0                        0 
##                DailyRate               Department         DistanceFromHome 
##                        0                        0                        0 
##                Education           EducationField  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender               HourlyRate           JobInvolvement 
##                        0                        0                        0 
##                 JobLevel                  JobRole          JobSatisfaction 
##                        0                        0                        0 
##            MaritalStatus            MonthlyIncome              MonthlyRate 
##                        0                        0                        0 
##       NumCompaniesWorked                 OverTime        PercentSalaryHike 
##                        0                        0                        0 
##        PerformanceRating RelationshipSatisfaction         StockOptionLevel 
##                        0                        0                        0 
##        TotalWorkingYears    TrainingTimesLastYear          WorkLifeBalance 
##                        0                        0                        0 
##           YearsAtCompany       YearsInCurrentRole  YearsSinceLastPromotion 
##                        0                        0                        0 
##     YearsWithCurrManager 
##                        0

Definisi variabel

categorical_vars <- c(
  "Attrition", "BusinessTravel", "Department", "Education", "EducationField",
  "EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobLevel", "JobRole",
  "JobSatisfaction", "MaritalStatus", "OverTime", "RelationshipSatisfaction",
  "StockOptionLevel", "WorkLifeBalance"
)
numeric_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
  "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
  "TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany",
  "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"
)

Deteksi outlier (IQR)

outlier_count <- sapply(numeric_cols, function(col) {
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  sum(data[[col]] < (Q1 - 1.5 * IQR) | data[[col]] > (Q3 + 1.5 * IQR))
})
outlier_count <- sort(outlier_count, decreasing = TRUE)
print("Top 10 kolom dengan outlier terbanyak:")

## [1] "Top 10 kolom dengan outlier terbanyak:"

print(head(outlier_count, 10))

##   TrainingTimesLastYear       PerformanceRating           MonthlyIncome 
##                     238                     226                     114 
## YearsSinceLastPromotion          YearsAtCompany       TotalWorkingYears 
##                     107                     104                      63 
##      NumCompaniesWorked      YearsInCurrentRole    YearsWithCurrManager 
##                      52                      21                      14 
##                     Age 
##                       0

num_data_before <- data %>% dplyr::select(dplyr::all_of(numeric_cols))

Hapus outlier

clean_data <- data
cols_for_outlier_removal <- setdiff(numeric_cols, c("PerformanceRating"))
for (col in cols_for_outlier_removal) {
  Q1 <- quantile(clean_data[[col]], 0.25)
  Q3 <- quantile(clean_data[[col]], 0.75)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  clean_data <- clean_data %>% filter(clean_data[[col]] >= lower & clean_data[[col]] <= upper)
}

Visualisasi boxplot

num_data_after <- clean_data %>% dplyr::select(dplyr::all_of(numeric_cols))

par(mfrow=c(1,2))
boxplot(num_data_before, main="Sebelum Outlier Removal", las=2, col="lightblue")
boxplot(num_data_after, main="Setelah Outlier Removal", las=2, col="lightgreen")

par(mfrow=c(1,1))

Encoding kategorikal ke faktor

clean_data <- clean_data %>% mutate_if(is.character, as.factor)
clean_data$Attrition <- factor(clean_data$Attrition)

Visualisasi distribusi target sebelum hapus outlier

ggplot(original_data, aes(x=Attrition)) +
  geom_bar(fill="steelblue") +
  labs(title="Distribusi Attrition (Sebelum Outlier Removal)")

Visualisasi distribusi target sesudah hapus outlier

ggplot(clean_data, aes(x=Attrition)) +
  geom_bar(fill="steelblue") +
  labs(title="Distribusi Attrition (Sesudah Outlier Removal)")

Korelasi Matriks

cor_matrix <- cor(clean_data[, numeric_cols])
corrplot(cor_matrix, method="color", tl.cex=0.8)

Split train-test 70-30

set.seed(123)
train_idx <- sample(seq_len(nrow(clean_data)), size = 0.7*nrow(clean_data))
train_data <- clean_data[train_idx, ]
test_data  <- clean_data[-train_idx, ]

==== MODEL 1: Logistic Regression dan LDA tanpa reduksi dimensi ====

Model Logistic Regression tanpa reduksi dimensi

logit_model <- glm(Attrition ~ ., data=train_data, family=binomial)
summary(logit_model)

## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.502e+00  2.491e+00  -1.004 0.315183    
## Age                              -3.581e-02  2.049e-02  -1.748 0.080467 .  
## BusinessTravelTravel_Frequently   2.072e+00  6.535e-01   3.170 0.001524 ** 
## BusinessTravelTravel_Rarely       1.061e+00  6.102e-01   1.739 0.081972 .  
## DailyRate                        -1.275e-04  3.439e-04  -0.371 0.710899    
## DepartmentResearch & Development  2.805e-01  1.277e+00   0.220 0.826080    
## DepartmentSales                   8.897e-01  3.134e+03   0.000 0.999773    
## DistanceFromHome                  6.239e-02  1.741e-02   3.583 0.000340 ***
## Education                         3.387e-01  1.443e-01   2.347 0.018923 *  
## EducationFieldLife Sciences      -1.969e-01  1.915e+00  -0.103 0.918123    
## EducationFieldMarketing           7.147e-02  1.976e+00   0.036 0.971141    
## EducationFieldMedical             3.934e-02  1.932e+00   0.020 0.983758    
## EducationFieldOther              -1.496e-01  1.967e+00  -0.076 0.939379    
## EducationFieldTechnical Degree    8.453e-01  1.962e+00   0.431 0.666576    
## EnvironmentSatisfaction          -5.402e-01  1.303e-01  -4.144 3.41e-05 ***
## GenderMale                        2.378e-01  2.951e-01   0.806 0.420376    
## HourlyRate                        3.076e-03  7.023e-03   0.438 0.661424    
## JobInvolvement                   -5.331e-01  1.959e-01  -2.721 0.006507 ** 
## JobLevel                          2.577e-01  5.062e-01   0.509 0.610661    
## JobRoleHuman Resources                   NA         NA      NA       NA    
## JobRoleLaboratory Technician      1.689e+00  7.624e-01   2.216 0.026721 *  
## JobRoleManager                   -1.405e+01  1.662e+03  -0.008 0.993254    
## JobRoleManufacturing Director    -8.038e-02  8.238e-01  -0.098 0.922266    
## JobRoleResearch Director         -1.571e+01  1.024e+03  -0.015 0.987757    
## JobRoleResearch Scientist         6.527e-01  7.813e-01   0.835 0.403518    
## JobRoleSales Executive            1.835e-01  3.134e+03   0.000 0.999953    
## JobRoleSales Representative       1.445e+00  3.134e+03   0.000 0.999632    
## JobSatisfaction                  -6.387e-01  1.304e-01  -4.897 9.71e-07 ***
## MaritalStatusMarried             -3.864e-02  4.201e-01  -0.092 0.926720    
## MaritalStatusSingle               9.682e-01  5.316e-01   1.821 0.068544 .  
## MonthlyIncome                     9.596e-05  1.330e-04   0.721 0.470694    
## MonthlyRate                       2.802e-05  1.991e-05   1.407 0.159383    
## NumCompaniesWorked                2.573e-01  7.410e-02   3.473 0.000515 ***
## OverTimeYes                       1.829e+00  3.068e-01   5.961 2.50e-09 ***
## PercentSalaryHike                -8.254e-02  6.221e-02  -1.327 0.184594    
## PerformanceRating                 1.289e+00  6.165e-01   2.091 0.036570 *  
## RelationshipSatisfaction         -3.800e-01  1.348e-01  -2.818 0.004829 ** 
## StockOptionLevel                 -3.044e-01  2.264e-01  -1.345 0.178692    
## TotalWorkingYears                -1.485e-01  4.933e-02  -3.010 0.002609 ** 
## TrainingTimesLastYear             1.696e-02  1.891e-01   0.090 0.928538    
## WorkLifeBalance                  -2.863e-01  2.045e-01  -1.400 0.161591    
## YearsAtCompany                    6.114e-02  1.219e-01   0.501 0.616095    
## YearsInCurrentRole               -2.161e-01  1.096e-01  -1.972 0.048584 *  
## YearsSinceLastPromotion           1.725e-01  1.479e-01   1.166 0.243636    
## YearsWithCurrManager             -8.094e-02  1.112e-01  -0.728 0.466695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 598.49  on 621  degrees of freedom
## Residual deviance: 364.17  on 578  degrees of freedom
## AIC: 452.17
## 
## Number of Fisher Scoring iterations: 16

anova(logit_model, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Attrition
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                       621     598.49              
## Age                       1   18.268       620     580.22 1.919e-05 ***
## BusinessTravel            2   13.028       618     567.19 0.0014829 ** 
## DailyRate                 1    0.739       617     566.45 0.3899481    
## Department                2    6.222       615     560.23 0.0445488 *  
## DistanceFromHome          1    4.962       614     555.27 0.0259097 *  
## Education                 1    1.843       613     553.43 0.1745420    
## EducationField            5    2.756       608     550.67 0.7375249    
## EnvironmentSatisfaction   1   12.723       607     537.95 0.0003612 ***
## Gender                    1    1.123       606     536.83 0.2893104    
## HourlyRate                1    0.122       605     536.70 0.7273830    
## JobInvolvement            1   11.267       604     525.44 0.0007888 ***
## JobLevel                  1   11.254       603     514.18 0.0007946 ***
## JobRole                   7   14.598       596     499.58 0.0415100 *  
## JobSatisfaction           1   18.765       595     480.82 1.478e-05 ***
## MaritalStatus             2   21.883       593     458.94 1.771e-05 ***
## MonthlyIncome             1    0.090       592     458.85 0.7646387    
## MonthlyRate               1    1.131       591     457.71 0.2875058    
## NumCompaniesWorked        1    8.868       590     448.85 0.0029019 ** 
## OverTime                  1   40.715       589     408.13 1.761e-10 ***
## PercentSalaryHike         1    0.244       588     407.89 0.6213134    
## PerformanceRating         1    3.624       587     404.26 0.0569597 .  
## RelationshipSatisfaction  1    5.949       586     398.31 0.0147250 *  
## StockOptionLevel          1    1.752       585     396.56 0.1855692    
## TotalWorkingYears         1   20.088       584     376.47 7.397e-06 ***
## TrainingTimesLastYear     1    0.004       583     376.47 0.9465288    
## WorkLifeBalance           1    1.904       582     374.57 0.1676249    
## YearsAtCompany            1    5.657       581     368.91 0.0173872 *  
## YearsInCurrentRole        1    2.895       580     366.01 0.0888674 .  
## YearsSinceLastPromotion   1    1.319       579     364.70 0.2508314    
## YearsWithCurrManager      1    0.521       578     364.17 0.4705305    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

test_data$pred_prob_logit <- predict(logit_model, newdata=test_data, type="response")
test_data$pred_class_logit <- factor(ifelse(test_data$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit <- confusionMatrix(test_data$pred_class_logit, test_data$Attrition)
print(conf_matrix_logit)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  214  22
##        Yes  13  18
##                                          
##                Accuracy : 0.8689         
##                  95% CI : (0.8224, 0.907)
##     No Information Rate : 0.8502         
##     P-Value [Acc > NIR] : 0.2226         
##                                          
##                   Kappa : 0.4328         
##                                          
##  Mcnemar's Test P-Value : 0.1763         
##                                          
##             Sensitivity : 0.9427         
##             Specificity : 0.4500         
##          Pos Pred Value : 0.9068         
##          Neg Pred Value : 0.5806         
##              Prevalence : 0.8502         
##          Detection Rate : 0.8015         
##    Detection Prevalence : 0.8839         
##       Balanced Accuracy : 0.6964         
##                                          
##        'Positive' Class : No             
##

cmlr <- as.table(conf_matrix_logit$table)

cmlr_df <- as.data.frame(cmlr)
colnames(cmlr_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlr_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogit <- conf_matrix_logit$table
TP <- cmlogit["Yes", "Yes"]
TN <- cmlogit["No", "No"]
FP <- cmlogit["Yes", "No"]
FN <- cmlogit["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogit)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.8689

cat("Precision:", round(precision, 4), "\n")

## Precision: 0.5806

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.45

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.507

UJI ASUMSI UNTUK LDA pada data asli

Uji Normalitas Lilliefors

normality_results <- sapply(numeric_cols, function(x) {
  yes_data <- na.omit(train_data %>% filter(Attrition == "Yes") %>% pull(x))
  no_data <- na.omit(train_data %>% filter(Attrition == "No") %>% pull(x))
  p_yes <- if(length(yes_data) >= 4) nortest::lillie.test(yes_data)$p.value else NA
  p_no <- if(length(no_data) >= 4) nortest::lillie.test(no_data)$p.value else NA
  c(Yes = p_yes, No = p_no)
})
print("P-value uji normalitas Lilliefors tiap variabel:")

## [1] "P-value uji normalitas Lilliefors tiap variabel:"

print(normality_results)

##              Age    DailyRate DistanceFromHome   HourlyRate MonthlyIncome
## Yes 2.068406e-04 6.849324e-04     1.524981e-06 7.753191e-02  1.161827e-11
## No  5.830763e-12 2.922622e-09     5.533947e-33 7.190823e-07  4.190625e-21
##      MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## Yes 4.870166e-02       1.163422e-28      1.494536e-10      3.315593e-81
## No  2.235624e-07       5.624869e-87      3.382705e-40      0.000000e+00
##     TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsInCurrentRole
## Yes      6.451584e-05          6.608783e-32   7.040844e-11       1.212209e-16
## No       8.270227e-23          4.387245e-96   1.875544e-32       9.302912e-59
##     YearsSinceLastPromotion YearsWithCurrManager
## Yes            2.599799e-31         1.766402e-16
## No             3.797598e-99         1.066115e-58

Uji homogenitas kovarian dengan Box’s M test

boxm_test <- boxM(train_data[, numeric_cols], train_data$Attrition)
print(boxm_test)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  train_data[, numeric_cols]
## Chi-Sq (approx.) = 204.51, df = 120, p-value = 2.411e-06

Model LDA tanpa reduksi dimensi

lda_model <- lda(Attrition ~ ., data=train_data)

## Warning in lda.default(x, grouping, ...): variables are collinear

pred_lda <- predict(lda_model, newdata=test_data)
conf_matrix_lda <- confusionMatrix(pred_lda$class, test_data$Attrition)
print(conf_matrix_lda)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  216  22
##        Yes  11  18
##                                           
##                Accuracy : 0.8764          
##                  95% CI : (0.8308, 0.9134)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 0.13124         
##                                           
##                   Kappa : 0.4528          
##                                           
##  Mcnemar's Test P-Value : 0.08172         
##                                           
##             Sensitivity : 0.9515          
##             Specificity : 0.4500          
##          Pos Pred Value : 0.9076          
##          Neg Pred Value : 0.6207          
##              Prevalence : 0.8502          
##          Detection Rate : 0.8090          
##    Detection Prevalence : 0.8914          
##       Balanced Accuracy : 0.7008          
##                                           
##        'Positive' Class : No              
##

cmlda <- as.table(conf_matrix_lda$table)

cmlda_df <- as.data.frame(cmlda)
colnames(cmlda_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlda_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlda <- conf_matrix_lda$table
TP <- cmlda["Yes", "Yes"]
TN <- cmlda["No", "No"]
FP <- cmlda["Yes", "No"]
FN <- cmlda["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlda)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.8764

cat("Precision:", round(precision, 4), "\n")

## Precision: 0.6207

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.45

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.5217

==== MODEL 2: Logistic Regression dan LDA dengan PCA ====

Standarisasi data numerik

num_scaled <- scale(clean_data[, numeric_cols])

pca_result <- prcomp(num_scaled, center=TRUE, scale.=TRUE)

summary(pca_result)

## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
## Standard deviation     1.8432 1.3651 1.3274 1.05424 1.0276 1.00303 0.96992
## Proportion of Variance 0.2265 0.1242 0.1175 0.07409 0.0704 0.06707 0.06272
## Cumulative Proportion  0.2265 0.3507 0.4682 0.54228 0.6127 0.67976 0.74247
##                            PC8    PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.95060 0.9116 0.81008 0.77588 0.53692 0.50680 0.47347
## Proportion of Variance 0.06024 0.0554 0.04375 0.04013 0.01922 0.01712 0.01495
## Cumulative Proportion  0.80272 0.8581 0.90187 0.94200 0.96122 0.97834 0.99329
##                           PC15
## Standard deviation     0.31735
## Proportion of Variance 0.00671
## Cumulative Proportion  1.00000

eigenvalues <- pca_result$sdev^2
eigenvalues

##  [1] 3.3973886 1.8635514 1.7619062 1.1114121 1.0560111 1.0060737 0.9407477
##  [8] 0.9036475 0.8310322 0.6562262 0.6019880 0.2882780 0.2568463 0.2241786
## [15] 0.1007123

Screeplot

plot(eigenvalues, type = "b", pch = 19,
     xlab = "Komponen Utama",
     ylab = "Eigenvalue",
     main = "Scree Plot dengan Elbow Method")

# Tambahkan garis horizontal di eigenvalue = 1 (kriteria Kaiser)
abline(h = 1, col = "red", lty = 2)

pca_result <- prcomp(train_data[, numeric_cols], scale. = TRUE)

attrition_labels <- train_data$Attrition

library(ggfortify)
autoplot(pca_result, data = train_data, colour = 'Attrition',
         loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3) +
  ggtitle("Biplot PCA") +
  theme_minimal()

n_pca <- sum(eigenvalues > 1)
print(paste("Jumlah komponen PCA (eigenvalue>1):", n_pca))

## [1] "Jumlah komponen PCA (eigenvalue>1): 6"

pca_scores <- pca_result$x[, 1:n_pca]
pca_data <- data.frame(pca_scores, Attrition = train_data$Attrition)

loadings <- pca_result$rotation
print(round(loadings, 3))

##                            PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
## Age                      0.186 -0.479 -0.139 -0.138  0.104  0.110  0.014  0.110
## DailyRate                0.054 -0.101 -0.044 -0.276 -0.474 -0.565  0.369 -0.435
## DistanceFromHome         0.062 -0.075 -0.015  0.371 -0.506  0.447 -0.306 -0.457
## HourlyRate               0.027 -0.049 -0.039 -0.319 -0.663  0.214  0.007  0.611
## MonthlyIncome            0.297 -0.319  0.027  0.195  0.018  0.033 -0.080 -0.027
## MonthlyRate             -0.029 -0.098  0.091  0.551 -0.040  0.123  0.759  0.221
## NumCompaniesWorked       0.058 -0.510 -0.149 -0.096  0.140 -0.124 -0.023  0.013
## PercentSalaryHike        0.057  0.144 -0.681  0.079  0.014  0.004  0.041 -0.046
## PerformanceRating        0.053  0.180 -0.673  0.087  0.005  0.011  0.054  0.066
## TotalWorkingYears        0.391 -0.380 -0.060  0.000  0.059  0.026 -0.034 -0.003
## TrainingTimesLastYear   -0.014  0.024 -0.015 -0.508  0.165  0.620  0.408 -0.354
## YearsAtCompany           0.481  0.217  0.099  0.027 -0.021 -0.020  0.061 -0.018
## YearsInCurrentRole       0.462  0.206  0.085  0.011  0.005 -0.002  0.090 -0.007
## YearsSinceLastPromotion  0.255  0.206  0.043 -0.204  0.124  0.046 -0.055  0.183
## YearsWithCurrManager     0.443  0.226  0.075  0.016 -0.011 -0.058  0.001 -0.024
##                            PC9   PC10   PC11   PC12   PC13   PC14   PC15
## Age                      0.014 -0.022 -0.745  0.321 -0.055  0.011  0.030
## DailyRate               -0.084  0.157 -0.069  0.015  0.011  0.047 -0.012
## DistanceFromHome        -0.285 -0.081 -0.068  0.034 -0.005  0.032  0.002
## HourlyRate               0.105 -0.037  0.139  0.005  0.003 -0.046 -0.003
## MonthlyIncome            0.340  0.611  0.377  0.340 -0.001  0.129 -0.004
## MonthlyRate             -0.174 -0.004 -0.012  0.002  0.068 -0.028 -0.008
## NumCompaniesWorked      -0.357 -0.490  0.496  0.212 -0.036  0.046  0.083
## PercentSalaryHike        0.016  0.092  0.067  0.107 -0.022 -0.690  0.026
## PerformanceRating        0.032 -0.018 -0.015 -0.104  0.020  0.695 -0.015
## TotalWorkingYears        0.037  0.056  0.000 -0.804  0.132 -0.114 -0.115
## TrainingTimesLastYear    0.104  0.019  0.150  0.008  0.048  0.034  0.000
## YearsAtCompany           0.102 -0.159 -0.013 -0.018 -0.081  0.005  0.814
## YearsInCurrentRole       0.019 -0.176  0.029  0.055 -0.700 -0.001 -0.451
## YearsSinceLastPromotion -0.767  0.443 -0.005  0.041  0.093  0.030  0.001
## YearsWithCurrManager     0.120 -0.298 -0.012  0.248  0.682 -0.022 -0.335

Split train-test untuk PCA data

train_pca <- pca_data[train_idx, ]
test_pca <- pca_data[-train_idx, ]

Model Logistic Regression dengan PCA

logit_pca <- glm(Attrition ~ ., data=train_pca, family=binomial)
summary(logit_pca)

## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_pca)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.63120    0.14262 -11.437  < 2e-16 ***
## PC1         -0.38560    0.08323  -4.633 3.61e-06 ***
## PC2         -0.01245    0.09428  -0.132   0.8949    
## PC3         -0.11542    0.09079  -1.271   0.2036    
## PC4          0.20217    0.12178   1.660   0.0969 .  
## PC5         -0.23888    0.12280  -1.945   0.0517 .  
## PC6          0.11055    0.12716   0.869   0.3846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.91  on 444  degrees of freedom
## Residual deviance: 402.04  on 438  degrees of freedom
##   (177 observations deleted due to missingness)
## AIC: 416.04
## 
## Number of Fisher Scoring iterations: 5

test_pca$pred_prob_logit <- predict(logit_pca, newdata=test_pca, type="response")
test_pca$pred_class_logit <- factor(ifelse(test_pca$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit_pca <- confusionMatrix(test_pca$pred_class_logit, test_pca$Attrition)
print(conf_matrix_logit_pca)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  147  28
##        Yes   0   2
##                                           
##                Accuracy : 0.8418          
##                  95% CI : (0.7795, 0.8922)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : 0.3897          
##                                           
##                   Kappa : 0.1061          
##                                           
##  Mcnemar's Test P-Value : 3.352e-07       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.06667         
##          Pos Pred Value : 0.84000         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83051         
##          Detection Rate : 0.83051         
##    Detection Prevalence : 0.98870         
##       Balanced Accuracy : 0.53333         
##                                           
##        'Positive' Class : No              
##

cmlogpca <- as.table(conf_matrix_logit_pca$table)

cmlogpca_df <- as.data.frame(cmlogpca)
colnames(cmlogpca_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlogpca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression dengan PCA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogpca <- conf_matrix_logit_pca$table
TP <- cmlogpca["Yes", "Yes"]
TN <- cmlogpca["No", "No"]
FP <- cmlogpca["Yes", "No"]
FN <- cmlogpca["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogpca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.8418

cat("Precision:", round(precision, 4), "\n")

## Precision: 1

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.0667

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.125

Model LDA dengan PCA

lda_pca <- lda(Attrition ~ ., data=train_pca)

pred_lda_pca <- predict(lda_pca, newdata=test_pca)
conf_matrix_lda_pca <- confusionMatrix(pred_lda_pca$class, test_pca$Attrition)
print(conf_matrix_lda_pca)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  147  28
##        Yes   0   2
##                                           
##                Accuracy : 0.8418          
##                  95% CI : (0.7795, 0.8922)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : 0.3897          
##                                           
##                   Kappa : 0.1061          
##                                           
##  Mcnemar's Test P-Value : 3.352e-07       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.06667         
##          Pos Pred Value : 0.84000         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83051         
##          Detection Rate : 0.83051         
##    Detection Prevalence : 0.98870         
##       Balanced Accuracy : 0.53333         
##                                           
##        'Positive' Class : No              
##

cmldapca <- as.table(conf_matrix_lda_pca$table)

cmldapca_df <- as.data.frame(cmldapca)
colnames(cmldapca_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmldapca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA dengan PCA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmldapca <- conf_matrix_lda_pca$table
TP <- cmldapca["Yes", "Yes"]
TN <- cmldapca["No", "No"]
FP <- cmldapca["Yes", "No"]
FN <- cmldapca["No", "Yes"]

accuracy <- (TP + TN) / sum(cmldapca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.8418

cat("Precision:", round(precision, 4), "\n")

## Precision: 1

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.0667

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.125

==== MODEL 3: Logistic Regression dan LDA dengan Factor Analysis (FA) ====

fa_result <- fa(train_data[, numeric_cols], nfactors=n_pca, rotate="varimax", fm="ml")
print(fa_result)

## Factor Analysis using method =  ml
## Call: fa(r = train_data[, numeric_cols], nfactors = n_pca, rotate = "varimax", 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                           ML1   ML2   ML5   ML6   ML3   ML4    h2    u2 com
## Age                      0.04  0.02  0.62 -0.01  0.12  0.01 0.397 0.603 1.1
## DailyRate                0.05  0.01  0.11 -0.08 -0.02  0.04 0.022 0.978 2.7
## DistanceFromHome         0.04  0.02  0.08  0.16 -0.07  0.01 0.039 0.961 2.1
## HourlyRate               0.02  0.01  0.04 -0.04  0.09  0.00 0.012 0.988 2.0
## MonthlyIncome            0.26 -0.03  0.49  0.41 -0.10  0.04 0.484 0.516 2.6
## MonthlyRate             -0.04 -0.05 -0.02  0.19 -0.03 -0.06 0.046 0.954 1.5
## NumCompaniesWorked      -0.14  0.01  0.54 -0.06  0.06 -0.08 0.324 0.676 1.2
## PercentSalaryHike        0.03  0.94  0.08 -0.12 -0.28  0.06 0.995 0.005 1.2
## PerformanceRating        0.04  0.93 -0.05  0.04  0.36 -0.05 0.995 0.005 1.3
## TotalWorkingYears        0.38  0.01  0.83  0.20  0.09  0.07 0.890 0.110 1.6
## TrainingTimesLastYear    0.01 -0.01  0.02 -0.17  0.00 -0.03 0.031 0.969 1.1
## YearsAtCompany           0.97  0.01  0.07  0.10  0.04  0.21 0.995 0.005 1.1
## YearsInCurrentRole       0.95  0.02  0.10  0.04 -0.04 -0.29 0.995 0.005 1.2
## YearsSinceLastPromotion  0.42  0.02  0.06 -0.12  0.02 -0.04 0.193 0.807 1.2
## YearsWithCurrManager     0.82  0.03  0.06  0.04  0.05  0.17 0.715 0.285 1.1
## 
##                        ML1  ML2  ML5  ML6  ML3  ML4
## SS loadings           2.93 1.75 1.65 0.35 0.27 0.18
## Proportion Var        0.20 0.12 0.11 0.02 0.02 0.01
## Cumulative Var        0.20 0.31 0.42 0.45 0.46 0.48
## Proportion Explained  0.41 0.25 0.23 0.05 0.04 0.03
## Cumulative Proportion 0.41 0.66 0.89 0.94 0.97 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 6 factors are sufficient.
## 
## df null model =  105  with the objective function =  5.21 with Chi Square =  3203.9
## df of  the model are 30  and the objective function was  0.05 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  622 with the empirical chi square  33.62  with prob <  0.3 
## The total n.obs was  622  with Likelihood Chi Square =  28.88  with prob <  0.52 
## 
## Tucker Lewis Index of factoring reliability =  1.001
## RMSEA index =  0  and the 90 % confidence intervals are  0 0.029
## BIC =  -164.11
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML5   ML6  ML3
## Correlation of (regression) scores with factors   1.00 1.00 0.93  0.60 0.96
## Multiple R square of scores with factors          0.99 1.00 0.87  0.37 0.92
## Minimum correlation of possible factor scores     0.99 0.99 0.74 -0.27 0.84
##                                                    ML4
## Correlation of (regression) scores with factors   0.98
## Multiple R square of scores with factors          0.96
## Minimum correlation of possible factor scores     0.92

fa_facto <- PCA(train_data[, numeric_cols], graph = FALSE)

fviz_pca_biplot(fa_facto, 
                label = "var", 
                habillage = train_data$Attrition, 
                palette = "jco",
                addEllipses = FALSE,
                title = "Biplot FA")

fa_scores <- factor.scores(train_data[, numeric_cols], fa_result)$scores
fa_data <- data.frame(fa_scores, Attrition = train_data$Attrition)

Model Logistic Regression dengan FA

logit_fa <- glm(Attrition ~ ., data=fa_data, family=binomial)
summary(logit_fa)

## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = fa_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.69446    0.12455 -13.605  < 2e-16 ***
## ML1         -0.77288    0.14502  -5.329 9.85e-08 ***
## ML2          0.09769    0.10226   0.955 0.339399    
## ML5         -0.39425    0.11047  -3.569 0.000359 ***
## ML6          0.04193    0.11454   0.366 0.714323    
## ML3          0.09593    0.10977   0.874 0.382161    
## ML4         -0.16581    0.13539  -1.225 0.220687    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 598.49  on 621  degrees of freedom
## Residual deviance: 545.42  on 615  degrees of freedom
## AIC: 559.42
## 
## Number of Fisher Scoring iterations: 5

fa_loadings <- fa_result$loadings

train_means <- attr(scale(train_data[, numeric_cols]), "scaled:center")
train_sds <- attr(scale(train_data[, numeric_cols]), "scaled:scale")

test_scaled <- scale(test_data[, numeric_cols], center = train_means, scale = train_sds)

test_fa_scores <- as.matrix(test_scaled) %*% as.matrix(fa_loadings)

colnames(test_fa_scores) <- colnames(fa_scores)
test_fa <- data.frame(test_fa_scores, Attrition = test_data$Attrition)

print(fa_result$loadings)

## 
## Loadings:
##                         ML1    ML2    ML5    ML6    ML3    ML4   
## Age                                    0.616         0.121       
## DailyRate                              0.109                     
## DistanceFromHome                              0.161              
## HourlyRate                                                       
## MonthlyIncome            0.258         0.487  0.408 -0.103       
## MonthlyRate                                   0.194              
## NumCompaniesWorked      -0.141         0.538                     
## PercentSalaryHike               0.942        -0.121 -0.285       
## PerformanceRating               0.927                0.355       
## TotalWorkingYears        0.385         0.831  0.196              
## TrainingTimesLastYear                        -0.173              
## YearsAtCompany           0.968                              0.205
## YearsInCurrentRole       0.949                             -0.287
## YearsSinceLastPromotion  0.416               -0.122              
## YearsWithCurrManager     0.824                              0.167
## 
##                  ML1   ML2   ML5   ML6   ML3   ML4
## SS loadings    2.933 1.755 1.647 0.354 0.265 0.179
## Proportion Var 0.196 0.117 0.110 0.024 0.018 0.012
## Cumulative Var 0.196 0.313 0.422 0.446 0.464 0.476

test_fa$pred_prob_logit <- predict(logit_fa, newdata = test_fa, type = "response")
test_fa$pred_class_logit <- factor(ifelse(test_fa$pred_prob_logit > 0.5, "Yes", "No"))

conf_matrix_logit_fa <- confusionMatrix(test_fa$pred_class_logit, test_fa$Attrition)
print(conf_matrix_logit_fa)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  172  20
##        Yes  55  20
##                                           
##                Accuracy : 0.7191          
##                  95% CI : (0.6611, 0.7722)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1894          
##                                           
##  Mcnemar's Test P-Value : 8.638e-05       
##                                           
##             Sensitivity : 0.7577          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.8958          
##          Neg Pred Value : 0.2667          
##              Prevalence : 0.8502          
##          Detection Rate : 0.6442          
##    Detection Prevalence : 0.7191          
##       Balanced Accuracy : 0.6289          
##                                           
##        'Positive' Class : No              
##

cmlogfa <- as.table(conf_matrix_logit_fa$table)

cmlogfa_df <- as.data.frame(cmlogfa)
colnames(cmlogfa_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlogfa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression dengan FA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogfa <- conf_matrix_logit_fa$table
TP <- cmlogfa["Yes", "Yes"]
TN <- cmlogfa["No", "No"]
FP <- cmlogfa["Yes", "No"]
FN <- cmlogfa["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogfa)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.7191

cat("Precision:", round(precision, 4), "\n")

## Precision: 0.2667

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.5

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.3478

Model LDA dengan FA

lda_fa <- lda(Attrition ~ ., data = fa_data)
lda_pred_fa <- predict(lda_fa, newdata = test_fa)
conf_matrix_lda_fa <- confusionMatrix(lda_pred_fa$class, test_fa$Attrition)
print(conf_matrix_lda_fa)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  182  23
##        Yes  45  17
##                                           
##                Accuracy : 0.7453          
##                  95% CI : (0.6886, 0.7965)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.1849          
##                                           
##  Mcnemar's Test P-Value : 0.01088         
##                                           
##             Sensitivity : 0.8018          
##             Specificity : 0.4250          
##          Pos Pred Value : 0.8878          
##          Neg Pred Value : 0.2742          
##              Prevalence : 0.8502          
##          Detection Rate : 0.6816          
##    Detection Prevalence : 0.7678          
##       Balanced Accuracy : 0.6134          
##                                           
##        'Positive' Class : No              
##

cmldafa <- as.table(conf_matrix_lda_fa$table)

cmldafa_df <- as.data.frame(cmldafa)
colnames(cmldafa_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmldafa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA dengan FA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmldafa <- conf_matrix_lda_fa$table
TP <- cmldafa["Yes", "Yes"]
TN <- cmldafa["No", "No"]
FP <- cmldafa["Yes", "No"]
FN <- cmldafa["No", "Yes"]

accuracy <- (TP + TN) / sum(cmldafa)
precision <- TP / cmldafa
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")

## Akurasi: 0.7453

cat("Precision:", round(precision, 4), "\n")

## Precision: 0.0934 0.3778 0.7391 1

cat("Recall:", round(recall, 4), "\n")

## Recall: 0.425

cat("F1-score:", round(f1_score, 4), "\n")

## F1-score: 0.1532 0.4 0.5397 0.5965

==== PERCEPTUAL MAPPING ====

1. MDS

num_data_scaled <- scale(clean_data[, numeric_cols])

dist_matrix <- dist(num_data_scaled, method = "euclidean")

mds_result <- cmdscale(dist_matrix, k = 2, eig = TRUE)

mds_coords <- as.data.frame(mds_result$points)
names(mds_coords) <- c("Dim1", "Dim2")

mds_coords$Attrition <- clean_data$Attrition

p_mds <- ggplot(mds_coords, aes(x = Dim1, y = Dim2, color = Attrition)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "Perceptual Mapping - MDS (2D)",
       x = "Dimensi 1",
       y = "Dimensi 2") +
  theme_minimal() +
  theme(legend.position = "right")

print(p_mds)

2. PCA Biplot

cat("Eigenvalues (varians tiap PC):\n")

## Eigenvalues (varians tiap PC):

print(round(eigenvalues, 3))

##  [1] 3.397 1.864 1.762 1.111 1.056 1.006 0.941 0.904 0.831 0.656 0.602 0.288
## [13] 0.257 0.224 0.101

cat("Proporsi varians tiap PC:\n")

## Proporsi varians tiap PC:

print(round(eigenvalues / sum(eigenvalues), 3))

##  [1] 0.226 0.124 0.117 0.074 0.070 0.067 0.063 0.060 0.055 0.044 0.040 0.019
## [13] 0.017 0.015 0.007

cat("Proporsi kumulatif varians:\n")

## Proporsi kumulatif varians:

print(round(cumsum(eigenvalues / sum(eigenvalues)), 3))

##  [1] 0.226 0.351 0.468 0.542 0.613 0.680 0.742 0.803 0.858 0.902 0.942 0.961
## [13] 0.978 0.993 1.000

p_biplot <- fviz_pca_biplot(pca_result, 
                            geom.ind = "point",
                            habillage = train_data$Attrition,
                            addEllipses = TRUE,
                            ellipse.level = 0.95,
                            palette = c("#00AFBB", "#FC4E07"),
                            repel = TRUE) +
  ggtitle("Biplot PCA dengan Pewarnaan berdasarkan Attrition") +
  theme_minimal()

print(p_biplot)

Prediksi dan Analisis Faktor Pengunduran Diri Karyawan Menggunakan Analisis Diskriminan dan Regresi Logistik

Kelompok 11 2023 F

2025-06-05

KELOMPOK 11

Import Library

Load dataset

PreProcessing

Drop kolom tidak berguna

Cek missing values

Definisi variabel

Deteksi outlier (IQR)

Hapus outlier

Visualisasi boxplot

Encoding kategorikal ke faktor

Visualisasi distribusi target sebelum hapus outlier

Visualisasi distribusi target sesudah hapus outlier

Korelasi Matriks

Split train-test 70-30

==== MODEL 1: Logistic Regression dan LDA tanpa reduksi dimensi ====

Model Logistic Regression tanpa reduksi dimensi

UJI ASUMSI UNTUK LDA pada data asli

Uji Normalitas Lilliefors

Uji homogenitas kovarian dengan Box’s M test

Model LDA tanpa reduksi dimensi

==== MODEL 2: Logistic Regression dan LDA dengan PCA ====

Standarisasi data numerik

Screeplot

Split train-test untuk PCA data

Model Logistic Regression dengan PCA

Model LDA dengan PCA

==== MODEL 3: Logistic Regression dan LDA dengan Factor Analysis (FA) ====

Model Logistic Regression dengan FA

Model LDA dengan FA

==== PERCEPTUAL MAPPING ====

1. MDS

2. PCA Biplot