KELOMPOK 11

1. Raden Roro Azahra Tzitziliani Foulin (23031554003)
2. Fardaniyah Hazhiratul Dzauq (23031554045)
3. Novanna Zahrah Zahrani (23031554141)

Import Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(nortest)   
library(biotools)  
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.3
library(MASS)      
library(psych)     
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(factoextra) 
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR) 
library(ggfortify)

Load dataset

data <- read.csv("C:\\Users\\LENOVO\\Downloads\\WA_Fn-UseC_-HR-Employee-Attrition.csv")
original_data <- data

PreProcessing

Drop kolom tidak berguna

drop_cols <- c("EmployeeCount", "EmployeeNumber", "StandardHours", "Over18")
data <- data[, !(names(data) %in% drop_cols)]

Cek missing values

missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
##                      Age                Attrition           BusinessTravel 
##                        0                        0                        0 
##                DailyRate               Department         DistanceFromHome 
##                        0                        0                        0 
##                Education           EducationField  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender               HourlyRate           JobInvolvement 
##                        0                        0                        0 
##                 JobLevel                  JobRole          JobSatisfaction 
##                        0                        0                        0 
##            MaritalStatus            MonthlyIncome              MonthlyRate 
##                        0                        0                        0 
##       NumCompaniesWorked                 OverTime        PercentSalaryHike 
##                        0                        0                        0 
##        PerformanceRating RelationshipSatisfaction         StockOptionLevel 
##                        0                        0                        0 
##        TotalWorkingYears    TrainingTimesLastYear          WorkLifeBalance 
##                        0                        0                        0 
##           YearsAtCompany       YearsInCurrentRole  YearsSinceLastPromotion 
##                        0                        0                        0 
##     YearsWithCurrManager 
##                        0

Definisi variabel

categorical_vars <- c(
  "Attrition", "BusinessTravel", "Department", "Education", "EducationField",
  "EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobLevel", "JobRole",
  "JobSatisfaction", "MaritalStatus", "OverTime", "RelationshipSatisfaction",
  "StockOptionLevel", "WorkLifeBalance"
)
numeric_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
  "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
  "TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany",
  "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"
)

Deteksi outlier (IQR)

outlier_count <- sapply(numeric_cols, function(col) {
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  sum(data[[col]] < (Q1 - 1.5 * IQR) | data[[col]] > (Q3 + 1.5 * IQR))
})
outlier_count <- sort(outlier_count, decreasing = TRUE)
print("Top 10 kolom dengan outlier terbanyak:")
## [1] "Top 10 kolom dengan outlier terbanyak:"
print(head(outlier_count, 10))
##   TrainingTimesLastYear       PerformanceRating           MonthlyIncome 
##                     238                     226                     114 
## YearsSinceLastPromotion          YearsAtCompany       TotalWorkingYears 
##                     107                     104                      63 
##      NumCompaniesWorked      YearsInCurrentRole    YearsWithCurrManager 
##                      52                      21                      14 
##                     Age 
##                       0
num_data_before <- data %>% dplyr::select(dplyr::all_of(numeric_cols))

Hapus outlier

clean_data <- data
cols_for_outlier_removal <- setdiff(numeric_cols, c("PerformanceRating"))
for (col in cols_for_outlier_removal) {
  Q1 <- quantile(clean_data[[col]], 0.25)
  Q3 <- quantile(clean_data[[col]], 0.75)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  clean_data <- clean_data %>% filter(clean_data[[col]] >= lower & clean_data[[col]] <= upper)
}

Visualisasi boxplot

num_data_after <- clean_data %>% dplyr::select(dplyr::all_of(numeric_cols))

par(mfrow=c(1,2))
boxplot(num_data_before, main="Sebelum Outlier Removal", las=2, col="lightblue")
boxplot(num_data_after, main="Setelah Outlier Removal", las=2, col="lightgreen")

par(mfrow=c(1,1))

Encoding kategorikal ke faktor

clean_data <- clean_data %>% mutate_if(is.character, as.factor)
clean_data$Attrition <- factor(clean_data$Attrition)

Visualisasi distribusi target sebelum hapus outlier

ggplot(original_data, aes(x=Attrition)) +
  geom_bar(fill="steelblue") +
  labs(title="Distribusi Attrition (Sebelum Outlier Removal)")

Visualisasi distribusi target sesudah hapus outlier

ggplot(clean_data, aes(x=Attrition)) +
  geom_bar(fill="steelblue") +
  labs(title="Distribusi Attrition (Sesudah Outlier Removal)")

Korelasi Matriks

cor_matrix <- cor(clean_data[, numeric_cols])
corrplot(cor_matrix, method="color", tl.cex=0.8)

Split train-test 70-30

set.seed(123)
train_idx <- sample(seq_len(nrow(clean_data)), size = 0.7*nrow(clean_data))
train_data <- clean_data[train_idx, ]
test_data  <- clean_data[-train_idx, ]

==== MODEL 1: Logistic Regression dan LDA tanpa reduksi dimensi ====

Model Logistic Regression tanpa reduksi dimensi

logit_model <- glm(Attrition ~ ., data=train_data, family=binomial)
summary(logit_model)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.502e+00  2.491e+00  -1.004 0.315183    
## Age                              -3.581e-02  2.049e-02  -1.748 0.080467 .  
## BusinessTravelTravel_Frequently   2.072e+00  6.535e-01   3.170 0.001524 ** 
## BusinessTravelTravel_Rarely       1.061e+00  6.102e-01   1.739 0.081972 .  
## DailyRate                        -1.275e-04  3.439e-04  -0.371 0.710899    
## DepartmentResearch & Development  2.805e-01  1.277e+00   0.220 0.826080    
## DepartmentSales                   8.897e-01  3.134e+03   0.000 0.999773    
## DistanceFromHome                  6.239e-02  1.741e-02   3.583 0.000340 ***
## Education                         3.387e-01  1.443e-01   2.347 0.018923 *  
## EducationFieldLife Sciences      -1.969e-01  1.915e+00  -0.103 0.918123    
## EducationFieldMarketing           7.147e-02  1.976e+00   0.036 0.971141    
## EducationFieldMedical             3.934e-02  1.932e+00   0.020 0.983758    
## EducationFieldOther              -1.496e-01  1.967e+00  -0.076 0.939379    
## EducationFieldTechnical Degree    8.453e-01  1.962e+00   0.431 0.666576    
## EnvironmentSatisfaction          -5.402e-01  1.303e-01  -4.144 3.41e-05 ***
## GenderMale                        2.378e-01  2.951e-01   0.806 0.420376    
## HourlyRate                        3.076e-03  7.023e-03   0.438 0.661424    
## JobInvolvement                   -5.331e-01  1.959e-01  -2.721 0.006507 ** 
## JobLevel                          2.577e-01  5.062e-01   0.509 0.610661    
## JobRoleHuman Resources                   NA         NA      NA       NA    
## JobRoleLaboratory Technician      1.689e+00  7.624e-01   2.216 0.026721 *  
## JobRoleManager                   -1.405e+01  1.662e+03  -0.008 0.993254    
## JobRoleManufacturing Director    -8.038e-02  8.238e-01  -0.098 0.922266    
## JobRoleResearch Director         -1.571e+01  1.024e+03  -0.015 0.987757    
## JobRoleResearch Scientist         6.527e-01  7.813e-01   0.835 0.403518    
## JobRoleSales Executive            1.835e-01  3.134e+03   0.000 0.999953    
## JobRoleSales Representative       1.445e+00  3.134e+03   0.000 0.999632    
## JobSatisfaction                  -6.387e-01  1.304e-01  -4.897 9.71e-07 ***
## MaritalStatusMarried             -3.864e-02  4.201e-01  -0.092 0.926720    
## MaritalStatusSingle               9.682e-01  5.316e-01   1.821 0.068544 .  
## MonthlyIncome                     9.596e-05  1.330e-04   0.721 0.470694    
## MonthlyRate                       2.802e-05  1.991e-05   1.407 0.159383    
## NumCompaniesWorked                2.573e-01  7.410e-02   3.473 0.000515 ***
## OverTimeYes                       1.829e+00  3.068e-01   5.961 2.50e-09 ***
## PercentSalaryHike                -8.254e-02  6.221e-02  -1.327 0.184594    
## PerformanceRating                 1.289e+00  6.165e-01   2.091 0.036570 *  
## RelationshipSatisfaction         -3.800e-01  1.348e-01  -2.818 0.004829 ** 
## StockOptionLevel                 -3.044e-01  2.264e-01  -1.345 0.178692    
## TotalWorkingYears                -1.485e-01  4.933e-02  -3.010 0.002609 ** 
## TrainingTimesLastYear             1.696e-02  1.891e-01   0.090 0.928538    
## WorkLifeBalance                  -2.863e-01  2.045e-01  -1.400 0.161591    
## YearsAtCompany                    6.114e-02  1.219e-01   0.501 0.616095    
## YearsInCurrentRole               -2.161e-01  1.096e-01  -1.972 0.048584 *  
## YearsSinceLastPromotion           1.725e-01  1.479e-01   1.166 0.243636    
## YearsWithCurrManager             -8.094e-02  1.112e-01  -0.728 0.466695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 598.49  on 621  degrees of freedom
## Residual deviance: 364.17  on 578  degrees of freedom
## AIC: 452.17
## 
## Number of Fisher Scoring iterations: 16
anova(logit_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Attrition
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                       621     598.49              
## Age                       1   18.268       620     580.22 1.919e-05 ***
## BusinessTravel            2   13.028       618     567.19 0.0014829 ** 
## DailyRate                 1    0.739       617     566.45 0.3899481    
## Department                2    6.222       615     560.23 0.0445488 *  
## DistanceFromHome          1    4.962       614     555.27 0.0259097 *  
## Education                 1    1.843       613     553.43 0.1745420    
## EducationField            5    2.756       608     550.67 0.7375249    
## EnvironmentSatisfaction   1   12.723       607     537.95 0.0003612 ***
## Gender                    1    1.123       606     536.83 0.2893104    
## HourlyRate                1    0.122       605     536.70 0.7273830    
## JobInvolvement            1   11.267       604     525.44 0.0007888 ***
## JobLevel                  1   11.254       603     514.18 0.0007946 ***
## JobRole                   7   14.598       596     499.58 0.0415100 *  
## JobSatisfaction           1   18.765       595     480.82 1.478e-05 ***
## MaritalStatus             2   21.883       593     458.94 1.771e-05 ***
## MonthlyIncome             1    0.090       592     458.85 0.7646387    
## MonthlyRate               1    1.131       591     457.71 0.2875058    
## NumCompaniesWorked        1    8.868       590     448.85 0.0029019 ** 
## OverTime                  1   40.715       589     408.13 1.761e-10 ***
## PercentSalaryHike         1    0.244       588     407.89 0.6213134    
## PerformanceRating         1    3.624       587     404.26 0.0569597 .  
## RelationshipSatisfaction  1    5.949       586     398.31 0.0147250 *  
## StockOptionLevel          1    1.752       585     396.56 0.1855692    
## TotalWorkingYears         1   20.088       584     376.47 7.397e-06 ***
## TrainingTimesLastYear     1    0.004       583     376.47 0.9465288    
## WorkLifeBalance           1    1.904       582     374.57 0.1676249    
## YearsAtCompany            1    5.657       581     368.91 0.0173872 *  
## YearsInCurrentRole        1    2.895       580     366.01 0.0888674 .  
## YearsSinceLastPromotion   1    1.319       579     364.70 0.2508314    
## YearsWithCurrManager      1    0.521       578     364.17 0.4705305    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_data$pred_prob_logit <- predict(logit_model, newdata=test_data, type="response")
test_data$pred_class_logit <- factor(ifelse(test_data$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit <- confusionMatrix(test_data$pred_class_logit, test_data$Attrition)
print(conf_matrix_logit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  214  22
##        Yes  13  18
##                                          
##                Accuracy : 0.8689         
##                  95% CI : (0.8224, 0.907)
##     No Information Rate : 0.8502         
##     P-Value [Acc > NIR] : 0.2226         
##                                          
##                   Kappa : 0.4328         
##                                          
##  Mcnemar's Test P-Value : 0.1763         
##                                          
##             Sensitivity : 0.9427         
##             Specificity : 0.4500         
##          Pos Pred Value : 0.9068         
##          Neg Pred Value : 0.5806         
##              Prevalence : 0.8502         
##          Detection Rate : 0.8015         
##    Detection Prevalence : 0.8839         
##       Balanced Accuracy : 0.6964         
##                                          
##        'Positive' Class : No             
## 
cmlr <- as.table(conf_matrix_logit$table)

cmlr_df <- as.data.frame(cmlr)
colnames(cmlr_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlr_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogit <- conf_matrix_logit$table
TP <- cmlogit["Yes", "Yes"]
TN <- cmlogit["No", "No"]
FP <- cmlogit["Yes", "No"]
FN <- cmlogit["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogit)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8689
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.5806
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.45
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.507

UJI ASUMSI UNTUK LDA pada data asli

Uji Normalitas Lilliefors

normality_results <- sapply(numeric_cols, function(x) {
  yes_data <- na.omit(train_data %>% filter(Attrition == "Yes") %>% pull(x))
  no_data <- na.omit(train_data %>% filter(Attrition == "No") %>% pull(x))
  p_yes <- if(length(yes_data) >= 4) nortest::lillie.test(yes_data)$p.value else NA
  p_no <- if(length(no_data) >= 4) nortest::lillie.test(no_data)$p.value else NA
  c(Yes = p_yes, No = p_no)
})
print("P-value uji normalitas Lilliefors tiap variabel:")
## [1] "P-value uji normalitas Lilliefors tiap variabel:"
print(normality_results)
##              Age    DailyRate DistanceFromHome   HourlyRate MonthlyIncome
## Yes 2.068406e-04 6.849324e-04     1.524981e-06 7.753191e-02  1.161827e-11
## No  5.830763e-12 2.922622e-09     5.533947e-33 7.190823e-07  4.190625e-21
##      MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## Yes 4.870166e-02       1.163422e-28      1.494536e-10      3.315593e-81
## No  2.235624e-07       5.624869e-87      3.382705e-40      0.000000e+00
##     TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsInCurrentRole
## Yes      6.451584e-05          6.608783e-32   7.040844e-11       1.212209e-16
## No       8.270227e-23          4.387245e-96   1.875544e-32       9.302912e-59
##     YearsSinceLastPromotion YearsWithCurrManager
## Yes            2.599799e-31         1.766402e-16
## No             3.797598e-99         1.066115e-58

Uji homogenitas kovarian dengan Box’s M test

boxm_test <- boxM(train_data[, numeric_cols], train_data$Attrition)
print(boxm_test)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  train_data[, numeric_cols]
## Chi-Sq (approx.) = 204.51, df = 120, p-value = 2.411e-06

Model LDA tanpa reduksi dimensi

lda_model <- lda(Attrition ~ ., data=train_data)
## Warning in lda.default(x, grouping, ...): variables are collinear
pred_lda <- predict(lda_model, newdata=test_data)
conf_matrix_lda <- confusionMatrix(pred_lda$class, test_data$Attrition)
print(conf_matrix_lda)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  216  22
##        Yes  11  18
##                                           
##                Accuracy : 0.8764          
##                  95% CI : (0.8308, 0.9134)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 0.13124         
##                                           
##                   Kappa : 0.4528          
##                                           
##  Mcnemar's Test P-Value : 0.08172         
##                                           
##             Sensitivity : 0.9515          
##             Specificity : 0.4500          
##          Pos Pred Value : 0.9076          
##          Neg Pred Value : 0.6207          
##              Prevalence : 0.8502          
##          Detection Rate : 0.8090          
##    Detection Prevalence : 0.8914          
##       Balanced Accuracy : 0.7008          
##                                           
##        'Positive' Class : No              
## 
cmlda <- as.table(conf_matrix_lda$table)

cmlda_df <- as.data.frame(cmlda)
colnames(cmlda_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlda_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlda <- conf_matrix_lda$table
TP <- cmlda["Yes", "Yes"]
TN <- cmlda["No", "No"]
FP <- cmlda["Yes", "No"]
FN <- cmlda["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlda)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8764
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.6207
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.45
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.5217

==== MODEL 2: Logistic Regression dan LDA dengan PCA ====

Standarisasi data numerik

num_scaled <- scale(clean_data[, numeric_cols])
pca_result <- prcomp(num_scaled, center=TRUE, scale.=TRUE)
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
## Standard deviation     1.8432 1.3651 1.3274 1.05424 1.0276 1.00303 0.96992
## Proportion of Variance 0.2265 0.1242 0.1175 0.07409 0.0704 0.06707 0.06272
## Cumulative Proportion  0.2265 0.3507 0.4682 0.54228 0.6127 0.67976 0.74247
##                            PC8    PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.95060 0.9116 0.81008 0.77588 0.53692 0.50680 0.47347
## Proportion of Variance 0.06024 0.0554 0.04375 0.04013 0.01922 0.01712 0.01495
## Cumulative Proportion  0.80272 0.8581 0.90187 0.94200 0.96122 0.97834 0.99329
##                           PC15
## Standard deviation     0.31735
## Proportion of Variance 0.00671
## Cumulative Proportion  1.00000
eigenvalues <- pca_result$sdev^2
eigenvalues
##  [1] 3.3973886 1.8635514 1.7619062 1.1114121 1.0560111 1.0060737 0.9407477
##  [8] 0.9036475 0.8310322 0.6562262 0.6019880 0.2882780 0.2568463 0.2241786
## [15] 0.1007123

Screeplot

plot(eigenvalues, type = "b", pch = 19,
     xlab = "Komponen Utama",
     ylab = "Eigenvalue",
     main = "Scree Plot dengan Elbow Method")

# Tambahkan garis horizontal di eigenvalue = 1 (kriteria Kaiser)
abline(h = 1, col = "red", lty = 2)

pca_result <- prcomp(train_data[, numeric_cols], scale. = TRUE)

attrition_labels <- train_data$Attrition

library(ggfortify)
autoplot(pca_result, data = train_data, colour = 'Attrition',
         loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3) +
  ggtitle("Biplot PCA") +
  theme_minimal()

n_pca <- sum(eigenvalues > 1)
print(paste("Jumlah komponen PCA (eigenvalue>1):", n_pca))
## [1] "Jumlah komponen PCA (eigenvalue>1): 6"
pca_scores <- pca_result$x[, 1:n_pca]
pca_data <- data.frame(pca_scores, Attrition = train_data$Attrition)
loadings <- pca_result$rotation
print(round(loadings, 3))
##                            PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
## Age                      0.186 -0.479 -0.139 -0.138  0.104  0.110  0.014  0.110
## DailyRate                0.054 -0.101 -0.044 -0.276 -0.474 -0.565  0.369 -0.435
## DistanceFromHome         0.062 -0.075 -0.015  0.371 -0.506  0.447 -0.306 -0.457
## HourlyRate               0.027 -0.049 -0.039 -0.319 -0.663  0.214  0.007  0.611
## MonthlyIncome            0.297 -0.319  0.027  0.195  0.018  0.033 -0.080 -0.027
## MonthlyRate             -0.029 -0.098  0.091  0.551 -0.040  0.123  0.759  0.221
## NumCompaniesWorked       0.058 -0.510 -0.149 -0.096  0.140 -0.124 -0.023  0.013
## PercentSalaryHike        0.057  0.144 -0.681  0.079  0.014  0.004  0.041 -0.046
## PerformanceRating        0.053  0.180 -0.673  0.087  0.005  0.011  0.054  0.066
## TotalWorkingYears        0.391 -0.380 -0.060  0.000  0.059  0.026 -0.034 -0.003
## TrainingTimesLastYear   -0.014  0.024 -0.015 -0.508  0.165  0.620  0.408 -0.354
## YearsAtCompany           0.481  0.217  0.099  0.027 -0.021 -0.020  0.061 -0.018
## YearsInCurrentRole       0.462  0.206  0.085  0.011  0.005 -0.002  0.090 -0.007
## YearsSinceLastPromotion  0.255  0.206  0.043 -0.204  0.124  0.046 -0.055  0.183
## YearsWithCurrManager     0.443  0.226  0.075  0.016 -0.011 -0.058  0.001 -0.024
##                            PC9   PC10   PC11   PC12   PC13   PC14   PC15
## Age                      0.014 -0.022 -0.745  0.321 -0.055  0.011  0.030
## DailyRate               -0.084  0.157 -0.069  0.015  0.011  0.047 -0.012
## DistanceFromHome        -0.285 -0.081 -0.068  0.034 -0.005  0.032  0.002
## HourlyRate               0.105 -0.037  0.139  0.005  0.003 -0.046 -0.003
## MonthlyIncome            0.340  0.611  0.377  0.340 -0.001  0.129 -0.004
## MonthlyRate             -0.174 -0.004 -0.012  0.002  0.068 -0.028 -0.008
## NumCompaniesWorked      -0.357 -0.490  0.496  0.212 -0.036  0.046  0.083
## PercentSalaryHike        0.016  0.092  0.067  0.107 -0.022 -0.690  0.026
## PerformanceRating        0.032 -0.018 -0.015 -0.104  0.020  0.695 -0.015
## TotalWorkingYears        0.037  0.056  0.000 -0.804  0.132 -0.114 -0.115
## TrainingTimesLastYear    0.104  0.019  0.150  0.008  0.048  0.034  0.000
## YearsAtCompany           0.102 -0.159 -0.013 -0.018 -0.081  0.005  0.814
## YearsInCurrentRole       0.019 -0.176  0.029  0.055 -0.700 -0.001 -0.451
## YearsSinceLastPromotion -0.767  0.443 -0.005  0.041  0.093  0.030  0.001
## YearsWithCurrManager     0.120 -0.298 -0.012  0.248  0.682 -0.022 -0.335

Split train-test untuk PCA data

train_pca <- pca_data[train_idx, ]
test_pca <- pca_data[-train_idx, ]

Model Logistic Regression dengan PCA

logit_pca <- glm(Attrition ~ ., data=train_pca, family=binomial)
summary(logit_pca)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_pca)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.63120    0.14262 -11.437  < 2e-16 ***
## PC1         -0.38560    0.08323  -4.633 3.61e-06 ***
## PC2         -0.01245    0.09428  -0.132   0.8949    
## PC3         -0.11542    0.09079  -1.271   0.2036    
## PC4          0.20217    0.12178   1.660   0.0969 .  
## PC5         -0.23888    0.12280  -1.945   0.0517 .  
## PC6          0.11055    0.12716   0.869   0.3846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.91  on 444  degrees of freedom
## Residual deviance: 402.04  on 438  degrees of freedom
##   (177 observations deleted due to missingness)
## AIC: 416.04
## 
## Number of Fisher Scoring iterations: 5
test_pca$pred_prob_logit <- predict(logit_pca, newdata=test_pca, type="response")
test_pca$pred_class_logit <- factor(ifelse(test_pca$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit_pca <- confusionMatrix(test_pca$pred_class_logit, test_pca$Attrition)
print(conf_matrix_logit_pca)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  147  28
##        Yes   0   2
##                                           
##                Accuracy : 0.8418          
##                  95% CI : (0.7795, 0.8922)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : 0.3897          
##                                           
##                   Kappa : 0.1061          
##                                           
##  Mcnemar's Test P-Value : 3.352e-07       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.06667         
##          Pos Pred Value : 0.84000         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83051         
##          Detection Rate : 0.83051         
##    Detection Prevalence : 0.98870         
##       Balanced Accuracy : 0.53333         
##                                           
##        'Positive' Class : No              
## 
cmlogpca <- as.table(conf_matrix_logit_pca$table)

cmlogpca_df <- as.data.frame(cmlogpca)
colnames(cmlogpca_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlogpca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression dengan PCA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogpca <- conf_matrix_logit_pca$table
TP <- cmlogpca["Yes", "Yes"]
TN <- cmlogpca["No", "No"]
FP <- cmlogpca["Yes", "No"]
FN <- cmlogpca["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogpca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8418
cat("Precision:", round(precision, 4), "\n")
## Precision: 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.0667
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.125

Model LDA dengan PCA

lda_pca <- lda(Attrition ~ ., data=train_pca)

pred_lda_pca <- predict(lda_pca, newdata=test_pca)
conf_matrix_lda_pca <- confusionMatrix(pred_lda_pca$class, test_pca$Attrition)
print(conf_matrix_lda_pca)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  147  28
##        Yes   0   2
##                                           
##                Accuracy : 0.8418          
##                  95% CI : (0.7795, 0.8922)
##     No Information Rate : 0.8305          
##     P-Value [Acc > NIR] : 0.3897          
##                                           
##                   Kappa : 0.1061          
##                                           
##  Mcnemar's Test P-Value : 3.352e-07       
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.06667         
##          Pos Pred Value : 0.84000         
##          Neg Pred Value : 1.00000         
##              Prevalence : 0.83051         
##          Detection Rate : 0.83051         
##    Detection Prevalence : 0.98870         
##       Balanced Accuracy : 0.53333         
##                                           
##        'Positive' Class : No              
## 
cmldapca <- as.table(conf_matrix_lda_pca$table)

cmldapca_df <- as.data.frame(cmldapca)
colnames(cmldapca_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmldapca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA dengan PCA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmldapca <- conf_matrix_lda_pca$table
TP <- cmldapca["Yes", "Yes"]
TN <- cmldapca["No", "No"]
FP <- cmldapca["Yes", "No"]
FN <- cmldapca["No", "Yes"]

accuracy <- (TP + TN) / sum(cmldapca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8418
cat("Precision:", round(precision, 4), "\n")
## Precision: 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.0667
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.125

==== MODEL 3: Logistic Regression dan LDA dengan Factor Analysis (FA) ====

fa_result <- fa(train_data[, numeric_cols], nfactors=n_pca, rotate="varimax", fm="ml")
print(fa_result)
## Factor Analysis using method =  ml
## Call: fa(r = train_data[, numeric_cols], nfactors = n_pca, rotate = "varimax", 
##     fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                           ML1   ML2   ML5   ML6   ML3   ML4    h2    u2 com
## Age                      0.04  0.02  0.62 -0.01  0.12  0.01 0.397 0.603 1.1
## DailyRate                0.05  0.01  0.11 -0.08 -0.02  0.04 0.022 0.978 2.7
## DistanceFromHome         0.04  0.02  0.08  0.16 -0.07  0.01 0.039 0.961 2.1
## HourlyRate               0.02  0.01  0.04 -0.04  0.09  0.00 0.012 0.988 2.0
## MonthlyIncome            0.26 -0.03  0.49  0.41 -0.10  0.04 0.484 0.516 2.6
## MonthlyRate             -0.04 -0.05 -0.02  0.19 -0.03 -0.06 0.046 0.954 1.5
## NumCompaniesWorked      -0.14  0.01  0.54 -0.06  0.06 -0.08 0.324 0.676 1.2
## PercentSalaryHike        0.03  0.94  0.08 -0.12 -0.28  0.06 0.995 0.005 1.2
## PerformanceRating        0.04  0.93 -0.05  0.04  0.36 -0.05 0.995 0.005 1.3
## TotalWorkingYears        0.38  0.01  0.83  0.20  0.09  0.07 0.890 0.110 1.6
## TrainingTimesLastYear    0.01 -0.01  0.02 -0.17  0.00 -0.03 0.031 0.969 1.1
## YearsAtCompany           0.97  0.01  0.07  0.10  0.04  0.21 0.995 0.005 1.1
## YearsInCurrentRole       0.95  0.02  0.10  0.04 -0.04 -0.29 0.995 0.005 1.2
## YearsSinceLastPromotion  0.42  0.02  0.06 -0.12  0.02 -0.04 0.193 0.807 1.2
## YearsWithCurrManager     0.82  0.03  0.06  0.04  0.05  0.17 0.715 0.285 1.1
## 
##                        ML1  ML2  ML5  ML6  ML3  ML4
## SS loadings           2.93 1.75 1.65 0.35 0.27 0.18
## Proportion Var        0.20 0.12 0.11 0.02 0.02 0.01
## Cumulative Var        0.20 0.31 0.42 0.45 0.46 0.48
## Proportion Explained  0.41 0.25 0.23 0.05 0.04 0.03
## Cumulative Proportion 0.41 0.66 0.89 0.94 0.97 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 6 factors are sufficient.
## 
## df null model =  105  with the objective function =  5.21 with Chi Square =  3203.9
## df of  the model are 30  and the objective function was  0.05 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  622 with the empirical chi square  33.62  with prob <  0.3 
## The total n.obs was  622  with Likelihood Chi Square =  28.88  with prob <  0.52 
## 
## Tucker Lewis Index of factoring reliability =  1.001
## RMSEA index =  0  and the 90 % confidence intervals are  0 0.029
## BIC =  -164.11
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML5   ML6  ML3
## Correlation of (regression) scores with factors   1.00 1.00 0.93  0.60 0.96
## Multiple R square of scores with factors          0.99 1.00 0.87  0.37 0.92
## Minimum correlation of possible factor scores     0.99 0.99 0.74 -0.27 0.84
##                                                    ML4
## Correlation of (regression) scores with factors   0.98
## Multiple R square of scores with factors          0.96
## Minimum correlation of possible factor scores     0.92
fa_facto <- PCA(train_data[, numeric_cols], graph = FALSE)

fviz_pca_biplot(fa_facto, 
                label = "var", 
                habillage = train_data$Attrition, 
                palette = "jco",
                addEllipses = FALSE,
                title = "Biplot FA")

fa_scores <- factor.scores(train_data[, numeric_cols], fa_result)$scores
fa_data <- data.frame(fa_scores, Attrition = train_data$Attrition)

Model Logistic Regression dengan FA

logit_fa <- glm(Attrition ~ ., data=fa_data, family=binomial)
summary(logit_fa)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = fa_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.69446    0.12455 -13.605  < 2e-16 ***
## ML1         -0.77288    0.14502  -5.329 9.85e-08 ***
## ML2          0.09769    0.10226   0.955 0.339399    
## ML5         -0.39425    0.11047  -3.569 0.000359 ***
## ML6          0.04193    0.11454   0.366 0.714323    
## ML3          0.09593    0.10977   0.874 0.382161    
## ML4         -0.16581    0.13539  -1.225 0.220687    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 598.49  on 621  degrees of freedom
## Residual deviance: 545.42  on 615  degrees of freedom
## AIC: 559.42
## 
## Number of Fisher Scoring iterations: 5
fa_loadings <- fa_result$loadings

train_means <- attr(scale(train_data[, numeric_cols]), "scaled:center")
train_sds <- attr(scale(train_data[, numeric_cols]), "scaled:scale")

test_scaled <- scale(test_data[, numeric_cols], center = train_means, scale = train_sds)

test_fa_scores <- as.matrix(test_scaled) %*% as.matrix(fa_loadings)

colnames(test_fa_scores) <- colnames(fa_scores)
test_fa <- data.frame(test_fa_scores, Attrition = test_data$Attrition)
print(fa_result$loadings)
## 
## Loadings:
##                         ML1    ML2    ML5    ML6    ML3    ML4   
## Age                                    0.616         0.121       
## DailyRate                              0.109                     
## DistanceFromHome                              0.161              
## HourlyRate                                                       
## MonthlyIncome            0.258         0.487  0.408 -0.103       
## MonthlyRate                                   0.194              
## NumCompaniesWorked      -0.141         0.538                     
## PercentSalaryHike               0.942        -0.121 -0.285       
## PerformanceRating               0.927                0.355       
## TotalWorkingYears        0.385         0.831  0.196              
## TrainingTimesLastYear                        -0.173              
## YearsAtCompany           0.968                              0.205
## YearsInCurrentRole       0.949                             -0.287
## YearsSinceLastPromotion  0.416               -0.122              
## YearsWithCurrManager     0.824                              0.167
## 
##                  ML1   ML2   ML5   ML6   ML3   ML4
## SS loadings    2.933 1.755 1.647 0.354 0.265 0.179
## Proportion Var 0.196 0.117 0.110 0.024 0.018 0.012
## Cumulative Var 0.196 0.313 0.422 0.446 0.464 0.476
test_fa$pred_prob_logit <- predict(logit_fa, newdata = test_fa, type = "response")
test_fa$pred_class_logit <- factor(ifelse(test_fa$pred_prob_logit > 0.5, "Yes", "No"))

conf_matrix_logit_fa <- confusionMatrix(test_fa$pred_class_logit, test_fa$Attrition)
print(conf_matrix_logit_fa)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  172  20
##        Yes  55  20
##                                           
##                Accuracy : 0.7191          
##                  95% CI : (0.6611, 0.7722)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1894          
##                                           
##  Mcnemar's Test P-Value : 8.638e-05       
##                                           
##             Sensitivity : 0.7577          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.8958          
##          Neg Pred Value : 0.2667          
##              Prevalence : 0.8502          
##          Detection Rate : 0.6442          
##    Detection Prevalence : 0.7191          
##       Balanced Accuracy : 0.6289          
##                                           
##        'Positive' Class : No              
## 
cmlogfa <- as.table(conf_matrix_logit_fa$table)

cmlogfa_df <- as.data.frame(cmlogfa)
colnames(cmlogfa_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmlogfa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix Logistic Regression dengan FA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmlogfa <- conf_matrix_logit_fa$table
TP <- cmlogfa["Yes", "Yes"]
TN <- cmlogfa["No", "No"]
FP <- cmlogfa["Yes", "No"]
FN <- cmlogfa["No", "Yes"]

accuracy <- (TP + TN) / sum(cmlogfa)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.7191
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.2667
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.5
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.3478

Model LDA dengan FA

lda_fa <- lda(Attrition ~ ., data = fa_data)
lda_pred_fa <- predict(lda_fa, newdata = test_fa)
conf_matrix_lda_fa <- confusionMatrix(lda_pred_fa$class, test_fa$Attrition)
print(conf_matrix_lda_fa)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  182  23
##        Yes  45  17
##                                           
##                Accuracy : 0.7453          
##                  95% CI : (0.6886, 0.7965)
##     No Information Rate : 0.8502          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.1849          
##                                           
##  Mcnemar's Test P-Value : 0.01088         
##                                           
##             Sensitivity : 0.8018          
##             Specificity : 0.4250          
##          Pos Pred Value : 0.8878          
##          Neg Pred Value : 0.2742          
##              Prevalence : 0.8502          
##          Detection Rate : 0.6816          
##    Detection Prevalence : 0.7678          
##       Balanced Accuracy : 0.6134          
##                                           
##        'Positive' Class : No              
## 
cmldafa <- as.table(conf_matrix_lda_fa$table)

cmldafa_df <- as.data.frame(cmldafa)
colnames(cmldafa_df) <- c("Predicted", "Actual", "Freq")

ggplot(cmldafa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "white", size = 6) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix LDA dengan FA",
       x = "Actual Label",
       y = "Predicted Label") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

cmldafa <- conf_matrix_lda_fa$table
TP <- cmldafa["Yes", "Yes"]
TN <- cmldafa["No", "No"]
FP <- cmldafa["Yes", "No"]
FN <- cmldafa["No", "Yes"]

accuracy <- (TP + TN) / sum(cmldafa)
precision <- TP / cmldafa
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.7453
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.0934 0.3778 0.7391 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.425
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.1532 0.4 0.5397 0.5965

==== PERCEPTUAL MAPPING ====

1. MDS

num_data_scaled <- scale(clean_data[, numeric_cols])

dist_matrix <- dist(num_data_scaled, method = "euclidean")

mds_result <- cmdscale(dist_matrix, k = 2, eig = TRUE)

mds_coords <- as.data.frame(mds_result$points)
names(mds_coords) <- c("Dim1", "Dim2")

mds_coords$Attrition <- clean_data$Attrition

p_mds <- ggplot(mds_coords, aes(x = Dim1, y = Dim2, color = Attrition)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "Perceptual Mapping - MDS (2D)",
       x = "Dimensi 1",
       y = "Dimensi 2") +
  theme_minimal() +
  theme(legend.position = "right")

print(p_mds)

2. PCA Biplot

cat("Eigenvalues (varians tiap PC):\n")
## Eigenvalues (varians tiap PC):
print(round(eigenvalues, 3))
##  [1] 3.397 1.864 1.762 1.111 1.056 1.006 0.941 0.904 0.831 0.656 0.602 0.288
## [13] 0.257 0.224 0.101
cat("Proporsi varians tiap PC:\n")
## Proporsi varians tiap PC:
print(round(eigenvalues / sum(eigenvalues), 3))
##  [1] 0.226 0.124 0.117 0.074 0.070 0.067 0.063 0.060 0.055 0.044 0.040 0.019
## [13] 0.017 0.015 0.007
cat("Proporsi kumulatif varians:\n")
## Proporsi kumulatif varians:
print(round(cumsum(eigenvalues / sum(eigenvalues)), 3))
##  [1] 0.226 0.351 0.468 0.542 0.613 0.680 0.742 0.803 0.858 0.902 0.942 0.961
## [13] 0.978 0.993 1.000
p_biplot <- fviz_pca_biplot(pca_result, 
                            geom.ind = "point",
                            habillage = train_data$Attrition,
                            addEllipses = TRUE,
                            ellipse.level = 0.95,
                            palette = c("#00AFBB", "#FC4E07"),
                            repel = TRUE) +
  ggtitle("Biplot PCA dengan Pewarnaan berdasarkan Attrition") +
  theme_minimal()

print(p_biplot)