1. Raden Roro Azahra Tzitziliani Foulin
(23031554003)
2. Fardaniyah Hazhiratul Dzauq (23031554045)
3. Novanna Zahrah Zahrani (23031554141)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(nortest)
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ---
## biotools version 4.3
library(MASS)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
data <- read.csv("C:\\Users\\LENOVO\\Downloads\\WA_Fn-UseC_-HR-Employee-Attrition.csv")
original_data <- data
drop_cols <- c("EmployeeCount", "EmployeeNumber", "StandardHours", "Over18")
data <- data[, !(names(data) %in% drop_cols)]
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
## Age Attrition BusinessTravel
## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StockOptionLevel
## 0 0 0
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 0 0 0
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 0 0 0
## YearsWithCurrManager
## 0
categorical_vars <- c(
"Attrition", "BusinessTravel", "Department", "Education", "EducationField",
"EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobLevel", "JobRole",
"JobSatisfaction", "MaritalStatus", "OverTime", "RelationshipSatisfaction",
"StockOptionLevel", "WorkLifeBalance"
)
numeric_cols <- c(
"Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
"MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
"TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany",
"YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"
)
outlier_count <- sapply(numeric_cols, function(col) {
Q1 <- quantile(data[[col]], 0.25)
Q3 <- quantile(data[[col]], 0.75)
IQR <- Q3 - Q1
sum(data[[col]] < (Q1 - 1.5 * IQR) | data[[col]] > (Q3 + 1.5 * IQR))
})
outlier_count <- sort(outlier_count, decreasing = TRUE)
print("Top 10 kolom dengan outlier terbanyak:")
## [1] "Top 10 kolom dengan outlier terbanyak:"
print(head(outlier_count, 10))
## TrainingTimesLastYear PerformanceRating MonthlyIncome
## 238 226 114
## YearsSinceLastPromotion YearsAtCompany TotalWorkingYears
## 107 104 63
## NumCompaniesWorked YearsInCurrentRole YearsWithCurrManager
## 52 21 14
## Age
## 0
num_data_before <- data %>% dplyr::select(dplyr::all_of(numeric_cols))
clean_data <- data
cols_for_outlier_removal <- setdiff(numeric_cols, c("PerformanceRating"))
for (col in cols_for_outlier_removal) {
Q1 <- quantile(clean_data[[col]], 0.25)
Q3 <- quantile(clean_data[[col]], 0.75)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
clean_data <- clean_data %>% filter(clean_data[[col]] >= lower & clean_data[[col]] <= upper)
}
num_data_after <- clean_data %>% dplyr::select(dplyr::all_of(numeric_cols))
par(mfrow=c(1,2))
boxplot(num_data_before, main="Sebelum Outlier Removal", las=2, col="lightblue")
boxplot(num_data_after, main="Setelah Outlier Removal", las=2, col="lightgreen")
par(mfrow=c(1,1))
clean_data <- clean_data %>% mutate_if(is.character, as.factor)
clean_data$Attrition <- factor(clean_data$Attrition)
ggplot(original_data, aes(x=Attrition)) +
geom_bar(fill="steelblue") +
labs(title="Distribusi Attrition (Sebelum Outlier Removal)")
ggplot(clean_data, aes(x=Attrition)) +
geom_bar(fill="steelblue") +
labs(title="Distribusi Attrition (Sesudah Outlier Removal)")
cor_matrix <- cor(clean_data[, numeric_cols])
corrplot(cor_matrix, method="color", tl.cex=0.8)
set.seed(123)
train_idx <- sample(seq_len(nrow(clean_data)), size = 0.7*nrow(clean_data))
train_data <- clean_data[train_idx, ]
test_data <- clean_data[-train_idx, ]
logit_model <- glm(Attrition ~ ., data=train_data, family=binomial)
summary(logit_model)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_data)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.502e+00 2.491e+00 -1.004 0.315183
## Age -3.581e-02 2.049e-02 -1.748 0.080467 .
## BusinessTravelTravel_Frequently 2.072e+00 6.535e-01 3.170 0.001524 **
## BusinessTravelTravel_Rarely 1.061e+00 6.102e-01 1.739 0.081972 .
## DailyRate -1.275e-04 3.439e-04 -0.371 0.710899
## DepartmentResearch & Development 2.805e-01 1.277e+00 0.220 0.826080
## DepartmentSales 8.897e-01 3.134e+03 0.000 0.999773
## DistanceFromHome 6.239e-02 1.741e-02 3.583 0.000340 ***
## Education 3.387e-01 1.443e-01 2.347 0.018923 *
## EducationFieldLife Sciences -1.969e-01 1.915e+00 -0.103 0.918123
## EducationFieldMarketing 7.147e-02 1.976e+00 0.036 0.971141
## EducationFieldMedical 3.934e-02 1.932e+00 0.020 0.983758
## EducationFieldOther -1.496e-01 1.967e+00 -0.076 0.939379
## EducationFieldTechnical Degree 8.453e-01 1.962e+00 0.431 0.666576
## EnvironmentSatisfaction -5.402e-01 1.303e-01 -4.144 3.41e-05 ***
## GenderMale 2.378e-01 2.951e-01 0.806 0.420376
## HourlyRate 3.076e-03 7.023e-03 0.438 0.661424
## JobInvolvement -5.331e-01 1.959e-01 -2.721 0.006507 **
## JobLevel 2.577e-01 5.062e-01 0.509 0.610661
## JobRoleHuman Resources NA NA NA NA
## JobRoleLaboratory Technician 1.689e+00 7.624e-01 2.216 0.026721 *
## JobRoleManager -1.405e+01 1.662e+03 -0.008 0.993254
## JobRoleManufacturing Director -8.038e-02 8.238e-01 -0.098 0.922266
## JobRoleResearch Director -1.571e+01 1.024e+03 -0.015 0.987757
## JobRoleResearch Scientist 6.527e-01 7.813e-01 0.835 0.403518
## JobRoleSales Executive 1.835e-01 3.134e+03 0.000 0.999953
## JobRoleSales Representative 1.445e+00 3.134e+03 0.000 0.999632
## JobSatisfaction -6.387e-01 1.304e-01 -4.897 9.71e-07 ***
## MaritalStatusMarried -3.864e-02 4.201e-01 -0.092 0.926720
## MaritalStatusSingle 9.682e-01 5.316e-01 1.821 0.068544 .
## MonthlyIncome 9.596e-05 1.330e-04 0.721 0.470694
## MonthlyRate 2.802e-05 1.991e-05 1.407 0.159383
## NumCompaniesWorked 2.573e-01 7.410e-02 3.473 0.000515 ***
## OverTimeYes 1.829e+00 3.068e-01 5.961 2.50e-09 ***
## PercentSalaryHike -8.254e-02 6.221e-02 -1.327 0.184594
## PerformanceRating 1.289e+00 6.165e-01 2.091 0.036570 *
## RelationshipSatisfaction -3.800e-01 1.348e-01 -2.818 0.004829 **
## StockOptionLevel -3.044e-01 2.264e-01 -1.345 0.178692
## TotalWorkingYears -1.485e-01 4.933e-02 -3.010 0.002609 **
## TrainingTimesLastYear 1.696e-02 1.891e-01 0.090 0.928538
## WorkLifeBalance -2.863e-01 2.045e-01 -1.400 0.161591
## YearsAtCompany 6.114e-02 1.219e-01 0.501 0.616095
## YearsInCurrentRole -2.161e-01 1.096e-01 -1.972 0.048584 *
## YearsSinceLastPromotion 1.725e-01 1.479e-01 1.166 0.243636
## YearsWithCurrManager -8.094e-02 1.112e-01 -0.728 0.466695
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 598.49 on 621 degrees of freedom
## Residual deviance: 364.17 on 578 degrees of freedom
## AIC: 452.17
##
## Number of Fisher Scoring iterations: 16
anova(logit_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Attrition
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 621 598.49
## Age 1 18.268 620 580.22 1.919e-05 ***
## BusinessTravel 2 13.028 618 567.19 0.0014829 **
## DailyRate 1 0.739 617 566.45 0.3899481
## Department 2 6.222 615 560.23 0.0445488 *
## DistanceFromHome 1 4.962 614 555.27 0.0259097 *
## Education 1 1.843 613 553.43 0.1745420
## EducationField 5 2.756 608 550.67 0.7375249
## EnvironmentSatisfaction 1 12.723 607 537.95 0.0003612 ***
## Gender 1 1.123 606 536.83 0.2893104
## HourlyRate 1 0.122 605 536.70 0.7273830
## JobInvolvement 1 11.267 604 525.44 0.0007888 ***
## JobLevel 1 11.254 603 514.18 0.0007946 ***
## JobRole 7 14.598 596 499.58 0.0415100 *
## JobSatisfaction 1 18.765 595 480.82 1.478e-05 ***
## MaritalStatus 2 21.883 593 458.94 1.771e-05 ***
## MonthlyIncome 1 0.090 592 458.85 0.7646387
## MonthlyRate 1 1.131 591 457.71 0.2875058
## NumCompaniesWorked 1 8.868 590 448.85 0.0029019 **
## OverTime 1 40.715 589 408.13 1.761e-10 ***
## PercentSalaryHike 1 0.244 588 407.89 0.6213134
## PerformanceRating 1 3.624 587 404.26 0.0569597 .
## RelationshipSatisfaction 1 5.949 586 398.31 0.0147250 *
## StockOptionLevel 1 1.752 585 396.56 0.1855692
## TotalWorkingYears 1 20.088 584 376.47 7.397e-06 ***
## TrainingTimesLastYear 1 0.004 583 376.47 0.9465288
## WorkLifeBalance 1 1.904 582 374.57 0.1676249
## YearsAtCompany 1 5.657 581 368.91 0.0173872 *
## YearsInCurrentRole 1 2.895 580 366.01 0.0888674 .
## YearsSinceLastPromotion 1 1.319 579 364.70 0.2508314
## YearsWithCurrManager 1 0.521 578 364.17 0.4705305
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_data$pred_prob_logit <- predict(logit_model, newdata=test_data, type="response")
test_data$pred_class_logit <- factor(ifelse(test_data$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit <- confusionMatrix(test_data$pred_class_logit, test_data$Attrition)
print(conf_matrix_logit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 214 22
## Yes 13 18
##
## Accuracy : 0.8689
## 95% CI : (0.8224, 0.907)
## No Information Rate : 0.8502
## P-Value [Acc > NIR] : 0.2226
##
## Kappa : 0.4328
##
## Mcnemar's Test P-Value : 0.1763
##
## Sensitivity : 0.9427
## Specificity : 0.4500
## Pos Pred Value : 0.9068
## Neg Pred Value : 0.5806
## Prevalence : 0.8502
## Detection Rate : 0.8015
## Detection Prevalence : 0.8839
## Balanced Accuracy : 0.6964
##
## 'Positive' Class : No
##
cmlr <- as.table(conf_matrix_logit$table)
cmlr_df <- as.data.frame(cmlr)
colnames(cmlr_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmlr_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix Logistic Regression",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmlogit <- conf_matrix_logit$table
TP <- cmlogit["Yes", "Yes"]
TN <- cmlogit["No", "No"]
FP <- cmlogit["Yes", "No"]
FN <- cmlogit["No", "Yes"]
accuracy <- (TP + TN) / sum(cmlogit)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8689
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.5806
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.45
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.507
normality_results <- sapply(numeric_cols, function(x) {
yes_data <- na.omit(train_data %>% filter(Attrition == "Yes") %>% pull(x))
no_data <- na.omit(train_data %>% filter(Attrition == "No") %>% pull(x))
p_yes <- if(length(yes_data) >= 4) nortest::lillie.test(yes_data)$p.value else NA
p_no <- if(length(no_data) >= 4) nortest::lillie.test(no_data)$p.value else NA
c(Yes = p_yes, No = p_no)
})
print("P-value uji normalitas Lilliefors tiap variabel:")
## [1] "P-value uji normalitas Lilliefors tiap variabel:"
print(normality_results)
## Age DailyRate DistanceFromHome HourlyRate MonthlyIncome
## Yes 2.068406e-04 6.849324e-04 1.524981e-06 7.753191e-02 1.161827e-11
## No 5.830763e-12 2.922622e-09 5.533947e-33 7.190823e-07 4.190625e-21
## MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## Yes 4.870166e-02 1.163422e-28 1.494536e-10 3.315593e-81
## No 2.235624e-07 5.624869e-87 3.382705e-40 0.000000e+00
## TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsInCurrentRole
## Yes 6.451584e-05 6.608783e-32 7.040844e-11 1.212209e-16
## No 8.270227e-23 4.387245e-96 1.875544e-32 9.302912e-59
## YearsSinceLastPromotion YearsWithCurrManager
## Yes 2.599799e-31 1.766402e-16
## No 3.797598e-99 1.066115e-58
boxm_test <- boxM(train_data[, numeric_cols], train_data$Attrition)
print(boxm_test)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_data[, numeric_cols]
## Chi-Sq (approx.) = 204.51, df = 120, p-value = 2.411e-06
lda_model <- lda(Attrition ~ ., data=train_data)
## Warning in lda.default(x, grouping, ...): variables are collinear
pred_lda <- predict(lda_model, newdata=test_data)
conf_matrix_lda <- confusionMatrix(pred_lda$class, test_data$Attrition)
print(conf_matrix_lda)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 216 22
## Yes 11 18
##
## Accuracy : 0.8764
## 95% CI : (0.8308, 0.9134)
## No Information Rate : 0.8502
## P-Value [Acc > NIR] : 0.13124
##
## Kappa : 0.4528
##
## Mcnemar's Test P-Value : 0.08172
##
## Sensitivity : 0.9515
## Specificity : 0.4500
## Pos Pred Value : 0.9076
## Neg Pred Value : 0.6207
## Prevalence : 0.8502
## Detection Rate : 0.8090
## Detection Prevalence : 0.8914
## Balanced Accuracy : 0.7008
##
## 'Positive' Class : No
##
cmlda <- as.table(conf_matrix_lda$table)
cmlda_df <- as.data.frame(cmlda)
colnames(cmlda_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmlda_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix LDA",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmlda <- conf_matrix_lda$table
TP <- cmlda["Yes", "Yes"]
TN <- cmlda["No", "No"]
FP <- cmlda["Yes", "No"]
FN <- cmlda["No", "Yes"]
accuracy <- (TP + TN) / sum(cmlda)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8764
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.6207
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.45
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.5217
num_scaled <- scale(clean_data[, numeric_cols])
pca_result <- prcomp(num_scaled, center=TRUE, scale.=TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8432 1.3651 1.3274 1.05424 1.0276 1.00303 0.96992
## Proportion of Variance 0.2265 0.1242 0.1175 0.07409 0.0704 0.06707 0.06272
## Cumulative Proportion 0.2265 0.3507 0.4682 0.54228 0.6127 0.67976 0.74247
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.95060 0.9116 0.81008 0.77588 0.53692 0.50680 0.47347
## Proportion of Variance 0.06024 0.0554 0.04375 0.04013 0.01922 0.01712 0.01495
## Cumulative Proportion 0.80272 0.8581 0.90187 0.94200 0.96122 0.97834 0.99329
## PC15
## Standard deviation 0.31735
## Proportion of Variance 0.00671
## Cumulative Proportion 1.00000
eigenvalues <- pca_result$sdev^2
eigenvalues
## [1] 3.3973886 1.8635514 1.7619062 1.1114121 1.0560111 1.0060737 0.9407477
## [8] 0.9036475 0.8310322 0.6562262 0.6019880 0.2882780 0.2568463 0.2241786
## [15] 0.1007123
plot(eigenvalues, type = "b", pch = 19,
xlab = "Komponen Utama",
ylab = "Eigenvalue",
main = "Scree Plot dengan Elbow Method")
# Tambahkan garis horizontal di eigenvalue = 1 (kriteria Kaiser)
abline(h = 1, col = "red", lty = 2)
pca_result <- prcomp(train_data[, numeric_cols], scale. = TRUE)
attrition_labels <- train_data$Attrition
library(ggfortify)
autoplot(pca_result, data = train_data, colour = 'Attrition',
loadings = TRUE, loadings.label = TRUE, loadings.label.size = 3) +
ggtitle("Biplot PCA") +
theme_minimal()
n_pca <- sum(eigenvalues > 1)
print(paste("Jumlah komponen PCA (eigenvalue>1):", n_pca))
## [1] "Jumlah komponen PCA (eigenvalue>1): 6"
pca_scores <- pca_result$x[, 1:n_pca]
pca_data <- data.frame(pca_scores, Attrition = train_data$Attrition)
loadings <- pca_result$rotation
print(round(loadings, 3))
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Age 0.186 -0.479 -0.139 -0.138 0.104 0.110 0.014 0.110
## DailyRate 0.054 -0.101 -0.044 -0.276 -0.474 -0.565 0.369 -0.435
## DistanceFromHome 0.062 -0.075 -0.015 0.371 -0.506 0.447 -0.306 -0.457
## HourlyRate 0.027 -0.049 -0.039 -0.319 -0.663 0.214 0.007 0.611
## MonthlyIncome 0.297 -0.319 0.027 0.195 0.018 0.033 -0.080 -0.027
## MonthlyRate -0.029 -0.098 0.091 0.551 -0.040 0.123 0.759 0.221
## NumCompaniesWorked 0.058 -0.510 -0.149 -0.096 0.140 -0.124 -0.023 0.013
## PercentSalaryHike 0.057 0.144 -0.681 0.079 0.014 0.004 0.041 -0.046
## PerformanceRating 0.053 0.180 -0.673 0.087 0.005 0.011 0.054 0.066
## TotalWorkingYears 0.391 -0.380 -0.060 0.000 0.059 0.026 -0.034 -0.003
## TrainingTimesLastYear -0.014 0.024 -0.015 -0.508 0.165 0.620 0.408 -0.354
## YearsAtCompany 0.481 0.217 0.099 0.027 -0.021 -0.020 0.061 -0.018
## YearsInCurrentRole 0.462 0.206 0.085 0.011 0.005 -0.002 0.090 -0.007
## YearsSinceLastPromotion 0.255 0.206 0.043 -0.204 0.124 0.046 -0.055 0.183
## YearsWithCurrManager 0.443 0.226 0.075 0.016 -0.011 -0.058 0.001 -0.024
## PC9 PC10 PC11 PC12 PC13 PC14 PC15
## Age 0.014 -0.022 -0.745 0.321 -0.055 0.011 0.030
## DailyRate -0.084 0.157 -0.069 0.015 0.011 0.047 -0.012
## DistanceFromHome -0.285 -0.081 -0.068 0.034 -0.005 0.032 0.002
## HourlyRate 0.105 -0.037 0.139 0.005 0.003 -0.046 -0.003
## MonthlyIncome 0.340 0.611 0.377 0.340 -0.001 0.129 -0.004
## MonthlyRate -0.174 -0.004 -0.012 0.002 0.068 -0.028 -0.008
## NumCompaniesWorked -0.357 -0.490 0.496 0.212 -0.036 0.046 0.083
## PercentSalaryHike 0.016 0.092 0.067 0.107 -0.022 -0.690 0.026
## PerformanceRating 0.032 -0.018 -0.015 -0.104 0.020 0.695 -0.015
## TotalWorkingYears 0.037 0.056 0.000 -0.804 0.132 -0.114 -0.115
## TrainingTimesLastYear 0.104 0.019 0.150 0.008 0.048 0.034 0.000
## YearsAtCompany 0.102 -0.159 -0.013 -0.018 -0.081 0.005 0.814
## YearsInCurrentRole 0.019 -0.176 0.029 0.055 -0.700 -0.001 -0.451
## YearsSinceLastPromotion -0.767 0.443 -0.005 0.041 0.093 0.030 0.001
## YearsWithCurrManager 0.120 -0.298 -0.012 0.248 0.682 -0.022 -0.335
train_pca <- pca_data[train_idx, ]
test_pca <- pca_data[-train_idx, ]
logit_pca <- glm(Attrition ~ ., data=train_pca, family=binomial)
summary(logit_pca)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_pca)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.63120 0.14262 -11.437 < 2e-16 ***
## PC1 -0.38560 0.08323 -4.633 3.61e-06 ***
## PC2 -0.01245 0.09428 -0.132 0.8949
## PC3 -0.11542 0.09079 -1.271 0.2036
## PC4 0.20217 0.12178 1.660 0.0969 .
## PC5 -0.23888 0.12280 -1.945 0.0517 .
## PC6 0.11055 0.12716 0.869 0.3846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.91 on 444 degrees of freedom
## Residual deviance: 402.04 on 438 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 416.04
##
## Number of Fisher Scoring iterations: 5
test_pca$pred_prob_logit <- predict(logit_pca, newdata=test_pca, type="response")
test_pca$pred_class_logit <- factor(ifelse(test_pca$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit_pca <- confusionMatrix(test_pca$pred_class_logit, test_pca$Attrition)
print(conf_matrix_logit_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 147 28
## Yes 0 2
##
## Accuracy : 0.8418
## 95% CI : (0.7795, 0.8922)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : 0.3897
##
## Kappa : 0.1061
##
## Mcnemar's Test P-Value : 3.352e-07
##
## Sensitivity : 1.00000
## Specificity : 0.06667
## Pos Pred Value : 0.84000
## Neg Pred Value : 1.00000
## Prevalence : 0.83051
## Detection Rate : 0.83051
## Detection Prevalence : 0.98870
## Balanced Accuracy : 0.53333
##
## 'Positive' Class : No
##
cmlogpca <- as.table(conf_matrix_logit_pca$table)
cmlogpca_df <- as.data.frame(cmlogpca)
colnames(cmlogpca_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmlogpca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix Logistic Regression dengan PCA",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmlogpca <- conf_matrix_logit_pca$table
TP <- cmlogpca["Yes", "Yes"]
TN <- cmlogpca["No", "No"]
FP <- cmlogpca["Yes", "No"]
FN <- cmlogpca["No", "Yes"]
accuracy <- (TP + TN) / sum(cmlogpca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8418
cat("Precision:", round(precision, 4), "\n")
## Precision: 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.0667
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.125
lda_pca <- lda(Attrition ~ ., data=train_pca)
pred_lda_pca <- predict(lda_pca, newdata=test_pca)
conf_matrix_lda_pca <- confusionMatrix(pred_lda_pca$class, test_pca$Attrition)
print(conf_matrix_lda_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 147 28
## Yes 0 2
##
## Accuracy : 0.8418
## 95% CI : (0.7795, 0.8922)
## No Information Rate : 0.8305
## P-Value [Acc > NIR] : 0.3897
##
## Kappa : 0.1061
##
## Mcnemar's Test P-Value : 3.352e-07
##
## Sensitivity : 1.00000
## Specificity : 0.06667
## Pos Pred Value : 0.84000
## Neg Pred Value : 1.00000
## Prevalence : 0.83051
## Detection Rate : 0.83051
## Detection Prevalence : 0.98870
## Balanced Accuracy : 0.53333
##
## 'Positive' Class : No
##
cmldapca <- as.table(conf_matrix_lda_pca$table)
cmldapca_df <- as.data.frame(cmldapca)
colnames(cmldapca_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmldapca_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix LDA dengan PCA",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmldapca <- conf_matrix_lda_pca$table
TP <- cmldapca["Yes", "Yes"]
TN <- cmldapca["No", "No"]
FP <- cmldapca["Yes", "No"]
FN <- cmldapca["No", "Yes"]
accuracy <- (TP + TN) / sum(cmldapca)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.8418
cat("Precision:", round(precision, 4), "\n")
## Precision: 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.0667
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.125
fa_result <- fa(train_data[, numeric_cols], nfactors=n_pca, rotate="varimax", fm="ml")
print(fa_result)
## Factor Analysis using method = ml
## Call: fa(r = train_data[, numeric_cols], nfactors = n_pca, rotate = "varimax",
## fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML5 ML6 ML3 ML4 h2 u2 com
## Age 0.04 0.02 0.62 -0.01 0.12 0.01 0.397 0.603 1.1
## DailyRate 0.05 0.01 0.11 -0.08 -0.02 0.04 0.022 0.978 2.7
## DistanceFromHome 0.04 0.02 0.08 0.16 -0.07 0.01 0.039 0.961 2.1
## HourlyRate 0.02 0.01 0.04 -0.04 0.09 0.00 0.012 0.988 2.0
## MonthlyIncome 0.26 -0.03 0.49 0.41 -0.10 0.04 0.484 0.516 2.6
## MonthlyRate -0.04 -0.05 -0.02 0.19 -0.03 -0.06 0.046 0.954 1.5
## NumCompaniesWorked -0.14 0.01 0.54 -0.06 0.06 -0.08 0.324 0.676 1.2
## PercentSalaryHike 0.03 0.94 0.08 -0.12 -0.28 0.06 0.995 0.005 1.2
## PerformanceRating 0.04 0.93 -0.05 0.04 0.36 -0.05 0.995 0.005 1.3
## TotalWorkingYears 0.38 0.01 0.83 0.20 0.09 0.07 0.890 0.110 1.6
## TrainingTimesLastYear 0.01 -0.01 0.02 -0.17 0.00 -0.03 0.031 0.969 1.1
## YearsAtCompany 0.97 0.01 0.07 0.10 0.04 0.21 0.995 0.005 1.1
## YearsInCurrentRole 0.95 0.02 0.10 0.04 -0.04 -0.29 0.995 0.005 1.2
## YearsSinceLastPromotion 0.42 0.02 0.06 -0.12 0.02 -0.04 0.193 0.807 1.2
## YearsWithCurrManager 0.82 0.03 0.06 0.04 0.05 0.17 0.715 0.285 1.1
##
## ML1 ML2 ML5 ML6 ML3 ML4
## SS loadings 2.93 1.75 1.65 0.35 0.27 0.18
## Proportion Var 0.20 0.12 0.11 0.02 0.02 0.01
## Cumulative Var 0.20 0.31 0.42 0.45 0.46 0.48
## Proportion Explained 0.41 0.25 0.23 0.05 0.04 0.03
## Cumulative Proportion 0.41 0.66 0.89 0.94 0.97 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 6 factors are sufficient.
##
## df null model = 105 with the objective function = 5.21 with Chi Square = 3203.9
## df of the model are 30 and the objective function was 0.05
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 622 with the empirical chi square 33.62 with prob < 0.3
## The total n.obs was 622 with Likelihood Chi Square = 28.88 with prob < 0.52
##
## Tucker Lewis Index of factoring reliability = 1.001
## RMSEA index = 0 and the 90 % confidence intervals are 0 0.029
## BIC = -164.11
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## ML1 ML2 ML5 ML6 ML3
## Correlation of (regression) scores with factors 1.00 1.00 0.93 0.60 0.96
## Multiple R square of scores with factors 0.99 1.00 0.87 0.37 0.92
## Minimum correlation of possible factor scores 0.99 0.99 0.74 -0.27 0.84
## ML4
## Correlation of (regression) scores with factors 0.98
## Multiple R square of scores with factors 0.96
## Minimum correlation of possible factor scores 0.92
fa_facto <- PCA(train_data[, numeric_cols], graph = FALSE)
fviz_pca_biplot(fa_facto,
label = "var",
habillage = train_data$Attrition,
palette = "jco",
addEllipses = FALSE,
title = "Biplot FA")
fa_scores <- factor.scores(train_data[, numeric_cols], fa_result)$scores
fa_data <- data.frame(fa_scores, Attrition = train_data$Attrition)
logit_fa <- glm(Attrition ~ ., data=fa_data, family=binomial)
summary(logit_fa)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = fa_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.69446 0.12455 -13.605 < 2e-16 ***
## ML1 -0.77288 0.14502 -5.329 9.85e-08 ***
## ML2 0.09769 0.10226 0.955 0.339399
## ML5 -0.39425 0.11047 -3.569 0.000359 ***
## ML6 0.04193 0.11454 0.366 0.714323
## ML3 0.09593 0.10977 0.874 0.382161
## ML4 -0.16581 0.13539 -1.225 0.220687
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 598.49 on 621 degrees of freedom
## Residual deviance: 545.42 on 615 degrees of freedom
## AIC: 559.42
##
## Number of Fisher Scoring iterations: 5
fa_loadings <- fa_result$loadings
train_means <- attr(scale(train_data[, numeric_cols]), "scaled:center")
train_sds <- attr(scale(train_data[, numeric_cols]), "scaled:scale")
test_scaled <- scale(test_data[, numeric_cols], center = train_means, scale = train_sds)
test_fa_scores <- as.matrix(test_scaled) %*% as.matrix(fa_loadings)
colnames(test_fa_scores) <- colnames(fa_scores)
test_fa <- data.frame(test_fa_scores, Attrition = test_data$Attrition)
print(fa_result$loadings)
##
## Loadings:
## ML1 ML2 ML5 ML6 ML3 ML4
## Age 0.616 0.121
## DailyRate 0.109
## DistanceFromHome 0.161
## HourlyRate
## MonthlyIncome 0.258 0.487 0.408 -0.103
## MonthlyRate 0.194
## NumCompaniesWorked -0.141 0.538
## PercentSalaryHike 0.942 -0.121 -0.285
## PerformanceRating 0.927 0.355
## TotalWorkingYears 0.385 0.831 0.196
## TrainingTimesLastYear -0.173
## YearsAtCompany 0.968 0.205
## YearsInCurrentRole 0.949 -0.287
## YearsSinceLastPromotion 0.416 -0.122
## YearsWithCurrManager 0.824 0.167
##
## ML1 ML2 ML5 ML6 ML3 ML4
## SS loadings 2.933 1.755 1.647 0.354 0.265 0.179
## Proportion Var 0.196 0.117 0.110 0.024 0.018 0.012
## Cumulative Var 0.196 0.313 0.422 0.446 0.464 0.476
test_fa$pred_prob_logit <- predict(logit_fa, newdata = test_fa, type = "response")
test_fa$pred_class_logit <- factor(ifelse(test_fa$pred_prob_logit > 0.5, "Yes", "No"))
conf_matrix_logit_fa <- confusionMatrix(test_fa$pred_class_logit, test_fa$Attrition)
print(conf_matrix_logit_fa)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 172 20
## Yes 55 20
##
## Accuracy : 0.7191
## 95% CI : (0.6611, 0.7722)
## No Information Rate : 0.8502
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1894
##
## Mcnemar's Test P-Value : 8.638e-05
##
## Sensitivity : 0.7577
## Specificity : 0.5000
## Pos Pred Value : 0.8958
## Neg Pred Value : 0.2667
## Prevalence : 0.8502
## Detection Rate : 0.6442
## Detection Prevalence : 0.7191
## Balanced Accuracy : 0.6289
##
## 'Positive' Class : No
##
cmlogfa <- as.table(conf_matrix_logit_fa$table)
cmlogfa_df <- as.data.frame(cmlogfa)
colnames(cmlogfa_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmlogfa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix Logistic Regression dengan FA",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmlogfa <- conf_matrix_logit_fa$table
TP <- cmlogfa["Yes", "Yes"]
TN <- cmlogfa["No", "No"]
FP <- cmlogfa["Yes", "No"]
FN <- cmlogfa["No", "Yes"]
accuracy <- (TP + TN) / sum(cmlogfa)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.7191
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.2667
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.5
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.3478
lda_fa <- lda(Attrition ~ ., data = fa_data)
lda_pred_fa <- predict(lda_fa, newdata = test_fa)
conf_matrix_lda_fa <- confusionMatrix(lda_pred_fa$class, test_fa$Attrition)
print(conf_matrix_lda_fa)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 182 23
## Yes 45 17
##
## Accuracy : 0.7453
## 95% CI : (0.6886, 0.7965)
## No Information Rate : 0.8502
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.1849
##
## Mcnemar's Test P-Value : 0.01088
##
## Sensitivity : 0.8018
## Specificity : 0.4250
## Pos Pred Value : 0.8878
## Neg Pred Value : 0.2742
## Prevalence : 0.8502
## Detection Rate : 0.6816
## Detection Prevalence : 0.7678
## Balanced Accuracy : 0.6134
##
## 'Positive' Class : No
##
cmldafa <- as.table(conf_matrix_lda_fa$table)
cmldafa_df <- as.data.frame(cmldafa)
colnames(cmldafa_df) <- c("Predicted", "Actual", "Freq")
ggplot(cmldafa_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix LDA dengan FA",
x = "Actual Label",
y = "Predicted Label") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
cmldafa <- conf_matrix_lda_fa$table
TP <- cmldafa["Yes", "Yes"]
TN <- cmldafa["No", "No"]
FP <- cmldafa["Yes", "No"]
FN <- cmldafa["No", "Yes"]
accuracy <- (TP + TN) / sum(cmldafa)
precision <- TP / cmldafa
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Akurasi:", round(accuracy, 4), "\n")
## Akurasi: 0.7453
cat("Precision:", round(precision, 4), "\n")
## Precision: 0.0934 0.3778 0.7391 1
cat("Recall:", round(recall, 4), "\n")
## Recall: 0.425
cat("F1-score:", round(f1_score, 4), "\n")
## F1-score: 0.1532 0.4 0.5397 0.5965
num_data_scaled <- scale(clean_data[, numeric_cols])
dist_matrix <- dist(num_data_scaled, method = "euclidean")
mds_result <- cmdscale(dist_matrix, k = 2, eig = TRUE)
mds_coords <- as.data.frame(mds_result$points)
names(mds_coords) <- c("Dim1", "Dim2")
mds_coords$Attrition <- clean_data$Attrition
p_mds <- ggplot(mds_coords, aes(x = Dim1, y = Dim2, color = Attrition)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "Perceptual Mapping - MDS (2D)",
x = "Dimensi 1",
y = "Dimensi 2") +
theme_minimal() +
theme(legend.position = "right")
print(p_mds)
cat("Eigenvalues (varians tiap PC):\n")
## Eigenvalues (varians tiap PC):
print(round(eigenvalues, 3))
## [1] 3.397 1.864 1.762 1.111 1.056 1.006 0.941 0.904 0.831 0.656 0.602 0.288
## [13] 0.257 0.224 0.101
cat("Proporsi varians tiap PC:\n")
## Proporsi varians tiap PC:
print(round(eigenvalues / sum(eigenvalues), 3))
## [1] 0.226 0.124 0.117 0.074 0.070 0.067 0.063 0.060 0.055 0.044 0.040 0.019
## [13] 0.017 0.015 0.007
cat("Proporsi kumulatif varians:\n")
## Proporsi kumulatif varians:
print(round(cumsum(eigenvalues / sum(eigenvalues)), 3))
## [1] 0.226 0.351 0.468 0.542 0.613 0.680 0.742 0.803 0.858 0.902 0.942 0.961
## [13] 0.978 0.993 1.000
p_biplot <- fviz_pca_biplot(pca_result,
geom.ind = "point",
habillage = train_data$Attrition,
addEllipses = TRUE,
ellipse.level = 0.95,
palette = c("#00AFBB", "#FC4E07"),
repel = TRUE) +
ggtitle("Biplot PCA dengan Pewarnaan berdasarkan Attrition") +
theme_minimal()
print(p_biplot)