#libraries to use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(dummy)
## dummy 0.1.3
## dummyNews()
attrition <- read.csv("Employee_Attrition.csv")
str(attrition)
## 'data.frame': 1470 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr "Female" "Male" "Male" "Female" ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
names(attrition)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EmployeeCount" "EmployeeNumber"
## [11] "EnvironmentSatisfaction" "Gender"
## [13] "HourlyRate" "JobInvolvement"
## [15] "JobLevel" "JobRole"
## [17] "JobSatisfaction" "MaritalStatus"
## [19] "MonthlyIncome" "MonthlyRate"
## [21] "NumCompaniesWorked" "Over18"
## [23] "OverTime" "PercentSalaryHike"
## [25] "PerformanceRating" "RelationshipSatisfaction"
## [27] "StandardHours" "StockOptionLevel"
## [29] "TotalWorkingYears" "TrainingTimesLastYear"
## [31] "WorkLifeBalance" "YearsAtCompany"
## [33] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [35] "YearsWithCurrManager"
colSums(is.na(attrition))
## Age Attrition BusinessTravel
## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EmployeeCount
## 0 0 0
## EmployeeNumber EnvironmentSatisfaction Gender
## 0 0 0
## HourlyRate JobInvolvement JobLevel
## 0 0 0
## JobRole JobSatisfaction MaritalStatus
## 0 0 0
## MonthlyIncome MonthlyRate NumCompaniesWorked
## 0 0 0
## Over18 OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StandardHours
## 0 0 0
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 0 0 0
## YearsSinceLastPromotion YearsWithCurrManager
## 0 0
No columns with missing values
# Find columns with only one distinct value
anomaly_columns <- sapply(attrition, function(x) length(unique(x))) == 1
anomaly_columns
## Age Attrition BusinessTravel
## FALSE FALSE FALSE
## DailyRate Department DistanceFromHome
## FALSE FALSE FALSE
## Education EducationField EmployeeCount
## FALSE FALSE TRUE
## EmployeeNumber EnvironmentSatisfaction Gender
## FALSE FALSE FALSE
## HourlyRate JobInvolvement JobLevel
## FALSE FALSE FALSE
## JobRole JobSatisfaction MaritalStatus
## FALSE FALSE FALSE
## MonthlyIncome MonthlyRate NumCompaniesWorked
## FALSE FALSE FALSE
## Over18 OverTime PercentSalaryHike
## TRUE FALSE FALSE
## PerformanceRating RelationshipSatisfaction StandardHours
## FALSE FALSE TRUE
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## FALSE FALSE FALSE
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## FALSE FALSE FALSE
## YearsSinceLastPromotion YearsWithCurrManager
## FALSE FALSE
Over18 column
All the values has Y response thus we can drop the column
# Remove the "Over18" column
attrition <- subset(attrition, select = -Over18)
# Convert numerical features to factors/categorical variables
attrition$Education <- factor(attrition$Education)
attrition$EnvironmentSatisfaction <- factor(attrition$EnvironmentSatisfaction)
attrition$JobInvolvement <- factor(attrition$JobInvolvement)
attrition$JobLevel <- factor(attrition$JobLevel)
attrition$JobSatisfaction <- factor(attrition$JobSatisfaction)
attrition$RelationshipSatisfaction <- factor(attrition$RelationshipSatisfaction)
attrition$StockOptionLevel <- factor(attrition$StockOptionLevel)
attrition$WorkLifeBalance <- factor(attrition$WorkLifeBalance)
attrition %>%
select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate,
PercentSalaryHike, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole,
YearsSinceLastPromotion, YearsWithCurrManager) %>%
gather(key = "Variable", value = "Value") %>%
ggplot(aes(x = Value)) +
geom_histogram(fill = "skyblue", color = "black", bins = 20) +
facet_wrap(~ Variable, scales = "free") +
theme_gray()
# BusinessTravel - Bar Graph
attrition %>%
count(BusinessTravel) %>%
ggplot(aes(x = BusinessTravel, y = n, fill = BusinessTravel)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Business Travel", y = "Count", fill = "Business Travel") +
theme_minimal()
# Department - Bar Graph
attrition %>%
count(Department) %>%
ggplot(aes(x = Department, y = n, fill = Department)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Department", y = "Count", fill = "Department") +
theme_minimal()
# EducationField - Bar Graph
attrition %>%
count(EducationField) %>%
ggplot(aes(x = EducationField, y = n, fill = EducationField)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Education Field", y = "Count", fill = "Education Field") +
theme_minimal()
# Gender - Pie Chart
attrition %>%
count(Gender) %>%
ggplot(aes(x = "", y = n, fill = Gender)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
labs(x = "", y = "Count", fill = "Gender") +
theme_minimal()
# JobRole - Bar Graph
attrition %>%
count(JobRole) %>%
ggplot(aes(x = JobRole, y = n, fill = JobRole)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Job Role", y = "Count", fill = "Job Role") +
theme_minimal()
# MaritalStatus - Bar Graph
attrition %>%
count(MaritalStatus) %>%
ggplot(aes(x = MaritalStatus, y = n, fill = MaritalStatus)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Marital Status", y = "Count", fill = "Marital Status") +
theme_minimal()
# OverTime - Pie Chart
attrition %>%
count(OverTime) %>%
ggplot(aes(x = "", y = n, fill = OverTime)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
labs(x = "", y = "Count", fill = "OverTime") +
theme_minimal()
attrition %>%
count(BusinessTravel) %>%
ggplot(aes(x = BusinessTravel, y = n, fill = BusinessTravel)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
labs(x = "Business Travel", y = "Count", fill = "Business Travel") +
theme_minimal()
# Select numerical features
numerical_features <- c("Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
"MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
"TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
"YearsSinceLastPromotion", "YearsWithCurrManager")
# Compute the correlation matrix
cor_matrix <- cor(attrition[numerical_features])
# Plot the correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black")
# Identify variables with the highest correlation
highest_correlation <- which(cor_matrix > 0.6 & cor_matrix < 1, arr.ind = TRUE)
highest_correlation_variables <- rownames(cor_matrix)[highest_correlation[, 1]]
# Select numerical features
numerical_features <- c("Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
"MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
"TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
"YearsSinceLastPromotion", "YearsWithCurrManager")
# Compute the correlation matrix
cor_matrix <- cor(attrition[numerical_features])
# Round correlation matrix to two decimal places
cor_matrix <- round(cor_matrix, 2)
# Print the correlation matrix
print(as.data.frame(cor_matrix))
## Age DailyRate DistanceFromHome HourlyRate
## Age 1.00 0.01 0.00 0.02
## DailyRate 0.01 1.00 0.00 0.02
## DistanceFromHome 0.00 0.00 1.00 0.03
## HourlyRate 0.02 0.02 0.03 1.00
## MonthlyIncome 0.50 0.01 -0.02 -0.02
## MonthlyRate 0.03 -0.03 0.03 -0.02
## NumCompaniesWorked 0.30 0.04 -0.03 0.02
## PercentSalaryHike 0.00 0.02 0.04 -0.01
## TotalWorkingYears 0.68 0.01 0.00 0.00
## YearsAtCompany 0.31 -0.03 0.01 -0.02
## YearsInCurrentRole 0.21 0.01 0.02 -0.02
## YearsSinceLastPromotion 0.22 -0.03 0.01 -0.03
## YearsWithCurrManager 0.20 -0.03 0.01 -0.02
## MonthlyIncome MonthlyRate NumCompaniesWorked
## Age 0.50 0.03 0.30
## DailyRate 0.01 -0.03 0.04
## DistanceFromHome -0.02 0.03 -0.03
## HourlyRate -0.02 -0.02 0.02
## MonthlyIncome 1.00 0.03 0.15
## MonthlyRate 0.03 1.00 0.02
## NumCompaniesWorked 0.15 0.02 1.00
## PercentSalaryHike -0.03 -0.01 -0.01
## TotalWorkingYears 0.77 0.03 0.24
## YearsAtCompany 0.51 -0.02 -0.12
## YearsInCurrentRole 0.36 -0.01 -0.09
## YearsSinceLastPromotion 0.34 0.00 -0.04
## YearsWithCurrManager 0.34 -0.04 -0.11
## PercentSalaryHike TotalWorkingYears YearsAtCompany
## Age 0.00 0.68 0.31
## DailyRate 0.02 0.01 -0.03
## DistanceFromHome 0.04 0.00 0.01
## HourlyRate -0.01 0.00 -0.02
## MonthlyIncome -0.03 0.77 0.51
## MonthlyRate -0.01 0.03 -0.02
## NumCompaniesWorked -0.01 0.24 -0.12
## PercentSalaryHike 1.00 -0.02 -0.04
## TotalWorkingYears -0.02 1.00 0.63
## YearsAtCompany -0.04 0.63 1.00
## YearsInCurrentRole 0.00 0.46 0.76
## YearsSinceLastPromotion -0.02 0.40 0.62
## YearsWithCurrManager -0.01 0.46 0.77
## YearsInCurrentRole YearsSinceLastPromotion
## Age 0.21 0.22
## DailyRate 0.01 -0.03
## DistanceFromHome 0.02 0.01
## HourlyRate -0.02 -0.03
## MonthlyIncome 0.36 0.34
## MonthlyRate -0.01 0.00
## NumCompaniesWorked -0.09 -0.04
## PercentSalaryHike 0.00 -0.02
## TotalWorkingYears 0.46 0.40
## YearsAtCompany 0.76 0.62
## YearsInCurrentRole 1.00 0.55
## YearsSinceLastPromotion 0.55 1.00
## YearsWithCurrManager 0.71 0.51
## YearsWithCurrManager
## Age 0.20
## DailyRate -0.03
## DistanceFromHome 0.01
## HourlyRate -0.02
## MonthlyIncome 0.34
## MonthlyRate -0.04
## NumCompaniesWorked -0.11
## PercentSalaryHike -0.01
## TotalWorkingYears 0.46
## YearsAtCompany 0.77
## YearsInCurrentRole 0.71
## YearsSinceLastPromotion 0.51
## YearsWithCurrManager 1.00
attrition %>%
ggplot(aes(x = Attrition)) +
geom_bar(aes(fill = Attrition)) +
geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
labs(x = "Attrition", y = "Count") +
ggtitle("Distribution of Attrition")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(tidyverse)
attrition %>%
group_by(JobRole) %>%
summarise(AttritionRate = mean(Attrition == "Yes")) %>%
arrange(desc(AttritionRate)) %>%
head(5)
## # A tibble: 5 × 2
## JobRole AttritionRate
## <chr> <dbl>
## 1 Sales Representative 0.398
## 2 Laboratory Technician 0.239
## 3 Human Resources 0.231
## 4 Sales Executive 0.175
## 5 Research Scientist 0.161
attrition %>%
group_by(Gender) %>%
summarise(MeanMonthlyIncome = mean(MonthlyIncome, na.rm = TRUE))
## # A tibble: 2 × 2
## Gender MeanMonthlyIncome
## <chr> <dbl>
## 1 Female 6687.
## 2 Male 6381.
attrition %>%
ggplot(aes(x = Gender, y = MonthlyIncome)) +
geom_boxplot() +
labs(x = "Gender", y = "Monthly Income") +
ggtitle("Gender Disparities in Monthly Income")
attrition_d<- attrition %>%
mutate(
Attrition = recode(Attrition, "Yes" = 1, "No" = 0),
BusinessTravel = recode(BusinessTravel, "Travel_Rarely" = 1, "Travel_Frequently" = 2, "Non-Travel" = 3),
Department = recode(Department, "Sales" = 1, "Research & Development" = 2, "Human Resources" = 3),
EducationField = recode(EducationField, "Life Sciences" = 1, "Medical" = 2, "Marketing" = 3, "Technical Degree" = 4,"Human Resources" =5, "Other" = 6),
Gender = recode(Gender, "Male" = 1, "Female" = 2),
JobRole = recode(JobRole, "Sales Executive" = 1, "Research Scientist" = 2, "Laboratory Technician" = 3, "Manufacturing Director" = 4, "Healthcare Representative" = 5, "Manager" = 6, "Sales Representative" = 7, "Research Director" = 8, "Human Resources" = 9),
MaritalStatus = recode(MaritalStatus, "Single" = 1, "Married" = 2, "Divorced" = 3),
OverTime = recode(OverTime, "Yes" = 1, "No" = 2)
)
model <- glm(Attrition ~ ., data = attrition_d, family = binomial)
Model Performance
Confusion Matrix: A confusion matrix shows the counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN) of the model’s predictions. It provides an overview of the model’s performance in terms of correctly classified and misclassified instances.
# Predicting the response variable using the logistic regression model
attrition$predicted <- predict(model, type = "response")
# Creating a binary classification based on a threshold (e.g., 0.5)
attrition$predicted_class <- ifelse(attrition$predicted >= 0.5, 1, 0)
# Creating a confusion matrix
confusion_matrix <- table(attrition$Attrition, attrition$predicted_class)
confusion_matrix
##
## 0 1
## No 1200 33
## Yes 126 111
Accuracy measures the proportion of correctly classified instances out of the total number of instances.
# Calculating accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.8918367
Precision and Recall: Precision measures the proportion of true positives out of the total predicted positives, while recall (also known as sensitivity or true positive rate) measures the proportion of true positives out of the actual positives.
# Calculating precision and recall
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
precision
## [1] 0.7708333
recall
## [1] 0.4683544
F1 Score: The F1 score is the harmonic mean of precision and recall and provides a single metric that balances both metrics.
# Calculating F1 score
f1_score <- 2 * (precision * recall) / (precision + recall)
f1_score
## [1] 0.5826772