#
library(dplyr)
library(readr)
library(ggplot2)
library(caret)
library(rpart)
#
df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)
# iii) CHECKING FOR COLUMNS
cat(colnames(df), sep = "\n")
## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition
# iv) UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET
#DISPLAYING FEW RECORDS OF DATA
knitr::kable(head(df), caption = "First 6 Rows of Dataset")
| Employee_ID | Age | Gender | Marital_Status | Department | Job_Role | Job_Level | Monthly_Income | Hourly_Rate | Years_at_Company | Years_in_Current_Role | Years_Since_Last_Promotion | Work_Life_Balance | Job_Satisfaction | Performance_Rating | Training_Hours_Last_Year | Overtime | Project_Count | Average_Hours_Worked_Per_Week | Absenteeism | Work_Environment_Satisfaction | Relationship_with_Manager | Job_Involvement | Distance_From_Home | Number_of_Companies_Worked | Attrition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 58 | Male | Single | Finance | Manager | 5 | 7332 | 81 | 24 | 12 | 3 | 1 | 3 | 2 | 74 | No | 9 | 48 | 16 | 4 | 1 | 1 | 49 | 3 | No |
| 2 | 48 | Female | Divorced | HR | Assistant | 4 | 6069 | 55 | 18 | 7 | 5 | 1 | 2 | 2 | 24 | Yes | 9 | 57 | 10 | 4 | 1 | 1 | 25 | 1 | No |
| 3 | 34 | Female | Married | Marketing | Manager | 4 | 11485 | 65 | 6 | 4 | 3 | 4 | 5 | 1 | 63 | Yes | 3 | 55 | 1 | 1 | 4 | 3 | 21 | 1 | Yes |
| 4 | 27 | Female | Divorced | HR | Manager | 4 | 18707 | 28 | 12 | 9 | 1 | 1 | 1 | 2 | 4 | No | 9 | 53 | 2 | 3 | 4 | 1 | 46 | 2 | No |
| 5 | 40 | Male | Married | HR | Analyst | 1 | 16398 | 92 | 3 | 9 | 1 | 3 | 4 | 3 | 62 | No | 1 | 54 | 11 | 1 | 1 | 1 | 43 | 4 | No |
| 6 | 58 | Male | Married | Finance | Executive | 3 | 7305 | 63 | 25 | 2 | 3 | 4 | 5 | 3 | 84 | No | 1 | 42 | 11 | 2 | 3 | 4 | 4 | 3 | Yes |
# v) STRUCTURE OF THE DATASET
str(df)
## 'data.frame': 10032 obs. of 26 variables:
## $ Employee_ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num 58 48 34 27 40 58 38 42 30 30 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Marital_Status : chr "Single" "Divorced" "Married" "Divorced" ...
## $ Department : chr "Finance" "HR" "Marketing" "HR" ...
## $ Job_Role : chr "Manager" "Assistant" "Manager" "Manager" ...
## $ Job_Level : num 5 4 4 4 1 3 5 1 4 4 ...
## $ Monthly_Income : num 7332 6069 11485 18707 16398 ...
## $ Hourly_Rate : num 81 55 65 28 92 63 63 41 95 53 ...
## $ Years_at_Company : num 24 18 6 12 3 25 3 16 17 16 ...
## $ Years_in_Current_Role : num 12 7 4 9 9 2 3 8 10 14 ...
## $ Years_Since_Last_Promotion : num 3 5 3 1 1 3 4 0 2 4 ...
## $ Work_Life_Balance : num 1 1 4 1 3 4 4 2 2 1 ...
## $ Job_Satisfaction : num 3 2 5 1 4 5 3 4 3 4 ...
## $ Performance_Rating : num 2 2 1 2 3 3 4 3 3 3 ...
## $ Training_Hours_Last_Year : num 74 24 63 4 62 84 98 75 51 45 ...
## $ Overtime : chr "No" "Yes" "Yes" "No" ...
## $ Project_Count : num 9 9 3 9 1 1 1 3 8 6 ...
## $ Average_Hours_Worked_Per_Week: num 48 57 55 53 54 42 58 45 42 41 ...
## $ Absenteeism : num 16 10 1 2 11 11 16 9 4 12 ...
## $ Work_Environment_Satisfaction: num 4 4 1 3 1 2 3 2 3 4 ...
## $ Relationship_with_Manager : num 1 1 4 4 1 3 3 1 3 2 ...
## $ Job_Involvement : num 1 1 3 1 1 4 4 4 1 1 ...
## $ Distance_From_Home : num 49 25 21 46 43 4 33 3 39 1 ...
## $ Number_of_Companies_Worked : num 3 1 1 2 4 3 1 2 4 4 ...
## $ Attrition : chr "No" "No" "Yes" "No" ...
#now moving further with the data preprocessing ; checking the null, duplicate records and if existing any , removing them to remove the dedundancy.
# i) CHECKING FOR NULL VALUES IF EXIST IN DATASET
#sum(is.na(df))
colSums(is.na(df))
## Employee_ID Age
## 0 0
## Gender Marital_Status
## 0 0
## Department Job_Role
## 0 0
## Job_Level Monthly_Income
## 0 0
## Hourly_Rate Years_at_Company
## 0 0
## Years_in_Current_Role Years_Since_Last_Promotion
## 0 0
## Work_Life_Balance Job_Satisfaction
## 0 0
## Performance_Rating Training_Hours_Last_Year
## 0 0
## Overtime Project_Count
## 0 0
## Average_Hours_Worked_Per_Week Absenteeism
## 0 0
## Work_Environment_Satisfaction Relationship_with_Manager
## 0 0
## Job_Involvement Distance_From_Home
## 0 0
## Number_of_Companies_Worked Attrition
## 0 0
# ii) CHECKING FOR DUPLICACY
sum(duplicated(df))
## [1] 32
# iii) FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS
duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows
## [1] 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192
## [13] 2193 2194 2195 2196 2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028
# iv) REMOVING DUPLICATE RECORDS
df<-df[!duplicated(df), ]
# v) NOW AGAIN CHECHING FOR DUPLICACY
sum(duplicated(df))
## [1] 0
total_employees <- nrow(df)
total_employees
## [1] 10000
:=> After cleaning the dataset , we have 10,000 records in our dataset.
table(df$Attrition)
##
## No Yes
## 8003 1997
attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent
##
## No Yes
## 80.03 19.97
attrition_counts <- as.data.frame(table(df$Attrition))
ggplot(attrition_counts, aes(x = Var1, y = Freq)) +
geom_bar(stat = "identity", fill = "darkorchid4", width = 0.6) +
geom_text(aes(label = Freq),
color = "white",
size = 5,
vjust = 2.5) +
labs(title = "Overall Attrition Distribution",
x = "Attrition",
y = "Number of Employees")
minority_class <- min(attrition_percent)
if(minority_class < 30){
cat(" The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
print("Attrition Rate is affecting the company")
}
## The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.
table(df$Gender)
##
## Female Male
## 5042 4958
prop.table(table(df$Gender)) * 100
##
## Female Male
## 50.42 49.58
prop.table(table(df$Gender, df$Attrition), 1) * 100
##
## No Yes
## Female 80.32527 19.67473
## Male 79.72973 20.27027
df$Age_Group <- cut(df$Age,
breaks = c(18,25,35,45,55,65),
labels = c("18-25","26-35","36-45","46-55","56-65"),
right = FALSE)
#CHECK DISTRIBUTION
table(df$Age_Group, df$Attrition)
##
## No Yes
## 18-25 958 272
## 26-35 2009 442
## 36-45 2055 526
## 46-55 2006 502
## 56-65 975 255
prop.table(table(df$Age_Group, df$Attrition), 1) * 100
##
## No Yes
## 18-25 77.88618 22.11382
## 26-35 81.96654 18.03346
## 36-45 79.62030 20.37970
## 46-55 79.98405 20.01595
## 56-65 79.26829 20.73171
ggplot(df, aes(x = Age_Group, fill = Attrition)) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("Yes" = "#5D478B",
"No" = "mediumpurple1")) +
labs(title = "Attrition by Age Group",
x = "Age Group",
y = "Proportion")
mean(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11438.04
mean(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11436.39
median(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11323
median(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11417
#Create Income Categories
df$Income_Group <- cut(df$Monthly_Income,
breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# Check Attrition by Income Group
prop.table(table(df$Income_Group, df$Attrition), 1) * 100
##
## No Yes
## Low 79.90909 20.09091
## Medium 80.12121 19.87879
## High 80.05882 19.94118
table(df$Department, df$Attrition)
##
## No Yes
## Finance 1575 415
## HR 1572 381
## IT 1526 390
## Marketing 1720 413
## Sales 1610 398
dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100
ggplot(df %>% filter(Attrition == "Yes"),
aes(x = Department)) +
geom_bar(fill = "#9370DB", width = 0.5) +
labs(title = "Number of Employees Who Left by Department",
x = "Department",
y = "Count of Attrited Employees")
df_summary <- df %>%
group_by(Years_at_Company) %>%
summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))
ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
geom_line(color = "#B23AEE", linewidth = 1) +
labs(title = "Average Job Satisfaction by Years at Company",
x = "Years at Company",
y = "Average Job Satisfaction")
attrition_table <- table(df$Job_Role, df$Attrition)
leaving_counts <- attrition_table[, "Yes"]
percentages <- round(leaving_counts / sum(leaving_counts) * 100, 1)
labels <- paste(names(percentages), percentages, "%")
pie(
leaving_counts,
labels = labels,
col = c("darkorchid4", "#AB82FF", "#9A32CD", "#BA55D3"),
main = "Employees Leaving by Job Role",
border = "white"
)
level_table <- table(df$Job_Level, df$Attrition)
round(prop.table(level_table, margin = 1) * 100, 1)
##
## No Yes
## 1 80.6 19.4
## 2 80.8 19.2
## 3 80.2 19.8
## 4 78.3 21.7
## 5 80.2 19.8
table(df$Overtime, df$Attrition)
##
## No Yes
## No 4089 1014
## Yes 3914 983
:=> The attrition rate is almost the same for employees who do overtime and those who do not. Therefore, overtime does not significantly increase attrition in this dataset.
mean(subset(df, Attrition == "Yes")$Years_at_Company)
## [1] 14.98147
mean(subset(df, Attrition == "No")$Years_at_Company)
## [1] 14.9249
#Create Experience Groups , Count Attrition by Experience Group
df$Experience_Group <- cut(df$Years_at_Company,
breaks = c(0,3,7,15,40),
labels = c("0-3","4-7","8-15","15+"),
right = TRUE)
table(df$Experience_Group, df$Attrition)
##
## No Yes
## 0-3 835 200
## 4-7 1116 286
## 8-15 2267 551
## 15+ 3785 960
prop.table(table(df$Experience_Group, df$Attrition), 1) * 100
##
## No Yes
## 0-3 80.67633 19.32367
## 4-7 79.60057 20.39943
## 8-15 80.44713 19.55287
## 15+ 79.76818 20.23182
ggplot(df %>% filter(Attrition == "Yes"),
aes(y = Experience_Group)) +
geom_bar(fill = "#68228B") +
labs(title = "Attrited Employees by Experience Group",
x = "Number of Employees",
y = "Experience Group")
tapply(df$Years_in_Current_Role, df$Attrition, mean)
## No Yes
## 7.440960 7.492739
tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)
## No Yes
## 4.474697 4.460691
df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
"No Recent Promotion",
"Recently Promoted")
ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
geom_bar(position = "fill", width = 0.5) +
coord_flip() +
scale_fill_manual(values = c("No" = "#5D478B",
"Yes" = "mediumpurple1")) +
labs(title = "Attrition by Promotion Status",
x = "Promotion Status",
y = "Proportion") +
theme_minimal()
prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100
##
## No Yes
## No Recent Promotion 79.97979 20.02021
## Recently Promoted 80.14323 19.85677
df$Attrition <- as.factor(df$Attrition)
set.seed(123)
train_index <- createDataPartition(df$Attrition, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
dim(train_data)
## [1] 7001 30
dim(test_data)
## [1] 2999 30
model <- glm(Attrition ~ .,
data = train_data,
family = "binomial",
weights = ifelse(train_data$Attrition == "Yes", 3, 1))
summary(model)
##
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = train_data,
## weights = ifelse(train_data$Attrition == "Yes", 3, 1))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.094e+00 3.002e-01 -3.645 0.000267 ***
## Employee_ID -5.307e-06 7.149e-06 -0.742 0.457839
## Age 1.826e-02 7.969e-03 2.291 0.021959 *
## GenderMale 4.095e-02 4.114e-02 0.995 0.319658
## Marital_StatusMarried -5.033e-02 5.042e-02 -0.998 0.318207
## Marital_StatusSingle 2.324e-02 5.023e-02 0.463 0.643542
## DepartmentHR -4.511e-02 6.569e-02 -0.687 0.492290
## DepartmentIT -9.450e-03 6.550e-02 -0.144 0.885289
## DepartmentMarketing -2.321e-03 6.394e-02 -0.036 0.971036
## DepartmentSales -3.467e-02 6.488e-02 -0.534 0.593063
## Job_RoleAssistant 8.591e-02 5.679e-02 1.513 0.130334
## Job_RoleExecutive -1.005e-02 5.745e-02 -0.175 0.861189
## Job_RoleManager -1.807e-01 5.942e-02 -3.041 0.002359 **
## Job_Level 2.224e-02 1.463e-02 1.520 0.128401
## Monthly_Income 1.130e-05 1.245e-05 0.908 0.363835
## Hourly_Rate -1.241e-03 8.265e-04 -1.501 0.133317
## Years_at_Company 6.195e-04 6.780e-03 0.091 0.927197
## Years_in_Current_Role 2.550e-04 5.080e-03 0.050 0.959975
## Years_Since_Last_Promotion -9.009e-03 1.196e-02 -0.753 0.451392
## Work_Life_Balance 4.478e-02 1.855e-02 2.414 0.015772 *
## Job_Satisfaction 2.556e-02 1.459e-02 1.752 0.079713 .
## Performance_Rating 4.509e-02 1.846e-02 2.443 0.014563 *
## Training_Hours_Last_Year -2.666e-04 7.152e-04 -0.373 0.709289
## OvertimeYes 3.449e-02 4.113e-02 0.838 0.401758
## Project_Count -3.232e-03 7.997e-03 -0.404 0.686076
## Average_Hours_Worked_Per_Week 4.284e-03 2.384e-03 1.797 0.072341 .
## Absenteeism 7.458e-04 3.542e-03 0.211 0.833239
## Work_Environment_Satisfaction -9.975e-03 1.843e-02 -0.541 0.588316
## Relationship_with_Manager 1.209e-04 1.857e-02 0.007 0.994805
## Job_Involvement 3.059e-02 1.855e-02 1.650 0.099044 .
## Distance_From_Home 1.832e-03 1.443e-03 1.269 0.204494
## Number_of_Companies_Worked -2.333e-02 1.847e-02 -1.263 0.206678
## Age_Group26-35 -2.675e-01 9.392e-02 -2.848 0.004393 **
## Age_Group36-45 -2.868e-01 1.561e-01 -1.838 0.066110 .
## Age_Group46-55 -5.023e-01 2.308e-01 -2.176 0.029551 *
## Age_Group56-65 -6.565e-01 2.904e-01 -2.260 0.023796 *
## Income_GroupMedium -7.124e-02 8.496e-02 -0.839 0.401741
## Income_GroupHigh -1.415e-01 1.500e-01 -0.944 0.345330
## Experience_Group4-7 -8.186e-02 8.660e-02 -0.945 0.344506
## Experience_Group8-15 -4.811e-02 9.713e-02 -0.495 0.620368
## Experience_Group15+ -7.160e-02 1.563e-01 -0.458 0.646973
## Promotion_GroupRecently Promoted -4.913e-02 7.465e-02 -0.658 0.510445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13378 on 7000 degrees of freedom
## Residual deviance: 13305 on 6959 degrees of freedom
## AIC: 13389
##
## Number of Fisher Scoring iterations: 4
prob_predictions <- predict(model, test_data, type = "response")
predicted_class <- ifelse(prob_predictions > 0.5, "Yes", "No")
predicted_class <- as.factor(predicted_class)
head(prob_predictions)
## 3 4 5 10 11 14
## 0.4211484 0.3385969 0.4477636 0.4043930 0.4389556 0.5136492
#head(predicted_class)
test_data$Attrition <- factor(test_data$Attrition,
levels = c("Yes", "No"))
predicted_class <- factor(predicted_class,
levels = c("Yes", "No"))
confusionMatrix(predicted_class, test_data$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 23 88
## No 576 2312
##
## Accuracy : 0.7786
## 95% CI : (0.7633, 0.7933)
## No Information Rate : 0.8003
## P-Value [Acc > NIR] : 0.9984
##
## Kappa : 0.0025
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.038397
## Specificity : 0.963333
## Pos Pred Value : 0.207207
## Neg Pred Value : 0.800554
## Prevalence : 0.199733
## Detection Rate : 0.007669
## Detection Prevalence : 0.037012
## Balanced Accuracy : 0.500865
##
## 'Positive' Class : Yes
##
tree_model <- rpart(Attrition ~ ., data = train_data, method = "class")
tree_pred <- predict(tree_model, test_data, type = "class")
confusionMatrix(tree_pred, test_data$Attrition)
## Warning in confusionMatrix.default(tree_pred, test_data$Attrition): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 0 0
## No 599 2400
##
## Accuracy : 0.8003
## 95% CI : (0.7855, 0.8144)
## No Information Rate : 0.8003
## P-Value [Acc > NIR] : 0.5109
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8003
## Prevalence : 0.1997
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
model_names <- c("Logistic Regression", "Decision Tree")
accuracy <- c(0.7786, 0.8003)
comparison <- data.frame(model_names, accuracy)
library(ggplot2)
ggplot(comparison, aes(x = model_names, y = accuracy)) +
geom_bar(stat = "identity", fill ="purple3", width = 0.4) +
labs(title = "Model Accuracy Comparison",
x = "Models",
y = "Accuracy")