# i) LOADING ALL NECESSARY LIBRARIES
library(dplyr)
library(readr)
library(ggplot2)
# ii) LOADING THE DATASET
df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)
# iii) CHECKING FOR COLUMNS
cat(colnames(df), sep = "\n")
## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition
# iv) UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET
#DISPLAYING FEW RECORDS OF DATA
knitr::kable(head(df), caption = "First 6 Rows of Dataset")
| Employee_ID | Age | Gender | Marital_Status | Department | Job_Role | Job_Level | Monthly_Income | Hourly_Rate | Years_at_Company | Years_in_Current_Role | Years_Since_Last_Promotion | Work_Life_Balance | Job_Satisfaction | Performance_Rating | Training_Hours_Last_Year | Overtime | Project_Count | Average_Hours_Worked_Per_Week | Absenteeism | Work_Environment_Satisfaction | Relationship_with_Manager | Job_Involvement | Distance_From_Home | Number_of_Companies_Worked | Attrition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 58 | Male | Single | Finance | Manager | 5 | 7332 | 81 | 24 | 12 | 3 | 1 | 3 | 2 | 74 | No | 9 | 48 | 16 | 4 | 1 | 1 | 49 | 3 | No |
| 2 | 48 | Female | Divorced | HR | Assistant | 4 | 6069 | 55 | 18 | 7 | 5 | 1 | 2 | 2 | 24 | Yes | 9 | 57 | 10 | 4 | 1 | 1 | 25 | 1 | No |
| 3 | 34 | Female | Married | Marketing | Manager | 4 | 11485 | 65 | 6 | 4 | 3 | 4 | 5 | 1 | 63 | Yes | 3 | 55 | 1 | 1 | 4 | 3 | 21 | 1 | Yes |
| 4 | 27 | Female | Divorced | HR | Manager | 4 | 18707 | 28 | 12 | 9 | 1 | 1 | 1 | 2 | 4 | No | 9 | 53 | 2 | 3 | 4 | 1 | 46 | 2 | No |
| 5 | 40 | Male | Married | HR | Analyst | 1 | 16398 | 92 | 3 | 9 | 1 | 3 | 4 | 3 | 62 | No | 1 | 54 | 11 | 1 | 1 | 1 | 43 | 4 | No |
| 6 | 58 | Male | Married | Finance | Executive | 3 | 7305 | 63 | 25 | 2 | 3 | 4 | 5 | 3 | 84 | No | 1 | 42 | 11 | 2 | 3 | 4 | 4 | 3 | Yes |
# v) STRUCTURE OF THE DATASET
str(df)
## 'data.frame': 10032 obs. of 26 variables:
## $ Employee_ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num 58 48 34 27 40 58 38 42 30 30 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Marital_Status : chr "Single" "Divorced" "Married" "Divorced" ...
## $ Department : chr "Finance" "HR" "Marketing" "HR" ...
## $ Job_Role : chr "Manager" "Assistant" "Manager" "Manager" ...
## $ Job_Level : num 5 4 4 4 1 3 5 1 4 4 ...
## $ Monthly_Income : num 7332 6069 11485 18707 16398 ...
## $ Hourly_Rate : num 81 55 65 28 92 63 63 41 95 53 ...
## $ Years_at_Company : num 24 18 6 12 3 25 3 16 17 16 ...
## $ Years_in_Current_Role : num 12 7 4 9 9 2 3 8 10 14 ...
## $ Years_Since_Last_Promotion : num 3 5 3 1 1 3 4 0 2 4 ...
## $ Work_Life_Balance : num 1 1 4 1 3 4 4 2 2 1 ...
## $ Job_Satisfaction : num 3 2 5 1 4 5 3 4 3 4 ...
## $ Performance_Rating : num 2 2 1 2 3 3 4 3 3 3 ...
## $ Training_Hours_Last_Year : num 74 24 63 4 62 84 98 75 51 45 ...
## $ Overtime : chr "No" "Yes" "Yes" "No" ...
## $ Project_Count : num 9 9 3 9 1 1 1 3 8 6 ...
## $ Average_Hours_Worked_Per_Week: num 48 57 55 53 54 42 58 45 42 41 ...
## $ Absenteeism : num 16 10 1 2 11 11 16 9 4 12 ...
## $ Work_Environment_Satisfaction: num 4 4 1 3 1 2 3 2 3 4 ...
## $ Relationship_with_Manager : num 1 1 4 4 1 3 3 1 3 2 ...
## $ Job_Involvement : num 1 1 3 1 1 4 4 4 1 1 ...
## $ Distance_From_Home : num 49 25 21 46 43 4 33 3 39 1 ...
## $ Number_of_Companies_Worked : num 3 1 1 2 4 3 1 2 4 4 ...
## $ Attrition : chr "No" "No" "Yes" "No" ...
#now moving further with the data preprocessing ; checking the null, duplicate records and if existing any , removing them to remove the dedundancy.
# i) CHECKING FOR NULL VALUES IF EXIST IN DATASET
#sum(is.na(df))
colSums(is.na(df))
## Employee_ID Age
## 0 0
## Gender Marital_Status
## 0 0
## Department Job_Role
## 0 0
## Job_Level Monthly_Income
## 0 0
## Hourly_Rate Years_at_Company
## 0 0
## Years_in_Current_Role Years_Since_Last_Promotion
## 0 0
## Work_Life_Balance Job_Satisfaction
## 0 0
## Performance_Rating Training_Hours_Last_Year
## 0 0
## Overtime Project_Count
## 0 0
## Average_Hours_Worked_Per_Week Absenteeism
## 0 0
## Work_Environment_Satisfaction Relationship_with_Manager
## 0 0
## Job_Involvement Distance_From_Home
## 0 0
## Number_of_Companies_Worked Attrition
## 0 0
# ii) CHECKING FOR DUPLICACY
sum(duplicated(df))
## [1] 32
# iii) FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS
duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows
## [1] 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192
## [13] 2193 2194 2195 2196 2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028
# iv) REMOVING DUPLICATE RECORDS
df<-df[!duplicated(df), ]
# v) NOW AGAIN CHECHING FOR DUPLICACY
sum(duplicated(df))
## [1] 0
total_employees <- nrow(df)
total_employees
## [1] 10000
:=> After cleaning the dataset , we have 10,000 records in our dataset.
table(df$Attrition)
##
## No Yes
## 8003 1997
attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent
##
## No Yes
## 80.03 19.97
attrition_counts <- as.data.frame(table(df$Attrition))
ggplot(attrition_counts, aes(x = "", y = Freq, fill = Var1)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
geom_text(aes(label = paste0(round(prop.table(Freq)*100,1), "%")),
position = position_stack(vjust = 0.5),
color = "white", size = 5) +
scale_fill_manual(values = c("#AB82FF", "darkorchid4")) +
labs(title = "Overall Attrition Distribution",
fill = "Attrition") +
theme_void()
minority_class <- min(attrition_percent)
if(minority_class < 30){
cat(" The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
print("Attrition Rate is affecting the company")
}
## The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.
table(df$Gender)
##
## Female Male
## 5042 4958
prop.table(table(df$Gender)) * 100
##
## Female Male
## 50.42 49.58
prop.table(table(df$Gender, df$Attrition), 1) * 100
##
## No Yes
## Female 80.32527 19.67473
## Male 79.72973 20.27027
df$Age_Group <- cut(df$Age,
breaks = c(18,25,35,45,55,65),
labels = c("18-25","26-35","36-45","46-55","56-65"),
right = FALSE)
#CHECK DISTRIBUTION
table(df$Age_Group, df$Attrition)
##
## No Yes
## 18-25 958 272
## 26-35 2009 442
## 36-45 2055 526
## 46-55 2006 502
## 56-65 975 255
prop.table(table(df$Age_Group, df$Attrition), 1) * 100
##
## No Yes
## 18-25 77.88618 22.11382
## 26-35 81.96654 18.03346
## 36-45 79.62030 20.37970
## 46-55 79.98405 20.01595
## 56-65 79.26829 20.73171
ggplot(df, aes(x = Age_Group, fill = Attrition)) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("Yes" = "#5D478B",
"No" = "mediumpurple1")) +
labs(title = "Attrition by Age Group",
x = "Age Group",
y = "Proportion")
mean(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11438.04
mean(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11436.39
median(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11323
median(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11417
#Create Income Categories
df$Income_Group <- cut(df$Monthly_Income,
breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# Check Attrition by Income Group
prop.table(table(df$Income_Group, df$Attrition), 1) * 100
##
## No Yes
## Low 79.90909 20.09091
## Medium 80.12121 19.87879
## High 80.05882 19.94118
table(df$Department, df$Attrition)
##
## No Yes
## Finance 1575 415
## HR 1572 381
## IT 1526 390
## Marketing 1720 413
## Sales 1610 398
dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100
ggplot(df %>% filter(Attrition == "Yes"),
aes(x = Department)) +
geom_bar(fill = "#9370DB", width = 0.5) +
labs(title = "Number of Employees Who Left by Department",
x = "Department",
y = "Count of Attrited Employees")
df_summary <- df %>%
group_by(Years_at_Company) %>%
summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))
ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
geom_line(color = "#B23AEE", linewidth = 1) +
labs(title = "Average Job Satisfaction by Years at Company",
x = "Years at Company",
y = "Average Job Satisfaction")
table(df$Job_Role, df$Attrition)
##
## No Yes
## Analyst 2052 520
## Assistant 1994 544
## Executive 1979 497
## Manager 1978 436
table(df$Job_Level, df$Attrition)
##
## No Yes
## 1 1597 385
## 2 1674 398
## 3 1588 391
## 4 1558 432
## 5 1586 391
table(df$Overtime, df$Attrition)
##
## No Yes
## No 4089 1014
## Yes 3914 983
:=> The attrition rate is almost the same for employees who do overtime and those who do not. Therefore, overtime does not significantly increase attrition in this dataset.
mean(subset(df, Attrition == "Yes")$Years_at_Company)
## [1] 14.98147
mean(subset(df, Attrition == "No")$Years_at_Company)
## [1] 14.9249
#Create Experience Groups , Count Attrition by Experience Group
df$Experience_Group <- cut(df$Years_at_Company,
breaks = c(0,3,7,15,40),
labels = c("0-3","4-7","8-15","15+"),
right = TRUE)
table(df$Experience_Group, df$Attrition)
##
## No Yes
## 0-3 835 200
## 4-7 1116 286
## 8-15 2267 551
## 15+ 3785 960
prop.table(table(df$Experience_Group, df$Attrition), 1) * 100
##
## No Yes
## 0-3 80.67633 19.32367
## 4-7 79.60057 20.39943
## 8-15 80.44713 19.55287
## 15+ 79.76818 20.23182
ggplot(df %>% filter(Attrition == "Yes"),
aes(y = Experience_Group)) +
geom_bar(fill = "#68228B") +
labs(title = "Attrited Employees by Experience Group",
x = "Number of Employees",
y = "Experience Group")
tapply(df$Years_in_Current_Role, df$Attrition, mean)
## No Yes
## 7.440960 7.492739
tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)
## No Yes
## 4.474697 4.460691
df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
"No Recent Promotion",
"Recently Promoted")
ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
geom_bar(position = "fill", width = 0.5) +
coord_flip() +
scale_fill_manual(values = c("No" = "#5D478B",
"Yes" = "mediumpurple1")) +
labs(title = "Attrition by Promotion Status",
x = "Promotion Status",
y = "Proportion") +
theme_minimal()
This indicates that lack of recent promotion does not strongly impact attrition in this dataset.
prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100
##
## No Yes
## No Recent Promotion 79.97979 20.02021
## Recently Promoted 80.14323 19.85677
df$Attrition_Flag <- ifelse(df$Attrition == "Yes", 1, 0)
df$Gender <- as.factor(df$Gender)
df$Marital_Status <- as.factor(df$Marital_Status)
df$Department <- as.factor(df$Department)
df$Job_Role <- as.factor(df$Job_Role)
df$Overtime <- as.factor(df$Overtime)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
numeric_data <- df[, c("Age", "Monthly_Income", "Job_Satisfaction",
"Work_Life_Balance", "Years_at_Company",
"Job_Level", "Distance_From_Home")]
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix,
method = "color",
type = "upper",
col = colorRampPalette(c("darkorchid", "white", "darkorchid4"))(200),
addCoef.col = "black",
tl.cex = 0.8)
set.seed(123)
train_index <- sample(1:nrow(df), 0.7 * nrow(df))
train <- df[train_index, ]
test <- df[-train_index, ]
model <- glm(Attrition_Flag ~ Age + Monthly_Income + Job_Satisfaction +
Work_Life_Balance + Years_at_Company +
Overtime + Job_Level + Distance_From_Home,
data = train,
family = "binomial")
probabilities <- predict(model, test, type = "response")
predictions <- ifelse(probabilities > 0.5, 1, 0)
table(Predicted = predictions, Actual = test$Attrition_Flag)
## Actual
## Predicted 0 1
## 0 2415 585
mean(predictions == test$Attrition_Flag)
## [1] 0.805
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf_model <- randomForest(as.factor(Attrition_Flag) ~ Age + Monthly_Income +
Job_Satisfaction + Work_Life_Balance +
Years_at_Company + Overtime +
Job_Level + Distance_From_Home,
data = train,
ntree = 100)
varImpPlot(rf_model)
rf_pred <- predict(rf_model, test)
table(Predicted = rf_pred, Actual = test$Attrition_Flag)
## Actual
## Predicted 0 1
## 0 2414 581
## 1 1 4
mean(rf_pred == test$Attrition_Flag)
## [1] 0.806