#LOADING ALL NECESSARY LIBRARIES
#library(dplyr) #Loads the dplyr package, which contains useful libraries like dplyr, ggplot2 for data manipulation and visualization.
#Loads the readr package, which is used to read data files (like CSV files) into R quickly and efficiently.
#Loads the ggplot package for data visualizations.
#LOADING THE DATASET
df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)
:=> Reading the CSV file and storing the dataset in a variable called df.Converting the dataset into a data-frame format for easier analysis in R.
#CHECKING FOR COLUMNS
cat(colnames(df), sep = "\n")
## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition
:=> Checking for all the column names of the dataset.
#UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET
#DISPLAYING FEW RECORDS OF DATA
knitr::kable(head(df), caption = "First 6 Rows of Dataset")
| Employee_ID | Age | Gender | Marital_Status | Department | Job_Role | Job_Level | Monthly_Income | Hourly_Rate | Years_at_Company | Years_in_Current_Role | Years_Since_Last_Promotion | Work_Life_Balance | Job_Satisfaction | Performance_Rating | Training_Hours_Last_Year | Overtime | Project_Count | Average_Hours_Worked_Per_Week | Absenteeism | Work_Environment_Satisfaction | Relationship_with_Manager | Job_Involvement | Distance_From_Home | Number_of_Companies_Worked | Attrition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 58 | Male | Single | Finance | Manager | 5 | 7332 | 81 | 24 | 12 | 3 | 1 | 3 | 2 | 74 | No | 9 | 48 | 16 | 4 | 1 | 1 | 49 | 3 | No |
| 2 | 48 | Female | Divorced | HR | Assistant | 4 | 6069 | 55 | 18 | 7 | 5 | 1 | 2 | 2 | 24 | Yes | 9 | 57 | 10 | 4 | 1 | 1 | 25 | 1 | No |
| 3 | 34 | Female | Married | Marketing | Manager | 4 | 11485 | 65 | 6 | 4 | 3 | 4 | 5 | 1 | 63 | Yes | 3 | 55 | 1 | 1 | 4 | 3 | 21 | 1 | Yes |
| 4 | 27 | Female | Divorced | HR | Manager | 4 | 18707 | 28 | 12 | 9 | 1 | 1 | 1 | 2 | 4 | No | 9 | 53 | 2 | 3 | 4 | 1 | 46 | 2 | No |
| 5 | 40 | Male | Married | HR | Analyst | 1 | 16398 | 92 | 3 | 9 | 1 | 3 | 4 | 3 | 62 | No | 1 | 54 | 11 | 1 | 1 | 1 | 43 | 4 | No |
| 6 | 58 | Male | Married | Finance | Executive | 3 | 7305 | 63 | 25 | 2 | 3 | 4 | 5 | 3 | 84 | No | 1 | 42 | 11 | 2 | 3 | 4 | 4 | 3 | Yes |
:=> just displaying a snippet of the dataset to understand the data well.
#STRUCTURE OF THE DATASET
str(df)
## 'data.frame': 10032 obs. of 26 variables:
## $ Employee_ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num 58 48 34 27 40 58 38 42 30 30 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Marital_Status : chr "Single" "Divorced" "Married" "Divorced" ...
## $ Department : chr "Finance" "HR" "Marketing" "HR" ...
## $ Job_Role : chr "Manager" "Assistant" "Manager" "Manager" ...
## $ Job_Level : num 5 4 4 4 1 3 5 1 4 4 ...
## $ Monthly_Income : num 7332 6069 11485 18707 16398 ...
## $ Hourly_Rate : num 81 55 65 28 92 63 63 41 95 53 ...
## $ Years_at_Company : num 24 18 6 12 3 25 3 16 17 16 ...
## $ Years_in_Current_Role : num 12 7 4 9 9 2 3 8 10 14 ...
## $ Years_Since_Last_Promotion : num 3 5 3 1 1 3 4 0 2 4 ...
## $ Work_Life_Balance : num 1 1 4 1 3 4 4 2 2 1 ...
## $ Job_Satisfaction : num 3 2 5 1 4 5 3 4 3 4 ...
## $ Performance_Rating : num 2 2 1 2 3 3 4 3 3 3 ...
## $ Training_Hours_Last_Year : num 74 24 63 4 62 84 98 75 51 45 ...
## $ Overtime : chr "No" "Yes" "Yes" "No" ...
## $ Project_Count : num 9 9 3 9 1 1 1 3 8 6 ...
## $ Average_Hours_Worked_Per_Week: num 48 57 55 53 54 42 58 45 42 41 ...
## $ Absenteeism : num 16 10 1 2 11 11 16 9 4 12 ...
## $ Work_Environment_Satisfaction: num 4 4 1 3 1 2 3 2 3 4 ...
## $ Relationship_with_Manager : num 1 1 4 4 1 3 3 1 3 2 ...
## $ Job_Involvement : num 1 1 3 1 1 4 4 4 1 1 ...
## $ Distance_From_Home : num 49 25 21 46 43 4 33 3 39 1 ...
## $ Number_of_Companies_Worked : num 3 1 1 2 4 3 1 2 4 4 ...
## $ Attrition : chr "No" "No" "Yes" "No" ...
:=> After checking the structure of the dataset using str(df), I found that the dataset contains 10,032 records and 26 variables. Out of these, 6 columns are character (chr) type and 20 columns are numeric (num) type.
#DATA PREPROCESSING
#CHECKING FOR NULL VALUES IF EXIST IN DATASET
#sum(is.na(df))
colSums(is.na(df))
## Employee_ID Age
## 0 0
## Gender Marital_Status
## 0 0
## Department Job_Role
## 0 0
## Job_Level Monthly_Income
## 0 0
## Hourly_Rate Years_at_Company
## 0 0
## Years_in_Current_Role Years_Since_Last_Promotion
## 0 0
## Work_Life_Balance Job_Satisfaction
## 0 0
## Performance_Rating Training_Hours_Last_Year
## 0 0
## Overtime Project_Count
## 0 0
## Average_Hours_Worked_Per_Week Absenteeism
## 0 0
## Work_Environment_Satisfaction Relationship_with_Manager
## 0 0
## Job_Involvement Distance_From_Home
## 0 0
## Number_of_Companies_Worked Attrition
## 0 0
:=> checking for nulls if exist there in the dataset
#CHECKING FOR DUPLICACY
sum(duplicated(df))
## [1] 32
:=> finding that there are 32 duplicate records.
#FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS
duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows
## [1] 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192
## [13] 2193 2194 2195 2196 2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028
:=> checking for the indexes of the duplicate records found.
#REMOVING DUPLICATE RECORDS
df<-df[!duplicated(df), ]
:=> removing all duplicate records
#NOW AGAIN CHECHING FOR DUPLICACY
sum(duplicated(df))
## [1] 0
:=> finding that now the dataset is containing all unique records without any duplicate record.
#EXPLORATORY DATA ANALYSIS
#SECTION 1: Understanding the Target Variable
#1. What is the total number of employees?
total_employees <- nrow(df)
total_employees
## [1] 10000
:=> after cleaning the dataset , we have 10,000 records in our dataset.
#2. What is the total number of attrited employee and overall attrition rate in percentage?
table(df$Attrition)
##
## No Yes
## 8003 1997
attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent
##
## No Yes
## 80.03 19.97
attrition_counts <- as.data.frame(table(df$Attrition))
colnames(attrition_counts) <- c("Attrition", "Count")
ggplot(attrition_counts, aes(x = "", y = Count, fill = Attrition)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
scale_fill_manual(values = c("No" = "darkorchid1",
"Yes" = "darkorchid4")) +
labs(title = "Overall Attrition Distribution") +
theme_void()
:=> After calculating the frequency of the Attrition column, I found
that:
Total employees who left (Attrition = Yes) = 1,997 Total employees who stayed (Attrition = No) = 8,003
The overall attrition rate is 19.97%, while 80.03% employees stayed in the company.
#3. Is the dataset imbalanced?
If:
Attrition Yes < 30% Attrition No > 70%
Then dataset is imbalanced.
minority_class <- min(attrition_percent)
if(minority_class < 30){
cat(":=> The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
print("Attrition Rate is affecting the company")
}
## :=> The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.
#SECTION 2: Gender-Wise Analysis
#1. What is the gender distribution?
table(df$Gender)
##
## Female Male
## 5042 4958
prop.table(table(df$Gender)) * 100
##
## Female Male
## 50.42 49.58
:=> Total Number of Males we have 4958 with Female 5042
#2.Is attrition rate different across gender?
prop.table(table(df$Gender, df$Attrition), 1) * 100
##
## No Yes
## Female 80.32527 19.67473
## Male 79.72973 20.27027
:=> After calculating the attrition rate by gender, I found that:
Female employees → 19.67% attrition Male employees → 20.27% attrition
The attrition rate is almost similar for both genders, with males having a slightly higher percentage.
#3. Which age group has highest attrition?
#CREATING AGE GROUPS
df$Age_Group <- cut(df$Age,
breaks = c(18,25,35,45,55,65),
labels = c("18-25","26-35","36-45","46-55","56-65"),
right = FALSE)
#CHECK DISTRIBUTION
table(df$Age_Group, df$Attrition)
##
## No Yes
## 18-25 958 272
## 26-35 2009 442
## 36-45 2055 526
## 46-55 2006 502
## 56-65 975 255
prop.table(table(df$Age_Group, df$Attrition), 1) * 100
##
## No Yes
## 18-25 77.88618 22.11382
## 26-35 81.96654 18.03346
## 36-45 79.62030 20.37970
## 46-55 79.98405 20.01595
## 56-65 79.26829 20.73171
:=> The “18–25” age group has the highest attrition rate with 272 employees leaving from this group.
ggplot(df, aes(x = Age_Group, fill = Attrition)) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("Yes" = "#5D478B",
"No" = "mediumpurple1")) +
labs(title = "Attrition by Age Group",
x = "Age Group",
y = "Proportion")
#SECTION 3: Financial Analysis
#1. Does monthly income affect attrition?
mean(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11438.04
mean(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11436.39
median(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11323
median(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11417
:=> the mean and median is almost same so this implies monthly income does not appear to significantly affect attrition in this dataset
#2. Do Lower Income Employees Leave More?
#Create Income Categories
df$Income_Group <- cut(df$Monthly_Income,
breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
#Check Attrition by Income Group
prop.table(table(df$Income_Group, df$Attrition), 1) * 100
##
## No Yes
## Low 79.90909 20.09091
## Medium 80.12121 19.87879
## High 80.05882 19.94118
:=> After dividing employees into Low, Medium, and High income groups and calculating attrition percentage:
Low income → 20.09% attrition Medium income → 19.88% attrition High income → 19.94% attrition
The attrition rate is almost the same across all income groups.
Therefore, lower income employees do not leave more than others, and income level does not seem to significantly impact attrition in this dataset.
#SECTION 4: Department and Growth Analysis
#1. Which department has highest attrition rate?
table(df$Department, df$Attrition)
##
## No Yes
## Finance 1575 415
## HR 1572 381
## IT 1526 390
## Marketing 1720 413
## Sales 1610 398
dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100
:=> The “Finance department” has the highest attrition rate with 415 employees, followed closely by IT with 390.
ggplot(df %>% filter(Attrition == "Yes"),
aes(x = Department)) +
geom_bar(fill = "#9370DB", width = 0.5) +
labs(title = "Number of Employees Who Left by Department",
x = "Department",
y = "Count of Attrited Employees")
#2. Does Employee Tenure Influence Satisfaction (and Potential Attrition
Risk)?
df_summary <- df %>%
group_by(Years_at_Company) %>%
summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))
ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
geom_line(color = "#B23AEE", linewidth = 1) +
labs(title = "Average Job Satisfaction by Years at Company",
x = "Years at Company",
y = "Average Job Satisfaction")
:=> The average job satisfaction is almost same across different
years at the company, with only small ups and downs. There is no major
increasing or decreasing trend, which means years at company does not
strongly affect satisfaction or attrition.
#3.Which job role has highest attrition?
table(df$Job_Role, df$Attrition)
##
## No Yes
## Analyst 2052 520
## Assistant 1994 544
## Executive 1979 497
## Manager 1978 436
prop.table(table(df$Job_Role, df$Attrition), 1) * 100
##
## No Yes
## Analyst 79.78227 20.21773
## Assistant 78.56580 21.43420
## Executive 79.92730 20.07270
## Manager 81.93869 18.06131
:=> The Assistant role has the highest attrition rate (≈21.43%).
#4. Does job level affect attrition?
table(df$Job_Level, df$Attrition)
##
## No Yes
## 1 1597 385
## 2 1674 398
## 3 1588 391
## 4 1558 432
## 5 1586 391
:=> Job Level 4 has the highest attrition rate (≈21.71%).
#5. Does overtime increase attrition?
table(df$Overtime, df$Attrition)
##
## No Yes
## No 4089 1014
## Yes 3914 983
:=> The attrition rate is almost the same for employees who do overtime and those who do not.
Therefore, overtime does not significantly increase attrition in this dataset.
#6. Does Years at Company Affect Attrition?
mean(subset(df, Attrition == "Yes")$Years_at_Company)
## [1] 14.98147
mean(subset(df, Attrition == "No")$Years_at_Company)
## [1] 14.9249
:=> The average years at company is almost the same for both groups.
Therefore, years at company does not seem to significantly affect attrition in this dataset.
#7. Are Employees Leaving Early (0–3 Years)?
#Create Experience Groups
df$Experience_Group <- cut(df$Years_at_Company,
breaks = c(0,3,7,15,40),
labels = c("0-3","4-7","8-15","15+"),
right = TRUE)
#Count Attrition by Experience Group
table(df$Experience_Group, df$Attrition)
##
## No Yes
## 0-3 835 200
## 4-7 1116 286
## 8-15 2267 551
## 15+ 3785 960
:=> BY looking at the numbers the attrition rate for all is almost same so in conclusion, the employers are not leaving early.
ggplot(df %>% filter(Attrition == "Yes"),
aes(y = Experience_Group)) +
geom_bar(fill = "#68228B") +
labs(title = "Attrited Employees by Experience Group",
x = "Number of Employees",
y = "Experience Group")
#SECTION 5 : Career Progression & Growth Analysis
#1. Does years in current role affect attrition?
tapply(df$Years_in_Current_Role, df$Attrition, mean)
## No Yes
## 7.440960 7.492739
:=> The average years in current role is almost the same for employees who stayed (7.44 years) and those who left (7.49 years).
#2. Does years since last promotion influence attrition?
tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)
## No Yes
## 4.474697 4.460691
#3. Are employees without recent promotion more likely to leave?
df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
"No Recent Promotion",
"Recently Promoted")
ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
geom_bar(position = "fill", width = 0.5) +
coord_flip() +
scale_fill_manual(values = c("No" = "#5D478B",
"Yes" = "mediumpurple1")) +
labs(title = "Attrition by Promotion Status",
x = "Promotion Status",
y = "Proportion") +
theme_minimal()
prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100
##
## No Yes
## No Recent Promotion 79.97979 20.02021
## Recently Promoted 80.14323 19.85677
:=> Employees without recent promotion have an attrition rate of 20.02%, while recently promoted employees have an attrition rate of 19.86%. Since the difference is very minimal, promotion status does not significantly impact employee attrition in this dataset.
#SECTION 6 : Predictive Modeling
#1. Converting Target Variable to Numeric
df$Attrition_Flag <- ifelse(df$Attrition == "Yes", 1, 0)
#2. Convert Categorical Variables to Factor
df$Gender <- as.factor(df$Gender)
df$Marital_Status <- as.factor(df$Marital_Status)
df$Department <- as.factor(df$Department)
df$Job_Role <- as.factor(df$Job_Role)
df$Overtime <- as.factor(df$Overtime)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
numeric_data <- df[, c("Age", "Monthly_Income", "Job_Satisfaction",
"Work_Life_Balance", "Years_at_Company",
"Job_Level", "Distance_From_Home")]
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix,
method = "color",
type = "upper",
col = colorRampPalette(c("darkorchid", "white", "darkorchid4"))(200),
addCoef.col = "black",
tl.cex = 0.8)
#3. Train-Test Split (70–30)
set.seed(123)
train_index <- sample(1:nrow(df), 0.7 * nrow(df))
train <- df[train_index, ]
test <- df[-train_index, ]
#4. Build Logistic Regression Model
model <- glm(Attrition_Flag ~ Age + Monthly_Income + Job_Satisfaction +
Work_Life_Balance + Years_at_Company +
Overtime + Job_Level + Distance_From_Home,
data = train,
family = "binomial")
#5. Predict on Test Data
probabilities <- predict(model, test, type = "response")
predictions <- ifelse(probabilities > 0.5, 1, 0)
#5. Confusion Matrix
table(Predicted = predictions, Actual = test$Attrition_Flag)
## Actual
## Predicted 0 1
## 0 2415 585
#6. Calculate Accuracy
mean(predictions == test$Attrition_Flag)
## [1] 0.805
#SECTION 7 : TRYING RANDOM FOREST
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf_model <- randomForest(as.factor(Attrition_Flag) ~ Age + Monthly_Income +
Job_Satisfaction + Work_Life_Balance +
Years_at_Company + Overtime +
Job_Level + Distance_From_Home,
data = train,
ntree = 100)
varImpPlot(rf_model)
rf_pred <- predict(rf_model, test)
table(Predicted = rf_pred, Actual = test$Attrition_Flag)
## Actual
## Predicted 0 1
## 0 2414 581
## 1 1 4
mean(rf_pred == test$Attrition_Flag)
## [1] 0.806