SECTION - 1

UNDERSTANDING THE DATA

# i) LOADING ALL NECESSARY LIBRARIES
library(dplyr)   
library(readr)   
library(ggplot2)

library(dplyr) : Loads the dplyr package, which contains useful libraries like dplyr, ggplot2 for data manipulation and visualization.

library(readr) : Loads the readr package, which is used to read data files (like CSV files) into R quickly and efficiently.

library(ggplot2) : Loads the ggplot package for data visualizations.

# ii) LOADING THE DATASET
df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)

Reading the CSV file and storing the dataset in a variable called df. Converting the dataset into a data-frame format for easier analysis in R.

# iii) CHECKING FOR COLUMNS

cat(colnames(df), sep = "\n")
## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition

Checking for all the column names of the dataset.

# iv) UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET

#DISPLAYING FEW RECORDS OF DATA

knitr::kable(head(df), caption = "First 6 Rows of Dataset")
First 6 Rows of Dataset
Employee_ID Age Gender Marital_Status Department Job_Role Job_Level Monthly_Income Hourly_Rate Years_at_Company Years_in_Current_Role Years_Since_Last_Promotion Work_Life_Balance Job_Satisfaction Performance_Rating Training_Hours_Last_Year Overtime Project_Count Average_Hours_Worked_Per_Week Absenteeism Work_Environment_Satisfaction Relationship_with_Manager Job_Involvement Distance_From_Home Number_of_Companies_Worked Attrition
1 58 Male Single Finance Manager 5 7332 81 24 12 3 1 3 2 74 No 9 48 16 4 1 1 49 3 No
2 48 Female Divorced HR Assistant 4 6069 55 18 7 5 1 2 2 24 Yes 9 57 10 4 1 1 25 1 No
3 34 Female Married Marketing Manager 4 11485 65 6 4 3 4 5 1 63 Yes 3 55 1 1 4 3 21 1 Yes
4 27 Female Divorced HR Manager 4 18707 28 12 9 1 1 1 2 4 No 9 53 2 3 4 1 46 2 No
5 40 Male Married HR Analyst 1 16398 92 3 9 1 3 4 3 62 No 1 54 11 1 1 1 43 4 No
6 58 Male Married Finance Executive 3 7305 63 25 2 3 4 5 3 84 No 1 42 11 2 3 4 4 3 Yes

Displaying a snippet of the dataset to understand the data well.

# v) STRUCTURE OF THE DATASET 
str(df)
## 'data.frame':    10032 obs. of  26 variables:
##  $ Employee_ID                  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                          : num  58 48 34 27 40 58 38 42 30 30 ...
##  $ Gender                       : chr  "Male" "Female" "Female" "Female" ...
##  $ Marital_Status               : chr  "Single" "Divorced" "Married" "Divorced" ...
##  $ Department                   : chr  "Finance" "HR" "Marketing" "HR" ...
##  $ Job_Role                     : chr  "Manager" "Assistant" "Manager" "Manager" ...
##  $ Job_Level                    : num  5 4 4 4 1 3 5 1 4 4 ...
##  $ Monthly_Income               : num  7332 6069 11485 18707 16398 ...
##  $ Hourly_Rate                  : num  81 55 65 28 92 63 63 41 95 53 ...
##  $ Years_at_Company             : num  24 18 6 12 3 25 3 16 17 16 ...
##  $ Years_in_Current_Role        : num  12 7 4 9 9 2 3 8 10 14 ...
##  $ Years_Since_Last_Promotion   : num  3 5 3 1 1 3 4 0 2 4 ...
##  $ Work_Life_Balance            : num  1 1 4 1 3 4 4 2 2 1 ...
##  $ Job_Satisfaction             : num  3 2 5 1 4 5 3 4 3 4 ...
##  $ Performance_Rating           : num  2 2 1 2 3 3 4 3 3 3 ...
##  $ Training_Hours_Last_Year     : num  74 24 63 4 62 84 98 75 51 45 ...
##  $ Overtime                     : chr  "No" "Yes" "Yes" "No" ...
##  $ Project_Count                : num  9 9 3 9 1 1 1 3 8 6 ...
##  $ Average_Hours_Worked_Per_Week: num  48 57 55 53 54 42 58 45 42 41 ...
##  $ Absenteeism                  : num  16 10 1 2 11 11 16 9 4 12 ...
##  $ Work_Environment_Satisfaction: num  4 4 1 3 1 2 3 2 3 4 ...
##  $ Relationship_with_Manager    : num  1 1 4 4 1 3 3 1 3 2 ...
##  $ Job_Involvement              : num  1 1 3 1 1 4 4 4 1 1 ...
##  $ Distance_From_Home           : num  49 25 21 46 43 4 33 3 39 1 ...
##  $ Number_of_Companies_Worked   : num  3 1 1 2 4 3 1 2 4 4 ...
##  $ Attrition                    : chr  "No" "No" "Yes" "No" ...

After checking the structure of the dataset using str(df), I found that the dataset contains 10,032 records and 26 variables. Out of these, 6 columns are character (chr) type and 20 columns are numeric (num) type.

#now moving further with the data preprocessing ; checking the null, duplicate records and if existing any , removing them to remove the dedundancy. 

SECTION - 2

DATA PREPROCESSING

# i) CHECKING FOR NULL VALUES IF EXIST IN DATASET

#sum(is.na(df))
colSums(is.na(df))
##                   Employee_ID                           Age 
##                             0                             0 
##                        Gender                Marital_Status 
##                             0                             0 
##                    Department                      Job_Role 
##                             0                             0 
##                     Job_Level                Monthly_Income 
##                             0                             0 
##                   Hourly_Rate              Years_at_Company 
##                             0                             0 
##         Years_in_Current_Role    Years_Since_Last_Promotion 
##                             0                             0 
##             Work_Life_Balance              Job_Satisfaction 
##                             0                             0 
##            Performance_Rating      Training_Hours_Last_Year 
##                             0                             0 
##                      Overtime                 Project_Count 
##                             0                             0 
## Average_Hours_Worked_Per_Week                   Absenteeism 
##                             0                             0 
## Work_Environment_Satisfaction     Relationship_with_Manager 
##                             0                             0 
##               Job_Involvement            Distance_From_Home 
##                             0                             0 
##    Number_of_Companies_Worked                     Attrition 
##                             0                             0

Checking for nulls if there is any in the dataset.

# ii) CHECKING FOR DUPLICACY
sum(duplicated(df))
## [1] 32

Finding that there are 32 duplicate records.

# iii) FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS

duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows
##  [1]  2181  2182  2183  2184  2185  2186  2187  2188  2189  2190  2191  2192
## [13]  2193  2194  2195  2196  2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028

Checking for the indexes of the duplicate records found.

# iv) REMOVING DUPLICATE RECORDS
df<-df[!duplicated(df), ]

Removing all duplicate records

# v) NOW AGAIN CHECHING FOR DUPLICACY
sum(duplicated(df))
## [1] 0

Now the dataset is containing all the unique records without any duplicate record.

SECTION - 3

EXPLORATORY DATA ANALYSIS(EDA)

i) What is the total number of employees?

total_employees <- nrow(df)
total_employees
## [1] 10000

:=> After cleaning the dataset , we have 10,000 records in our dataset.

ii) What is the total number of attrited employee and overall attrition rate in percentage?

table(df$Attrition)
## 
##   No  Yes 
## 8003 1997
attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent
## 
##    No   Yes 
## 80.03 19.97
attrition_counts <- as.data.frame(table(df$Attrition))

ggplot(attrition_counts, aes(x = "", y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  geom_text(aes(label = paste0(round(prop.table(Freq)*100,1), "%")),
            position = position_stack(vjust = 0.5),
            color = "white", size = 5) +
  scale_fill_manual(values = c("#AB82FF", "darkorchid4")) +
  labs(title = "Overall Attrition Distribution",
       fill = "Attrition") +
  theme_void()

After calculating the frequency of the Attrition column :

Total employees who left (Attrition = Yes) = 1,997

Total employees who stayed (Attrition = No) = 8,003

The overall attrition rate is 19.97%, while 80.03% employees stayed in the company.

iii) Is the dataset imbalanced?

If:

Attrition Yes < 30%

Attrition No > 70%

Then dataset is imbalanced.

minority_class <- min(attrition_percent)
if(minority_class < 30){
  cat(" The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
  print("Attrition Rate is affecting the company")
}
##  The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.

Gender-Wise Analysis

i) What is the gender distribution?

table(df$Gender)
## 
## Female   Male 
##   5042   4958
prop.table(table(df$Gender)) * 100
## 
## Female   Male 
##  50.42  49.58

Total Number of Males we have 4958 with Female 5042

ii) Is attrition rate different across gender?

prop.table(table(df$Gender, df$Attrition), 1) * 100
##         
##                No      Yes
##   Female 80.32527 19.67473
##   Male   79.72973 20.27027

After calculating the attrition rate by gender, I found that:

Female employees → 19.67% attrition

Male employees → 20.27% attrition

The attrition rate is almost similar for both genders, with males having a slightly higher percentage.

iii) Which age group has highest attrition? Creating age group then checking distribution

df$Age_Group <- cut(df$Age,
                      breaks = c(18,25,35,45,55,65),
                      labels = c("18-25","26-35","36-45","46-55","56-65"),
                      right = FALSE)
#CHECK DISTRIBUTION
table(df$Age_Group, df$Attrition)
##        
##           No  Yes
##   18-25  958  272
##   26-35 2009  442
##   36-45 2055  526
##   46-55 2006  502
##   56-65  975  255
prop.table(table(df$Age_Group, df$Attrition), 1) * 100
##        
##               No      Yes
##   18-25 77.88618 22.11382
##   26-35 81.96654 18.03346
##   36-45 79.62030 20.37970
##   46-55 79.98405 20.01595
##   56-65 79.26829 20.73171
ggplot(df, aes(x = Age_Group, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("Yes" = "#5D478B",
                               "No" = "mediumpurple1")) +
  labs(title = "Attrition by Age Group",
       x = "Age Group",
       y = "Proportion")

The “18–25” age group has the highest attrition rate with 272 employees in numbers and approx 22% in percent leaving from this group.

Financial Analysis

i) Does monthly income affect attrition?

mean(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11438.04
mean(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11436.39
median(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11323
median(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11417

the mean and median is almost same so this implies monthly income does not appear to significantly affect attrition in this dataset.

ii) Do Lower Income Employees Leave More?

#Create Income Categories
df$Income_Group <- cut(df$Monthly_Income,
                         breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
                         labels = c("Low", "Medium", "High"),
                         include.lowest = TRUE)
# Check Attrition by Income Group
prop.table(table(df$Income_Group, df$Attrition), 1) * 100
##         
##                No      Yes
##   Low    79.90909 20.09091
##   Medium 80.12121 19.87879
##   High   80.05882 19.94118

After dividing employees into Low, Medium, and High income groups and calculating attrition percentage:

Low income → 20.09% attrition

Medium income → 19.88% attrition

High income → 19.94% attrition

The attrition rate is almost the same across all income groups.

Therefore, lower income employees do not leave more than others, and income level does not seem to significantly impact attrition in this dataset.

Department and Growth Analysis

i) Which department has highest attrition rate?

table(df$Department, df$Attrition)
##            
##               No  Yes
##   Finance   1575  415
##   HR        1572  381
##   IT        1526  390
##   Marketing 1720  413
##   Sales     1610  398
dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100
ggplot(df %>% filter(Attrition == "Yes"),
       aes(x = Department)) +
  geom_bar(fill = "#9370DB", width = 0.5) +
  labs(title = "Number of Employees Who Left by Department",
       x = "Department",
       y = "Count of Attrited Employees") 

The “Finance department” has the highest attrition rate with 415 employees, followed closely by IT with 390.

ii) Does Employee Tenure Influence Satisfaction (and Potential Attrition Risk)?

df_summary <- df %>%
  group_by(Years_at_Company) %>%
  summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))

ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
  geom_line(color = "#B23AEE", linewidth = 1) +
  labs(title = "Average Job Satisfaction by Years at Company",
       x = "Years at Company",
       y = "Average Job Satisfaction")

The average job satisfaction is almost same across different years at the company, with only small ups and downs. There is no major increasing or decreasing trend, which means years at company does not strongly affect satisfaction or attrition.

iii) Which job role has highest attrition?

table(df$Job_Role, df$Attrition)
##            
##               No  Yes
##   Analyst   2052  520
##   Assistant 1994  544
##   Executive 1979  497
##   Manager   1978  436

The Assistant role has the highest attrition rate (≈21.43%).

iv) Does job level affect attrition?

table(df$Job_Level, df$Attrition)
##    
##       No  Yes
##   1 1597  385
##   2 1674  398
##   3 1588  391
##   4 1558  432
##   5 1586  391

Job Level 4 has the highest attrition rate (≈21.71%).Therefore, qe can depict that job level does affect attrition.

v) Does overtime increase attrition?

table(df$Overtime, df$Attrition)
##      
##         No  Yes
##   No  4089 1014
##   Yes 3914  983

:=> The attrition rate is almost the same for employees who do overtime and those who do not. Therefore, overtime does not significantly increase attrition in this dataset.

vi) Does Years at Company Affect Attrition?

mean(subset(df, Attrition == "Yes")$Years_at_Company)
## [1] 14.98147
mean(subset(df, Attrition == "No")$Years_at_Company)
## [1] 14.9249

The average years at company is almost the same for both groups. Therefore, years at company does not seem to significantly affect attrition in this dataset.

vii) Are Employees Leaving Early (0–3 Years)?

#Create Experience Groups , Count Attrition by Experience Group
df$Experience_Group <- cut(df$Years_at_Company,
                           breaks = c(0,3,7,15,40),
                           labels = c("0-3","4-7","8-15","15+"),
                           right = TRUE)
table(df$Experience_Group, df$Attrition)
##       
##          No  Yes
##   0-3   835  200
##   4-7  1116  286
##   8-15 2267  551
##   15+  3785  960
prop.table(table(df$Experience_Group, df$Attrition), 1) * 100
##       
##              No      Yes
##   0-3  80.67633 19.32367
##   4-7  79.60057 20.39943
##   8-15 80.44713 19.55287
##   15+  79.76818 20.23182
ggplot(df %>% filter(Attrition == "Yes"),
       aes(y = Experience_Group)) +
  geom_bar(fill = "#68228B") +
  labs(title = "Attrited Employees by Experience Group",
       x = "Number of Employees",
       y = "Experience Group")

Although the absolute number of attrited employees is higher in the 15+ experience group, this is due to a larger employee base in that category. When examining attrition rates (proportional analysis), the attrition percentage remains consistent across all experience groups at approximately 20%. Therefore, experience level does not significantly influence attrition.

Career Progression & Growth Analysis

i) Does years in current role affect attrition?

tapply(df$Years_in_Current_Role, df$Attrition, mean)
##       No      Yes 
## 7.440960 7.492739

The average years in current role is almost the same for employees who stayed (7.44 years) and those who left (7.49 years). So, being in the company for more years/ less years is not a reason for attrition.

ii) Does years since last promotion influence attrition?

tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)
##       No      Yes 
## 4.474697 4.460691

The average years since last promotion is almost the same for employees who stayed (4.47 years) and those who left (4.46 years). This shows that years since last promotion does not have a significant influence on attrition in this dataset.

df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
                             "No Recent Promotion",
                             "Recently Promoted")


ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
  geom_bar(position = "fill", width = 0.5) +
  coord_flip() +
  scale_fill_manual(values = c("No" = "#5D478B", 
                               "Yes" = "mediumpurple1")) +
  labs(title = "Attrition by Promotion Status",
       x = "Promotion Status",
       y = "Proportion") +
  theme_minimal()

The proportion of employees leaving is almost similar in both groups — those recently promoted and those without a recent promotion.

This indicates that lack of recent promotion does not strongly impact attrition in this dataset.

iii) Are employees without recent promotion more likely to leave?

prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100
##                      
##                             No      Yes
##   No Recent Promotion 79.97979 20.02021
##   Recently Promoted   80.14323 19.85677

Employees without recent promotion have an attrition rate of 20.02%, while recently promoted employees have an attrition rate of 19.86%. Since the difference is very minimal, promotion status does not significantly impact employee attrition in this dataset.

SECTION - 4

PREDICTIVE MODELING

LOGISTIC REGRESSION MODEL

i) Converting Target Variable to Numeric

df$Attrition_Flag <- ifelse(df$Attrition == "Yes", 1, 0)

ii) Converting Categorical Variables to Factor

df$Gender <- as.factor(df$Gender)
df$Marital_Status <- as.factor(df$Marital_Status)
df$Department <- as.factor(df$Department)
df$Job_Role <- as.factor(df$Job_Role)
df$Overtime <- as.factor(df$Overtime)

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
numeric_data <- df[, c("Age", "Monthly_Income", "Job_Satisfaction",
                       "Work_Life_Balance", "Years_at_Company",
                       "Job_Level", "Distance_From_Home")]

cor_matrix <- cor(numeric_data)

corrplot(cor_matrix,
         method = "color",
         type = "upper",
         col = colorRampPalette(c("darkorchid", "white", "darkorchid4"))(200),
         addCoef.col = "black",
         tl.cex = 0.8)

iii) Train-Test Split (70–30)

set.seed(123)

train_index <- sample(1:nrow(df), 0.7 * nrow(df))

train <- df[train_index, ]
test  <- df[-train_index, ]

iv) Build Logistic Regression Model

model <- glm(Attrition_Flag ~ Age + Monthly_Income + Job_Satisfaction +
             Work_Life_Balance + Years_at_Company +
             Overtime + Job_Level + Distance_From_Home,
             data = train,
             family = "binomial")

v) Predict on Test Data

probabilities <- predict(model, test, type = "response")

predictions <- ifelse(probabilities > 0.5, 1, 0)

vi) Confusion Matrix

table(Predicted = predictions, Actual = test$Attrition_Flag)
##          Actual
## Predicted    0    1
##         0 2415  585

vii) Calculate Accuracy

mean(predictions == test$Attrition_Flag)
## [1] 0.805

RANDOM FOREST MODEL

i)

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf_model <- randomForest(as.factor(Attrition_Flag) ~ Age + Monthly_Income +
                         Job_Satisfaction + Work_Life_Balance +
                         Years_at_Company + Overtime +
                         Job_Level + Distance_From_Home,
                         data = train,
                         ntree = 100)

varImpPlot(rf_model)

ii)

rf_pred <- predict(rf_model, test)

table(Predicted = rf_pred, Actual = test$Attrition_Flag)
##          Actual
## Predicted    0    1
##         0 2414  581
##         1    1    4

iii)

mean(rf_pred == test$Attrition_Flag)
## [1] 0.806