SECTION - 1

UNDERSTANDING THE DATA , i) LOADING ALL NECESSARY LIBRARIES

# 
library(dplyr)   
library(readr)   
library(ggplot2)
library(caret)
library(rpart)

library(dplyr) : Loads the dplyr package, which contains useful libraries like dplyr, ggplot2 for data manipulation and visualization.

library(readr) : Loads the readr package, which is used to read data files (like CSV files) into R quickly and efficiently.

library(ggplot2) : Loads the ggplot package for data visualizations.

library(caret) : Using for the regression model.

library(rpart) : Loading the regression trees model like decision trees.

ii) LOADING THE DATASET

# 
df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)

Reading the CSV file and storing the dataset in a variable called df. Converting the dataset into a data-frame format for easier analysis in R.

# iii) CHECKING FOR COLUMNS

cat(colnames(df), sep = "\n")

## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition

Checking for all the column names of the dataset.

# iv) UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET

#DISPLAYING FEW RECORDS OF DATA

knitr::kable(head(df), caption = "First 6 Rows of Dataset")

First 6 Rows of Dataset
Employee_ID	Age	Gender	Marital_Status	Department	Job_Role	Job_Level	Monthly_Income	Hourly_Rate	Years_at_Company	Years_in_Current_Role	Years_Since_Last_Promotion	Work_Life_Balance	Job_Satisfaction	Performance_Rating	Training_Hours_Last_Year	Overtime	Project_Count	Average_Hours_Worked_Per_Week	Absenteeism	Work_Environment_Satisfaction	Relationship_with_Manager	Job_Involvement	Distance_From_Home	Number_of_Companies_Worked	Attrition
1	58	Male	Single	Finance	Manager	5	7332	81	24	12	3	1	3	2	74	No	9	48	16	4	1	1	49	3	No
2	48	Female	Divorced	HR	Assistant	4	6069	55	18	7	5	1	2	2	24	Yes	9	57	10	4	1	1	25	1	No
3	34	Female	Married	Marketing	Manager	4	11485	65	6	4	3	4	5	1	63	Yes	3	55	1	1	4	3	21	1	Yes
4	27	Female	Divorced	HR	Manager	4	18707	28	12	9	1	1	1	2	4	No	9	53	2	3	4	1	46	2	No
5	40	Male	Married	HR	Analyst	1	16398	92	3	9	1	3	4	3	62	No	1	54	11	1	1	1	43	4	No
6	58	Male	Married	Finance	Executive	3	7305	63	25	2	3	4	5	3	84	No	1	42	11	2	3	4	4	3	Yes

Displaying a snippet of the dataset to understand the data well.

# v) STRUCTURE OF THE DATASET 
str(df)

## 'data.frame':    10032 obs. of  26 variables:
##  $ Employee_ID                  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                          : num  58 48 34 27 40 58 38 42 30 30 ...
##  $ Gender                       : chr  "Male" "Female" "Female" "Female" ...
##  $ Marital_Status               : chr  "Single" "Divorced" "Married" "Divorced" ...
##  $ Department                   : chr  "Finance" "HR" "Marketing" "HR" ...
##  $ Job_Role                     : chr  "Manager" "Assistant" "Manager" "Manager" ...
##  $ Job_Level                    : num  5 4 4 4 1 3 5 1 4 4 ...
##  $ Monthly_Income               : num  7332 6069 11485 18707 16398 ...
##  $ Hourly_Rate                  : num  81 55 65 28 92 63 63 41 95 53 ...
##  $ Years_at_Company             : num  24 18 6 12 3 25 3 16 17 16 ...
##  $ Years_in_Current_Role        : num  12 7 4 9 9 2 3 8 10 14 ...
##  $ Years_Since_Last_Promotion   : num  3 5 3 1 1 3 4 0 2 4 ...
##  $ Work_Life_Balance            : num  1 1 4 1 3 4 4 2 2 1 ...
##  $ Job_Satisfaction             : num  3 2 5 1 4 5 3 4 3 4 ...
##  $ Performance_Rating           : num  2 2 1 2 3 3 4 3 3 3 ...
##  $ Training_Hours_Last_Year     : num  74 24 63 4 62 84 98 75 51 45 ...
##  $ Overtime                     : chr  "No" "Yes" "Yes" "No" ...
##  $ Project_Count                : num  9 9 3 9 1 1 1 3 8 6 ...
##  $ Average_Hours_Worked_Per_Week: num  48 57 55 53 54 42 58 45 42 41 ...
##  $ Absenteeism                  : num  16 10 1 2 11 11 16 9 4 12 ...
##  $ Work_Environment_Satisfaction: num  4 4 1 3 1 2 3 2 3 4 ...
##  $ Relationship_with_Manager    : num  1 1 4 4 1 3 3 1 3 2 ...
##  $ Job_Involvement              : num  1 1 3 1 1 4 4 4 1 1 ...
##  $ Distance_From_Home           : num  49 25 21 46 43 4 33 3 39 1 ...
##  $ Number_of_Companies_Worked   : num  3 1 1 2 4 3 1 2 4 4 ...
##  $ Attrition                    : chr  "No" "No" "Yes" "No" ...

After checking the structure of the dataset using str(df), I found that the dataset contains 10,032 records and 26 variables. Out of these, 6 columns are character (chr) type and 20 columns are numeric (num) type.

#now moving further with the data preprocessing ; checking the null, duplicate records and if existing any , removing them to remove the dedundancy.

SECTION - 2

DATA PREPROCESSING

# i) CHECKING FOR NULL VALUES IF EXIST IN DATASET

#sum(is.na(df))
colSums(is.na(df))

##                   Employee_ID                           Age 
##                             0                             0 
##                        Gender                Marital_Status 
##                             0                             0 
##                    Department                      Job_Role 
##                             0                             0 
##                     Job_Level                Monthly_Income 
##                             0                             0 
##                   Hourly_Rate              Years_at_Company 
##                             0                             0 
##         Years_in_Current_Role    Years_Since_Last_Promotion 
##                             0                             0 
##             Work_Life_Balance              Job_Satisfaction 
##                             0                             0 
##            Performance_Rating      Training_Hours_Last_Year 
##                             0                             0 
##                      Overtime                 Project_Count 
##                             0                             0 
## Average_Hours_Worked_Per_Week                   Absenteeism 
##                             0                             0 
## Work_Environment_Satisfaction     Relationship_with_Manager 
##                             0                             0 
##               Job_Involvement            Distance_From_Home 
##                             0                             0 
##    Number_of_Companies_Worked                     Attrition 
##                             0                             0

Checking for nulls if there is any in the dataset.

# ii) CHECKING FOR DUPLICACY
sum(duplicated(df))

## [1] 32

Finding that there are 32 duplicate records.

# iii) FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS

duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows

##  [1]  2181  2182  2183  2184  2185  2186  2187  2188  2189  2190  2191  2192
## [13]  2193  2194  2195  2196  2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028

Checking for the indexes of the duplicate records found.

# iv) REMOVING DUPLICATE RECORDS
df<-df[!duplicated(df), ]

Removing all duplicate records

# v) NOW AGAIN CHECHING FOR DUPLICACY
sum(duplicated(df))

## [1] 0

Now the dataset is containing all the unique records without any duplicate record.

SECTION - 3

EXPLORATORY DATA ANALYSIS(EDA)

i) What is the total number of employees?

total_employees <- nrow(df)
total_employees

## [1] 10000

:=> After cleaning the dataset , we have 10,000 records in our dataset.

ii) What is the total number of attrited employee and overall attrition rate in percentage?

table(df$Attrition)

## 
##   No  Yes 
## 8003 1997

attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent

## 
##    No   Yes 
## 80.03 19.97

attrition_counts <- as.data.frame(table(df$Attrition))

ggplot(attrition_counts, aes(x = Var1, y = Freq)) +
  geom_bar(stat = "identity", fill = "darkorchid4", width = 0.6) +
  geom_text(aes(label = Freq),
            color = "white",
            size = 5,
            vjust = 2.5) +
  labs(title = "Overall Attrition Distribution",
       x = "Attrition",
       y = "Number of Employees")

After calculating the frequency of the Attrition column :

Total employees who left (Attrition = Yes) = 1,997

Total employees who stayed (Attrition = No) = 8,003

The overall attrition rate is 19.97%, while 80.03% employees stayed in the company.

iii) Is the attrition rate dominating the employee numbers staying in the company?

minority_class <- min(attrition_percent)
if(minority_class < 30){
  cat(" The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
  print("Attrition Rate is affecting the company")
}

##  The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.

Gender-Wise Analysis

i) What is the gender distribution?

table(df$Gender)

## 
## Female   Male 
##   5042   4958

prop.table(table(df$Gender)) * 100

## 
## Female   Male 
##  50.42  49.58

Total Number of Males we have 4958 with Female 5042

ii) Is attrition rate different across gender?

prop.table(table(df$Gender, df$Attrition), 1) * 100

##         
##                No      Yes
##   Female 80.32527 19.67473
##   Male   79.72973 20.27027

After calculating the attrition rate by gender, I found that:

Female employees → 19.67% attrition

Male employees → 20.27% attrition

The attrition rate is almost similar for both genders, with males having a slightly higher percentage.

iii) Which age group has highest attrition? Creating age group then checking distribution

df$Age_Group <- cut(df$Age,
                      breaks = c(18,25,35,45,55,65),
                      labels = c("18-25","26-35","36-45","46-55","56-65"),
                      right = FALSE)

#CHECK DISTRIBUTION
table(df$Age_Group, df$Attrition)

##        
##           No  Yes
##   18-25  958  272
##   26-35 2009  442
##   36-45 2055  526
##   46-55 2006  502
##   56-65  975  255

prop.table(table(df$Age_Group, df$Attrition), 1) * 100

##        
##               No      Yes
##   18-25 77.88618 22.11382
##   26-35 81.96654 18.03346
##   36-45 79.62030 20.37970
##   46-55 79.98405 20.01595
##   56-65 79.26829 20.73171

ggplot(df, aes(x = Age_Group, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("Yes" = "#5D478B",
                               "No" = "mediumpurple1")) +
  labs(title = "Attrition by Age Group",
       x = "Age Group",
       y = "Proportion")

The “18–25” age group has the highest attrition rate with 272 employees in numbers and approx 22% in percent leaving from this group.

Financial Analysis

i) Does monthly income affect attrition?

mean(subset(df, Attrition == "Yes")$Monthly_Income)

## [1] 11438.04

mean(subset(df, Attrition == "No")$Monthly_Income)

## [1] 11436.39

median(subset(df, Attrition == "Yes")$Monthly_Income)

## [1] 11323

median(subset(df, Attrition == "No")$Monthly_Income)

## [1] 11417

the mean and median is almost same so this implies monthly income does not appear to significantly affect attrition in this dataset.

ii) Do Employee with Lower Income are leaving more?

#Create Income Categories
df$Income_Group <- cut(df$Monthly_Income,
                         breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
                         labels = c("Low", "Medium", "High"),
                         include.lowest = TRUE)

# Check Attrition by Income Group
prop.table(table(df$Income_Group, df$Attrition), 1) * 100

##         
##                No      Yes
##   Low    79.90909 20.09091
##   Medium 80.12121 19.87879
##   High   80.05882 19.94118

After dividing employees into Low, Medium, and High income groups and calculating attrition percentage:

Low income → 20.09% attrition

Medium income → 19.88% attrition

High income → 19.94% attrition

The attrition rate is almost the same across all income groups.

Therefore, lower income employees do not leave more than others, and income level does not seem to significantly impact attrition in this dataset.

Department and Growth Analysis

i) Which department has highest attrition rate?

table(df$Department, df$Attrition)

##            
##               No  Yes
##   Finance   1575  415
##   HR        1572  381
##   IT        1526  390
##   Marketing 1720  413
##   Sales     1610  398

dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100

ggplot(df %>% filter(Attrition == "Yes"),
       aes(x = Department)) +
  geom_bar(fill = "#9370DB", width = 0.5) +
  labs(title = "Number of Employees Who Left by Department",
       x = "Department",
       y = "Count of Attrited Employees")

The “Finance department” has the highest attrition rate with 415 employees, followed closely by IT with 390.

ii) Does Employee Tenure Influence Satisfaction (and Potential Attrition Risk)?

df_summary <- df %>%
  group_by(Years_at_Company) %>%
  summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))

ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
  geom_line(color = "#B23AEE", linewidth = 1) +
  labs(title = "Average Job Satisfaction by Years at Company",
       x = "Years at Company",
       y = "Average Job Satisfaction")

The average job satisfaction is almost same across different years at the company, with only small ups and downs. There is no major increasing or decreasing trend, which means years at company does not strongly affect satisfaction or attrition.”From the chart we can depict that “Satisfaction stays between 2.9 and 3.2” and fluctuations are there but not to an extent, “Although , Around 15 years, satisfaction dips slightly”..

iii) Which job role has highest attrition?

attrition_table <- table(df$Job_Role, df$Attrition)
leaving_counts <- attrition_table[, "Yes"]
percentages <- round(leaving_counts / sum(leaving_counts) * 100, 1)
labels <- paste(names(percentages), percentages, "%")
pie(
  leaving_counts,
  labels = labels,
  col = c("darkorchid4", "#AB82FF", "#9A32CD", "#BA55D3"),
  main = "Employees Leaving by Job Role",
  border = "white"
)

The Assistant role has the highest attrition rate (≈21.43%).

iv) Does job level affect attrition?

level_table <- table(df$Job_Level, df$Attrition)

round(prop.table(level_table, margin = 1) * 100, 1)

##    
##       No  Yes
##   1 80.6 19.4
##   2 80.8 19.2
##   3 80.2 19.8
##   4 78.3 21.7
##   5 80.2 19.8

Job Level 4 has the highest attrition rate (≈21.71%).Therefore, qe can depict that job level does affect attrition.

v) Does overtime increase attrition?

table(df$Overtime, df$Attrition)

##      
##         No  Yes
##   No  4089 1014
##   Yes 3914  983

:=> The attrition rate is almost the same for employees who do overtime and those who do not. Therefore, overtime does not significantly increase attrition in this dataset.

vi) Does Years at Company Affect Attrition?

mean(subset(df, Attrition == "Yes")$Years_at_Company)

## [1] 14.98147

mean(subset(df, Attrition == "No")$Years_at_Company)

## [1] 14.9249

The average years at company is almost the same for both groups. Therefore, years at company does not seem to significantly affect attrition in this dataset.

vii) Are Employees Leaving Early (0–3 Years)?

#Create Experience Groups , Count Attrition by Experience Group
df$Experience_Group <- cut(df$Years_at_Company,
                           breaks = c(0,3,7,15,40),
                           labels = c("0-3","4-7","8-15","15+"),
                           right = TRUE)

table(df$Experience_Group, df$Attrition)

##       
##          No  Yes
##   0-3   835  200
##   4-7  1116  286
##   8-15 2267  551
##   15+  3785  960

prop.table(table(df$Experience_Group, df$Attrition), 1) * 100

##       
##              No      Yes
##   0-3  80.67633 19.32367
##   4-7  79.60057 20.39943
##   8-15 80.44713 19.55287
##   15+  79.76818 20.23182

ggplot(df %>% filter(Attrition == "Yes"),
       aes(y = Experience_Group)) +
  geom_bar(fill = "#68228B") +
  labs(title = "Attrited Employees by Experience Group",
       x = "Number of Employees",
       y = "Experience Group")

Although the absolute number of attrited employees is higher in the 15+ experience group, this is due to a larger employee base in that category. When examining attrition rates (proportional analysis), the attrition percentage remains consistent across all experience groups at approximately 20%. Therefore, experience level does not significantly influence attrition.

Career Progression & Growth Analysis

i) Does years in current role affect attrition?

tapply(df$Years_in_Current_Role, df$Attrition, mean)

##       No      Yes 
## 7.440960 7.492739

The average years in current role is almost the same for employees who stayed (7.44 years) and those who left (7.49 years). So, being in the company for more years/ less years is not a reason for attrition.

ii) Does years since last promotion influence attrition?

tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)

##       No      Yes 
## 4.474697 4.460691

The average years since last promotion is almost the same for employees who stayed (4.47 years) and those who left (4.46 years). This shows that years since last promotion does not have a significant influence on attrition in this dataset.

df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
                             "No Recent Promotion",
                             "Recently Promoted")


ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
  geom_bar(position = "fill", width = 0.5) +
  coord_flip() +
  scale_fill_manual(values = c("No" = "#5D478B", 
                               "Yes" = "mediumpurple1")) +
  labs(title = "Attrition by Promotion Status",
       x = "Promotion Status",
       y = "Proportion") +
  theme_minimal()

The proportion of employees leaving is almost similar in both groups — those recently promoted and those without a recent promotion. This indicates that lack of recent promotion does not strongly impact attrition in this dataset.

iii) Are employees without recent promotion more likely to leave?

prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100

##                      
##                             No      Yes
##   No Recent Promotion 79.97979 20.02021
##   Recently Promoted   80.14323 19.85677

Employees without recent promotion have an attrition rate of 20.02%, while recently promoted employees have an attrition rate of 19.86%. Since the difference is very minimal, promotion status does not significantly impact employee attrition in this dataset.

SECTION - 4

PREDICTIVE MODELING

LOGISTIC REGRESSION

i) Converting Target Variable to Factor

df$Attrition <- as.factor(df$Attrition)

we are converting it to factor as Our Target Variable (Attrition) is a categorical column.

ii) Train-Test Split

set.seed(123)

train_index <- createDataPartition(df$Attrition, p = 0.7, list = FALSE)

train_data <- df[train_index, ]
test_data  <- df[-train_index, ]

fixing the random value by using seed ; using caret library for creating a data Partition for testing and training, 70% data for training and 30% for testing.

iii) Checking train-test data

dim(train_data)

## [1] 7001   30

dim(test_data)

## [1] 2999   30

checking for testing training data records

iv) Build Logistic Regression Model

model <- glm(Attrition ~ .,
                   data = train_data,
                   family = "binomial",
             weights = ifelse(train_data$Attrition == "Yes", 3, 1))

Predicting Attrition using all other variables as predictors in train_data. alongside, using binomial as family because we are doing logistic regression, with setting weights as 3,1 because the yes comes inside minority class (approx 20%).Assigned higher weight to the minority class (‘Yes’) so that the model gives more importance to correctly identifying attrition cases.”

v) Model Summary

summary(model)

## 
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = train_data, 
##     weights = ifelse(train_data$Attrition == "Yes", 3, 1))
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.094e+00  3.002e-01  -3.645 0.000267 ***
## Employee_ID                      -5.307e-06  7.149e-06  -0.742 0.457839    
## Age                               1.826e-02  7.969e-03   2.291 0.021959 *  
## GenderMale                        4.095e-02  4.114e-02   0.995 0.319658    
## Marital_StatusMarried            -5.033e-02  5.042e-02  -0.998 0.318207    
## Marital_StatusSingle              2.324e-02  5.023e-02   0.463 0.643542    
## DepartmentHR                     -4.511e-02  6.569e-02  -0.687 0.492290    
## DepartmentIT                     -9.450e-03  6.550e-02  -0.144 0.885289    
## DepartmentMarketing              -2.321e-03  6.394e-02  -0.036 0.971036    
## DepartmentSales                  -3.467e-02  6.488e-02  -0.534 0.593063    
## Job_RoleAssistant                 8.591e-02  5.679e-02   1.513 0.130334    
## Job_RoleExecutive                -1.005e-02  5.745e-02  -0.175 0.861189    
## Job_RoleManager                  -1.807e-01  5.942e-02  -3.041 0.002359 ** 
## Job_Level                         2.224e-02  1.463e-02   1.520 0.128401    
## Monthly_Income                    1.130e-05  1.245e-05   0.908 0.363835    
## Hourly_Rate                      -1.241e-03  8.265e-04  -1.501 0.133317    
## Years_at_Company                  6.195e-04  6.780e-03   0.091 0.927197    
## Years_in_Current_Role             2.550e-04  5.080e-03   0.050 0.959975    
## Years_Since_Last_Promotion       -9.009e-03  1.196e-02  -0.753 0.451392    
## Work_Life_Balance                 4.478e-02  1.855e-02   2.414 0.015772 *  
## Job_Satisfaction                  2.556e-02  1.459e-02   1.752 0.079713 .  
## Performance_Rating                4.509e-02  1.846e-02   2.443 0.014563 *  
## Training_Hours_Last_Year         -2.666e-04  7.152e-04  -0.373 0.709289    
## OvertimeYes                       3.449e-02  4.113e-02   0.838 0.401758    
## Project_Count                    -3.232e-03  7.997e-03  -0.404 0.686076    
## Average_Hours_Worked_Per_Week     4.284e-03  2.384e-03   1.797 0.072341 .  
## Absenteeism                       7.458e-04  3.542e-03   0.211 0.833239    
## Work_Environment_Satisfaction    -9.975e-03  1.843e-02  -0.541 0.588316    
## Relationship_with_Manager         1.209e-04  1.857e-02   0.007 0.994805    
## Job_Involvement                   3.059e-02  1.855e-02   1.650 0.099044 .  
## Distance_From_Home                1.832e-03  1.443e-03   1.269 0.204494    
## Number_of_Companies_Worked       -2.333e-02  1.847e-02  -1.263 0.206678    
## Age_Group26-35                   -2.675e-01  9.392e-02  -2.848 0.004393 ** 
## Age_Group36-45                   -2.868e-01  1.561e-01  -1.838 0.066110 .  
## Age_Group46-55                   -5.023e-01  2.308e-01  -2.176 0.029551 *  
## Age_Group56-65                   -6.565e-01  2.904e-01  -2.260 0.023796 *  
## Income_GroupMedium               -7.124e-02  8.496e-02  -0.839 0.401741    
## Income_GroupHigh                 -1.415e-01  1.500e-01  -0.944 0.345330    
## Experience_Group4-7              -8.186e-02  8.660e-02  -0.945 0.344506    
## Experience_Group8-15             -4.811e-02  9.713e-02  -0.495 0.620368    
## Experience_Group15+              -7.160e-02  1.563e-01  -0.458 0.646973    
## Promotion_GroupRecently Promoted -4.913e-02  7.465e-02  -0.658 0.510445    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13378  on 7000  degrees of freedom
## Residual deviance: 13305  on 6959  degrees of freedom
## AIC: 13389
## 
## Number of Fisher Scoring iterations: 4

The model summary shows the estimated coefficients, standard errors, z-values, and p-values for each predictor variable. These help us understand which variables significantly influence attrition.

Age (p = 0.0219) , Age has a statistically significant effect on attrition. The positive coefficient suggests that with increase in age, probability of attrition slightly increases, although the effect size is small.

Work_Life_Balance (p = 0.015) , Work-life balance shows statistical significance. This suggests it plays a role in predicting attrition

Performance_Rating (p = 0.014) , Performance rating also significantly influences attrition.

Many variables such as income, overtime, years at company, and promotion status do not show statistical significance, which confirms the EDA findings that these factors do not strongly influence attrition in this dataset.

The reduction from null deviance to residual deviance is small, which indicates that the model explains only a limited portion of variation in attrition.”

vi) Predictions

prob_predictions <- predict(model, test_data, type = "response")

predicted_class <- ifelse(prob_predictions > 0.5, "Yes", "No")
predicted_class <- as.factor(predicted_class)

head(prob_predictions)

##         3         4         5        10        11        14 
## 0.4211484 0.3385969 0.4477636 0.4043930 0.4389556 0.5136492

#head(predicted_class)

After building the logistic regression model, I used the predict() function to generate predictions on the test dataset. Since logistic regression gives probabilities, I used type = “response” to get the predicted probability of Attrition = “Yes”.

Looking to the values coming as the output : These values represent the probability that an employee will leave the company. For example, 0.42 means there is a 42% chance that the employee will leave. 0.51 means there is a 51% chance that the employee will leave.

vii) Confusion Matrix

test_data$Attrition <- factor(test_data$Attrition, 
                               levels = c("Yes", "No"))

predicted_class <- factor(predicted_class, 
                          levels = c("Yes", "No"))

confusionMatrix(predicted_class, test_data$Attrition)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Yes   No
##        Yes   23   88
##        No   576 2312
##                                           
##                Accuracy : 0.7786          
##                  95% CI : (0.7633, 0.7933)
##     No Information Rate : 0.8003          
##     P-Value [Acc > NIR] : 0.9984          
##                                           
##                   Kappa : 0.0025          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.038397        
##             Specificity : 0.963333        
##          Pos Pred Value : 0.207207        
##          Neg Pred Value : 0.800554        
##              Prevalence : 0.199733        
##          Detection Rate : 0.007669        
##    Detection Prevalence : 0.037012        
##       Balanced Accuracy : 0.500865        
##                                           
##        'Positive' Class : Yes             
##

Out of all employees who actually left, the model correctly identified only 23 employees, while it missed 576 employees who actually left.

Accuracy = 0.7786 (77.86%)

Although the model shows around 78% accuracy, it fails to identify employees who are likely to leave. The sensitivity is extremely low at 3.8%, meaning most attrition cases are missed. Therefore, the model is biased toward predicting employees who stay in the organization.

DECISION TREE

i)

tree_model <- rpart(Attrition ~ ., data = train_data, method = "class")

tree_pred <- predict(tree_model, test_data, type = "class")

confusionMatrix(tree_pred, test_data$Attrition)

## Warning in confusionMatrix.default(tree_pred, test_data$Attrition): Levels are
## not in the same order for reference and data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Yes   No
##        Yes    0    0
##        No   599 2400
##                                           
##                Accuracy : 0.8003          
##                  95% CI : (0.7855, 0.8144)
##     No Information Rate : 0.8003          
##     P-Value [Acc > NIR] : 0.5109          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8003          
##              Prevalence : 0.1997          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
##

This, the decision tree model predicts only the majority class, which is employees staying in the company. As a result, it completely fails to detect attrition cases

model_names <- c("Logistic Regression", "Decision Tree")
accuracy <- c(0.7786, 0.8003)

comparison <- data.frame(model_names, accuracy)

library(ggplot2)

ggplot(comparison, aes(x = model_names, y = accuracy)) +
  geom_bar(stat = "identity", fill ="purple3", width = 0.4)  +
  labs(title = "Model Accuracy Comparison",
       x = "Models",
       y = "Accuracy")

Final Conclusion: This analysis shows that there is no single variable that alone affects employee attrition. Both EDA and predictive models like Logistic Regression and Decision Tree indicate that attrition is influenced by multiple factors together, rather than one major reason.

Employee Attrition Analysis

Rishita Tripathi

2026-04-14