#LOADING ALL NECESSARY LIBRARIES

#library(dplyr) #Loads the dplyr package, which contains useful libraries like dplyr, ggplot2 for data manipulation and visualization.

library(readr)

#Loads the readr package, which is used to read data files (like CSV files) into R quickly and efficiently.

library(ggplot2)

#Loads the ggplot package for data visualizations.

#LOADING THE DATASET

df <- read_csv("Employee_Attrition_Analysis.csv")
df <- as.data.frame(df)

:=> Reading the CSV file and storing the dataset in a variable called df.Converting the dataset into a data-frame format for easier analysis in R.

#CHECKING FOR COLUMNS

cat(colnames(df), sep = "\n")
## Employee_ID
## Age
## Gender
## Marital_Status
## Department
## Job_Role
## Job_Level
## Monthly_Income
## Hourly_Rate
## Years_at_Company
## Years_in_Current_Role
## Years_Since_Last_Promotion
## Work_Life_Balance
## Job_Satisfaction
## Performance_Rating
## Training_Hours_Last_Year
## Overtime
## Project_Count
## Average_Hours_Worked_Per_Week
## Absenteeism
## Work_Environment_Satisfaction
## Relationship_with_Manager
## Job_Involvement
## Distance_From_Home
## Number_of_Companies_Worked
## Attrition

:=> Checking for all the column names of the dataset.

#UNDERSTANDING THE STRUCTURE OF DATA & DISPLAYING SNIPPET OF DATASET

#DISPLAYING FEW RECORDS OF DATA

knitr::kable(head(df), caption = "First 6 Rows of Dataset")
First 6 Rows of Dataset
Employee_ID Age Gender Marital_Status Department Job_Role Job_Level Monthly_Income Hourly_Rate Years_at_Company Years_in_Current_Role Years_Since_Last_Promotion Work_Life_Balance Job_Satisfaction Performance_Rating Training_Hours_Last_Year Overtime Project_Count Average_Hours_Worked_Per_Week Absenteeism Work_Environment_Satisfaction Relationship_with_Manager Job_Involvement Distance_From_Home Number_of_Companies_Worked Attrition
1 58 Male Single Finance Manager 5 7332 81 24 12 3 1 3 2 74 No 9 48 16 4 1 1 49 3 No
2 48 Female Divorced HR Assistant 4 6069 55 18 7 5 1 2 2 24 Yes 9 57 10 4 1 1 25 1 No
3 34 Female Married Marketing Manager 4 11485 65 6 4 3 4 5 1 63 Yes 3 55 1 1 4 3 21 1 Yes
4 27 Female Divorced HR Manager 4 18707 28 12 9 1 1 1 2 4 No 9 53 2 3 4 1 46 2 No
5 40 Male Married HR Analyst 1 16398 92 3 9 1 3 4 3 62 No 1 54 11 1 1 1 43 4 No
6 58 Male Married Finance Executive 3 7305 63 25 2 3 4 5 3 84 No 1 42 11 2 3 4 4 3 Yes

:=> just displaying a snippet of the dataset to understand the data well.

#STRUCTURE OF THE DATASET

str(df)
## 'data.frame':    10032 obs. of  26 variables:
##  $ Employee_ID                  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                          : num  58 48 34 27 40 58 38 42 30 30 ...
##  $ Gender                       : chr  "Male" "Female" "Female" "Female" ...
##  $ Marital_Status               : chr  "Single" "Divorced" "Married" "Divorced" ...
##  $ Department                   : chr  "Finance" "HR" "Marketing" "HR" ...
##  $ Job_Role                     : chr  "Manager" "Assistant" "Manager" "Manager" ...
##  $ Job_Level                    : num  5 4 4 4 1 3 5 1 4 4 ...
##  $ Monthly_Income               : num  7332 6069 11485 18707 16398 ...
##  $ Hourly_Rate                  : num  81 55 65 28 92 63 63 41 95 53 ...
##  $ Years_at_Company             : num  24 18 6 12 3 25 3 16 17 16 ...
##  $ Years_in_Current_Role        : num  12 7 4 9 9 2 3 8 10 14 ...
##  $ Years_Since_Last_Promotion   : num  3 5 3 1 1 3 4 0 2 4 ...
##  $ Work_Life_Balance            : num  1 1 4 1 3 4 4 2 2 1 ...
##  $ Job_Satisfaction             : num  3 2 5 1 4 5 3 4 3 4 ...
##  $ Performance_Rating           : num  2 2 1 2 3 3 4 3 3 3 ...
##  $ Training_Hours_Last_Year     : num  74 24 63 4 62 84 98 75 51 45 ...
##  $ Overtime                     : chr  "No" "Yes" "Yes" "No" ...
##  $ Project_Count                : num  9 9 3 9 1 1 1 3 8 6 ...
##  $ Average_Hours_Worked_Per_Week: num  48 57 55 53 54 42 58 45 42 41 ...
##  $ Absenteeism                  : num  16 10 1 2 11 11 16 9 4 12 ...
##  $ Work_Environment_Satisfaction: num  4 4 1 3 1 2 3 2 3 4 ...
##  $ Relationship_with_Manager    : num  1 1 4 4 1 3 3 1 3 2 ...
##  $ Job_Involvement              : num  1 1 3 1 1 4 4 4 1 1 ...
##  $ Distance_From_Home           : num  49 25 21 46 43 4 33 3 39 1 ...
##  $ Number_of_Companies_Worked   : num  3 1 1 2 4 3 1 2 4 4 ...
##  $ Attrition                    : chr  "No" "No" "Yes" "No" ...

:=> After checking the structure of the dataset using str(df), I found that the dataset contains 10,032 records and 26 variables. Out of these, 6 columns are character (chr) type and 20 columns are numeric (num) type.

#DATA PREPROCESSING

#CHECKING FOR NULL VALUES IF EXIST IN DATASET

#sum(is.na(df))
colSums(is.na(df))
##                   Employee_ID                           Age 
##                             0                             0 
##                        Gender                Marital_Status 
##                             0                             0 
##                    Department                      Job_Role 
##                             0                             0 
##                     Job_Level                Monthly_Income 
##                             0                             0 
##                   Hourly_Rate              Years_at_Company 
##                             0                             0 
##         Years_in_Current_Role    Years_Since_Last_Promotion 
##                             0                             0 
##             Work_Life_Balance              Job_Satisfaction 
##                             0                             0 
##            Performance_Rating      Training_Hours_Last_Year 
##                             0                             0 
##                      Overtime                 Project_Count 
##                             0                             0 
## Average_Hours_Worked_Per_Week                   Absenteeism 
##                             0                             0 
## Work_Environment_Satisfaction     Relationship_with_Manager 
##                             0                             0 
##               Job_Involvement            Distance_From_Home 
##                             0                             0 
##    Number_of_Companies_Worked                     Attrition 
##                             0                             0

:=> checking for nulls if exist there in the dataset

#CHECKING FOR DUPLICACY

sum(duplicated(df))
## [1] 32

:=> finding that there are 32 duplicate records.

#FINDING THE INDEX POSITION OF DUPLICATE ROWS/RECORDS

duplicate_rows <- which(duplicated(df), arr.ind = TRUE)
duplicate_rows
##  [1]  2181  2182  2183  2184  2185  2186  2187  2188  2189  2190  2191  2192
## [13]  2193  2194  2195  2196  2197 10014 10015 10016 10017 10018 10019 10020
## [25] 10021 10022 10023 10024 10025 10026 10027 10028

:=> checking for the indexes of the duplicate records found.

#REMOVING DUPLICATE RECORDS

df<-df[!duplicated(df), ]

:=> removing all duplicate records

#NOW AGAIN CHECHING FOR DUPLICACY

sum(duplicated(df))
## [1] 0

:=> finding that now the dataset is containing all unique records without any duplicate record.

#EXPLORATORY DATA ANALYSIS

#SECTION 1: Understanding the Target Variable

#1. What is the total number of employees?

total_employees <- nrow(df)
total_employees
## [1] 10000

:=> after cleaning the dataset , we have 10,000 records in our dataset.

#2. What is the total number of attrited employee and overall attrition rate in percentage?

table(df$Attrition)
## 
##   No  Yes 
## 8003 1997
attrition_percent <-prop.table(table(df$Attrition)) * 100
attrition_percent
## 
##    No   Yes 
## 80.03 19.97
attrition_counts <- as.data.frame(table(df$Attrition))
colnames(attrition_counts) <- c("Attrition", "Count")

ggplot(attrition_counts, aes(x = "", y = Count, fill = Attrition)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = c("No" = "darkorchid1",
                               "Yes" = "darkorchid4")) +
  labs(title = "Overall Attrition Distribution") +
  theme_void()

:=> After calculating the frequency of the Attrition column, I found that:

Total employees who left (Attrition = Yes) = 1,997 Total employees who stayed (Attrition = No) = 8,003

The overall attrition rate is 19.97%, while 80.03% employees stayed in the company.

#3. Is the dataset imbalanced?

If:

Attrition Yes < 30% Attrition No > 70%

Then dataset is imbalanced.

minority_class <- min(attrition_percent)
if(minority_class < 30){
  cat(":=> The attrition rate is approximately", minority_class,"% indicating that employee attrition is present but not dominant in the company.")
} else {
  print("Attrition Rate is affecting the company")
}
## :=> The attrition rate is approximately 19.97 % indicating that employee attrition is present but not dominant in the company.

#SECTION 2: Gender-Wise Analysis

#1. What is the gender distribution?

table(df$Gender)
## 
## Female   Male 
##   5042   4958
prop.table(table(df$Gender)) * 100
## 
## Female   Male 
##  50.42  49.58

:=> Total Number of Males we have 4958 with Female 5042

#2.Is attrition rate different across gender?

prop.table(table(df$Gender, df$Attrition), 1) * 100
##         
##                No      Yes
##   Female 80.32527 19.67473
##   Male   79.72973 20.27027

:=> After calculating the attrition rate by gender, I found that:

Female employees → 19.67% attrition Male employees → 20.27% attrition

The attrition rate is almost similar for both genders, with males having a slightly higher percentage.

#3. Which age group has highest attrition?

#CREATING AGE GROUPS

df$Age_Group <- cut(df$Age,
                      breaks = c(18,25,35,45,55,65),
                      labels = c("18-25","26-35","36-45","46-55","56-65"),
                      right = FALSE)

#CHECK DISTRIBUTION

table(df$Age_Group, df$Attrition)
##        
##           No  Yes
##   18-25  958  272
##   26-35 2009  442
##   36-45 2055  526
##   46-55 2006  502
##   56-65  975  255
prop.table(table(df$Age_Group, df$Attrition), 1) * 100
##        
##               No      Yes
##   18-25 77.88618 22.11382
##   26-35 81.96654 18.03346
##   36-45 79.62030 20.37970
##   46-55 79.98405 20.01595
##   56-65 79.26829 20.73171

:=> The “18–25” age group has the highest attrition rate with 272 employees leaving from this group.

ggplot(df, aes(x = Age_Group, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("Yes" = "#5D478B",
                               "No" = "mediumpurple1")) +
  labs(title = "Attrition by Age Group",
       x = "Age Group",
       y = "Proportion")

#SECTION 3: Financial Analysis

#1. Does monthly income affect attrition?

mean(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11438.04
mean(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11436.39
median(subset(df, Attrition == "Yes")$Monthly_Income)
## [1] 11323
median(subset(df, Attrition == "No")$Monthly_Income)
## [1] 11417

:=> the mean and median is almost same so this implies monthly income does not appear to significantly affect attrition in this dataset

#2. Do Lower Income Employees Leave More?

#Create Income Categories

df$Income_Group <- cut(df$Monthly_Income,
                         breaks = quantile(df$Monthly_Income, probs = c(0, 0.33, 0.66, 1)),
                         labels = c("Low", "Medium", "High"),
                         include.lowest = TRUE)

#Check Attrition by Income Group

prop.table(table(df$Income_Group, df$Attrition), 1) * 100
##         
##                No      Yes
##   Low    79.90909 20.09091
##   Medium 80.12121 19.87879
##   High   80.05882 19.94118

:=> After dividing employees into Low, Medium, and High income groups and calculating attrition percentage:

Low income → 20.09% attrition Medium income → 19.88% attrition High income → 19.94% attrition

The attrition rate is almost the same across all income groups.

Therefore, lower income employees do not leave more than others, and income level does not seem to significantly impact attrition in this dataset.

#SECTION 4: Department and Growth Analysis

#1. Which department has highest attrition rate?

table(df$Department, df$Attrition)
##            
##               No  Yes
##   Finance   1575  415
##   HR        1572  381
##   IT        1526  390
##   Marketing 1720  413
##   Sales     1610  398
dept_table <- prop.table(table(df$Department, df$Attrition), 1) * 100

:=> The “Finance department” has the highest attrition rate with 415 employees, followed closely by IT with 390.

ggplot(df %>% filter(Attrition == "Yes"),
       aes(x = Department)) +
  geom_bar(fill = "#9370DB", width = 0.5) +
  labs(title = "Number of Employees Who Left by Department",
       x = "Department",
       y = "Count of Attrited Employees") 

#2. Does Employee Tenure Influence Satisfaction (and Potential Attrition Risk)?

df_summary <- df %>%
  group_by(Years_at_Company) %>%
  summarise(avg_satisfaction = mean(Job_Satisfaction, na.rm = TRUE))

ggplot(df_summary, aes(x = Years_at_Company, y = avg_satisfaction)) +
  geom_line(color = "#B23AEE", linewidth = 1) +
  labs(title = "Average Job Satisfaction by Years at Company",
       x = "Years at Company",
       y = "Average Job Satisfaction")

:=> The average job satisfaction is almost same across different years at the company, with only small ups and downs. There is no major increasing or decreasing trend, which means years at company does not strongly affect satisfaction or attrition.

#3.Which job role has highest attrition?

table(df$Job_Role, df$Attrition)
##            
##               No  Yes
##   Analyst   2052  520
##   Assistant 1994  544
##   Executive 1979  497
##   Manager   1978  436
prop.table(table(df$Job_Role, df$Attrition), 1) * 100
##            
##                   No      Yes
##   Analyst   79.78227 20.21773
##   Assistant 78.56580 21.43420
##   Executive 79.92730 20.07270
##   Manager   81.93869 18.06131

:=> The Assistant role has the highest attrition rate (≈21.43%).

#4. Does job level affect attrition?

table(df$Job_Level, df$Attrition)
##    
##       No  Yes
##   1 1597  385
##   2 1674  398
##   3 1588  391
##   4 1558  432
##   5 1586  391

:=> Job Level 4 has the highest attrition rate (≈21.71%).

#5. Does overtime increase attrition?

table(df$Overtime, df$Attrition)
##      
##         No  Yes
##   No  4089 1014
##   Yes 3914  983

:=> The attrition rate is almost the same for employees who do overtime and those who do not.

Therefore, overtime does not significantly increase attrition in this dataset.

#6. Does Years at Company Affect Attrition?

mean(subset(df, Attrition == "Yes")$Years_at_Company)
## [1] 14.98147
mean(subset(df, Attrition == "No")$Years_at_Company)
## [1] 14.9249

:=> The average years at company is almost the same for both groups.

Therefore, years at company does not seem to significantly affect attrition in this dataset.

#7. Are Employees Leaving Early (0–3 Years)?

#Create Experience Groups

df$Experience_Group <- cut(df$Years_at_Company,
                           breaks = c(0,3,7,15,40),
                           labels = c("0-3","4-7","8-15","15+"),
                           right = TRUE)

#Count Attrition by Experience Group

table(df$Experience_Group, df$Attrition)
##       
##          No  Yes
##   0-3   835  200
##   4-7  1116  286
##   8-15 2267  551
##   15+  3785  960

:=> BY looking at the numbers the attrition rate for all is almost same so in conclusion, the employers are not leaving early.

ggplot(df %>% filter(Attrition == "Yes"),
       aes(y = Experience_Group)) +
  geom_bar(fill = "#68228B") +
  labs(title = "Attrited Employees by Experience Group",
       x = "Number of Employees",
       y = "Experience Group")

#SECTION 5 : Career Progression & Growth Analysis

#1. Does years in current role affect attrition?

tapply(df$Years_in_Current_Role, df$Attrition, mean)
##       No      Yes 
## 7.440960 7.492739

:=> The average years in current role is almost the same for employees who stayed (7.44 years) and those who left (7.49 years).

#2. Does years since last promotion influence attrition?

tapply(df$Years_Since_Last_Promotion, df$Attrition, mean)
##       No      Yes 
## 4.474697 4.460691

#3. Are employees without recent promotion more likely to leave?

df$Promotion_Group <- ifelse(df$Years_Since_Last_Promotion >= 3,
                             "No Recent Promotion",
                             "Recently Promoted")


ggplot(df, aes(x = Promotion_Group, fill = Attrition)) +
  geom_bar(position = "fill", width = 0.5) +
  coord_flip() +
  scale_fill_manual(values = c("No" = "#5D478B", 
                               "Yes" = "mediumpurple1")) +
  labs(title = "Attrition by Promotion Status",
       x = "Promotion Status",
       y = "Proportion") +
  theme_minimal()

prop.table(table(df$Promotion_Group, df$Attrition), 1) * 100
##                      
##                             No      Yes
##   No Recent Promotion 79.97979 20.02021
##   Recently Promoted   80.14323 19.85677

:=> Employees without recent promotion have an attrition rate of 20.02%, while recently promoted employees have an attrition rate of 19.86%. Since the difference is very minimal, promotion status does not significantly impact employee attrition in this dataset.

#SECTION 6 : Predictive Modeling

#1. Converting Target Variable to Numeric

df$Attrition_Flag <- ifelse(df$Attrition == "Yes", 1, 0)

#2. Convert Categorical Variables to Factor

df$Gender <- as.factor(df$Gender)
df$Marital_Status <- as.factor(df$Marital_Status)
df$Department <- as.factor(df$Department)
df$Job_Role <- as.factor(df$Job_Role)
df$Overtime <- as.factor(df$Overtime)

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
numeric_data <- df[, c("Age", "Monthly_Income", "Job_Satisfaction",
                       "Work_Life_Balance", "Years_at_Company",
                       "Job_Level", "Distance_From_Home")]

cor_matrix <- cor(numeric_data)

corrplot(cor_matrix,
         method = "color",
         type = "upper",
         col = colorRampPalette(c("darkorchid", "white", "darkorchid4"))(200),
         addCoef.col = "black",
         tl.cex = 0.8)

#3. Train-Test Split (70–30)

set.seed(123)

train_index <- sample(1:nrow(df), 0.7 * nrow(df))

train <- df[train_index, ]
test  <- df[-train_index, ]

#4. Build Logistic Regression Model

model <- glm(Attrition_Flag ~ Age + Monthly_Income + Job_Satisfaction +
             Work_Life_Balance + Years_at_Company +
             Overtime + Job_Level + Distance_From_Home,
             data = train,
             family = "binomial")

#5. Predict on Test Data

probabilities <- predict(model, test, type = "response")

predictions <- ifelse(probabilities > 0.5, 1, 0)

#5. Confusion Matrix

table(Predicted = predictions, Actual = test$Attrition_Flag)
##          Actual
## Predicted    0    1
##         0 2415  585

#6. Calculate Accuracy

mean(predictions == test$Attrition_Flag)
## [1] 0.805

#SECTION 7 : TRYING RANDOM FOREST

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf_model <- randomForest(as.factor(Attrition_Flag) ~ Age + Monthly_Income +
                         Job_Satisfaction + Work_Life_Balance +
                         Years_at_Company + Overtime +
                         Job_Level + Distance_From_Home,
                         data = train,
                         ntree = 100)

varImpPlot(rf_model)

rf_pred <- predict(rf_model, test)

table(Predicted = rf_pred, Actual = test$Attrition_Flag)
##          Actual
## Predicted    0    1
##         0 2414  581
##         1    1    4
mean(rf_pred == test$Attrition_Flag)
## [1] 0.806