An HR Management Development Program is often structured to provide a thorough blend of theoretical knowledge and practical experience, ensuring that participants gain well-rounded competencies. The program is frequently separated into main modules that address essential HR responsibilities like as recruitment, talent management, employee engagement, and labor law compliance
# Install required packages if missing
packages <- c("skimr", "DataExplorer", "corrplot", "ggplot2", "dplyr", "cluster", "factoextra", "caret", "randomForest", "gbm")
installed <- packages %in% installed.packages()
if (any(!installed)) install.packages(packages[!installed])
# Load packages
library(skimr)
library(DataExplorer)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(reshape2)
set.seed(123)
# Load data
attrition <- read.csv("https://raw.githubusercontent.com/harithjamadi/Employee-Attrition-and-Income-Prediction/main/HR.csv")
head(attrition)
## Age Attrition BusinessTravel DailyRate Department
## 1 41 Yes Travel_Rarely 1102 Sales
## 2 49 No Travel_Frequently 279 Research & Development
## 3 37 Yes Travel_Rarely 1373 Research & Development
## 4 33 No Travel_Frequently 1392 Research & Development
## 5 27 No Travel_Rarely 591 Research & Development
## 6 32 No Travel_Frequently 1005 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 1 2 Life Sciences 1 1
## 2 8 1 Life Sciences 1 2
## 3 2 2 Other 1 4
## 4 3 4 Life Sciences 1 5
## 5 2 1 Medical 1 7
## 6 2 2 Life Sciences 1 8
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Female 94 3 2
## 2 3 Male 61 2 2
## 3 4 Male 92 2 1
## 4 4 Female 56 3 1
## 5 1 Male 40 3 1
## 6 4 Male 79 3 1
## JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## 1 Sales Executive 4 Single 5993 19479
## 2 Research Scientist 2 Married 5130 24907
## 3 Laboratory Technician 3 Single 2090 2396
## 4 Research Scientist 3 Married 2909 23159
## 5 Laboratory Technician 2 Married 3468 16632
## 6 Laboratory Technician 4 Single 3068 11864
## NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating
## 1 8 Y Yes 11 3
## 2 1 Y No 23 4
## 3 6 Y Yes 15 3
## 4 1 Y Yes 11 3
## 5 9 Y No 12 3
## 6 0 Y No 13 3
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## 1 1 80 0 8
## 2 4 80 1 10
## 3 2 80 0 7
## 4 3 80 0 8
## 5 4 80 1 6
## 6 3 80 0 8
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 1 0 1 6 4
## 2 3 3 10 7
## 3 3 3 0 0
## 4 3 3 8 7
## 5 3 3 2 2
## 6 2 2 7 7
## YearsSinceLastPromotion YearsWithCurrManager
## 1 0 5
## 2 1 7
## 3 0 0
## 4 3 0
## 5 2 2
## 6 3 6
# Check structure and summary
# → Understand data types, detect categorical/numeric variables, and get an overview of distributions.
str(attrition)
## 'data.frame': 1470 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr "Female" "Male" "Male" "Female" ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
summary(attrition)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1470 Length:1470 Min. : 102.0
## 1st Qu.:30.00 Class :character Class :character 1st Qu.: 465.0
## Median :36.00 Mode :character Mode :character Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1470 Min. : 1.000 Min. :1.000 Length:1470
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1.0 Min. :1.000 Length:1470
## 1st Qu.:1 1st Qu.: 491.2 1st Qu.:2.000 Class :character
## Median :1 Median :1020.5 Median :3.000 Mode :character
## Mean :1 Mean :1024.9 Mean :2.722
## 3rd Qu.:1 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :1 Max. :2068.0 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1470
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.89 Mean :2.73 Mean :2.064
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1470 Min. : 1009 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 2911 1st Qu.: 8047
## Median :3.000 Mode :character Median : 4919 Median :14236
## Mean :2.729 Mean : 6503 Mean :14313
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462
## Max. :4.000 Max. :19999 Max. :26999
## NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. :0.000 Length:1470 Length:1470 Min. :11.00
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:12.00
## Median :2.000 Mode :character Mode :character Median :14.00
## Mean :2.693 Mean :15.21
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.154 Mean :2.712 Mean :80 Mean :0.7939
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
skim(attrition)
| Name | attrition |
| Number of rows | 1470 |
| Number of columns | 35 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Attrition | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| BusinessTravel | 0 | 1 | 10 | 17 | 0 | 3 | 0 |
| Department | 0 | 1 | 5 | 22 | 0 | 3 | 0 |
| EducationField | 0 | 1 | 5 | 16 | 0 | 6 | 0 |
| Gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| JobRole | 0 | 1 | 7 | 25 | 0 | 9 | 0 |
| MaritalStatus | 0 | 1 | 6 | 8 | 0 | 3 | 0 |
| Over18 | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| OverTime | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 36.92 | 9.14 | 18 | 30.00 | 36.0 | 43.00 | 60 | ▂▇▇▃▂ |
| DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 465.00 | 802.0 | 1157.00 | 1499 | ▇▇▇▇▇ |
| DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 2.00 | 7.0 | 14.00 | 29 | ▇▅▂▂▂ |
| Education | 0 | 1 | 2.91 | 1.02 | 1 | 2.00 | 3.0 | 4.00 | 5 | ▂▃▇▆▁ |
| EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1.00 | 1.0 | 1.00 | 1 | ▁▁▇▁▁ |
| EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 491.25 | 1020.5 | 1555.75 | 2068 | ▇▇▇▇▇ |
| EnvironmentSatisfaction | 0 | 1 | 2.72 | 1.09 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
| HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 48.00 | 66.0 | 83.75 | 100 | ▇▇▇▇▇ |
| JobInvolvement | 0 | 1 | 2.73 | 0.71 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▁ |
| JobLevel | 0 | 1 | 2.06 | 1.11 | 1 | 1.00 | 2.0 | 3.00 | 5 | ▇▇▃▂▁ |
| JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
| MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 2911.00 | 4919.0 | 8379.00 | 19999 | ▇▅▂▁▂ |
| MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 8047.00 | 14235.5 | 20461.50 | 26999 | ▇▇▇▇▇ |
| NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 1.00 | 2.0 | 4.00 | 9 | ▇▃▂▂▁ |
| PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 12.00 | 14.0 | 18.00 | 25 | ▇▅▃▂▁ |
| PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 3.00 | 3.0 | 3.00 | 4 | ▇▁▁▁▂ |
| RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
| StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80.00 | 80.0 | 80.00 | 80 | ▁▁▇▁▁ |
| StockOptionLevel | 0 | 1 | 0.79 | 0.85 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▂▁ |
| TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 6.00 | 10.0 | 15.00 | 40 | ▇▇▂▁▁ |
| TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 2.00 | 3.0 | 3.00 | 6 | ▂▇▇▂▃ |
| WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▂ |
| YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 3.00 | 5.0 | 9.00 | 40 | ▇▂▁▁▁ |
| YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 2.00 | 3.0 | 7.00 | 18 | ▇▃▂▁▁ |
| YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 0.00 | 1.0 | 3.00 | 15 | ▇▁▁▁▁ |
| YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 2.00 | 3.0 | 7.00 | 17 | ▇▂▅▁▁ |
# Check for missing values and duplicates
# → Ensure data completeness and integrity before analysis.
colSums(is.na(attrition))
## Age Attrition BusinessTravel
## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EmployeeCount
## 0 0 0
## EmployeeNumber EnvironmentSatisfaction Gender
## 0 0 0
## HourlyRate JobInvolvement JobLevel
## 0 0 0
## JobRole JobSatisfaction MaritalStatus
## 0 0 0
## MonthlyIncome MonthlyRate NumCompaniesWorked
## 0 0 0
## Over18 OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StandardHours
## 0 0 0
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 0 0 0
## YearsSinceLastPromotion YearsWithCurrManager
## 0 0
# Visualize missing data
plot_missing(attrition)
# Check for duplicate rows
sum(duplicated(attrition))
## [1] 0
# Convert categorical variables to factors
# → Required for correct handling in models and visualizations.
attrition <- attrition %>%
mutate(
Attrition = as.factor(Attrition),
BusinessTravel = as.factor(BusinessTravel),
Department = as.factor(Department),
EducationField = as.factor(EducationField),
Gender = as.factor(Gender),
JobRole = as.factor(JobRole),
MaritalStatus = as.factor(MaritalStatus),
OverTime = as.factor(OverTime),
Over18 = as.factor(Over18)
)
str(attrition) # Re-check structure after conversion
## 'data.frame': 1470 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
# Generate summary statistics grouped by Attrition
# → Compare means and medians of numeric features between employees who left and stayed.
attrition %>%
group_by(Attrition) %>%
summarise(across(where(is.numeric), list(mean = mean, median = median), .names = "{.col}_{.fn}"))
## # A tibble: 2 × 53
## Attrition Age_mean Age_median DailyRate_mean DailyRate_median
## <fct> <dbl> <int> <dbl> <int>
## 1 No 37.6 36 813. 817
## 2 Yes 33.6 32 750. 699
## # ℹ 48 more variables: DistanceFromHome_mean <dbl>,
## # DistanceFromHome_median <int>, Education_mean <dbl>,
## # Education_median <int>, EmployeeCount_mean <dbl>,
## # EmployeeCount_median <int>, EmployeeNumber_mean <dbl>,
## # EmployeeNumber_median <int>, EnvironmentSatisfaction_mean <dbl>,
## # EnvironmentSatisfaction_median <int>, HourlyRate_mean <dbl>,
## # HourlyRate_median <int>, JobInvolvement_mean <dbl>, …
# Visualize overall attrition distribution
# → Understand class imbalance in the target variable.
ggplot(attrition, aes(x = Attrition, fill = Attrition)) +
geom_bar() +
theme_minimal() +
labs(title = "Attrition Distribution", y = "Count")
## Correlation Analysis (Numerical)
# Analyze correlations among numeric variables
# → Identify multicollinearity and spot influential numeric predictors.
numeric_vars <- attrition %>% select(where(is.numeric))
# Correlation matrix
cor_matrix <- cor(numeric_vars)
## Warning in cor(numeric_vars): the standard deviation is zero
# Visualize correlation
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)
# Visualize distributions and relationships between features and Attrition
# → Explore potential predictors and patterns.
# Age distribution (Histogram)
# → Understand overall age trends. Histogram is used to show frequency distribution of continuous variable (Age).
ggplot(attrition, aes(x = Age)) +
geom_histogram(fill = "steelblue", bins = 30) +
theme_minimal() +
labs(title = "Distribution of Age")
# Monthly income by Attrition (Boxplot)
# → Compare income distribution between employees who stayed vs left. Boxplot shows medians and outliers clearly.
ggplot(attrition, aes(x = Attrition, y = MonthlyIncome, fill = Attrition)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Monthly Income by Attrition")
# Job Role vs Attrition (Proportional Bar Chart)
# → Identify which job roles have higher attrition. Proportional bar chart (position = "fill") is ideal for comparing proportions within categories.
ggplot(attrition, aes(x = JobRole, fill = Attrition)) +
geom_bar(position = "fill") +
coord_flip() +
theme_minimal() +
labs(title = "Attrition Rate by Job Role", y = "Proportion")
# Overtime vs Attrition (Proportional Bar Chart)
# → Analyze how working overtime affects attrition. Proportional bar chart is used to compare ratios across groups.
ggplot(attrition, aes(x = OverTime, fill = Attrition)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Attrition Rate by Overtime", y = "Proportion")
# Gender vs Attrition (Proportional Bar Chart)
# → See if attrition varies by gender. Bar chart with proportion helps normalize different group sizes.
ggplot(attrition, aes(x = Gender, fill = Attrition)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Attrition Rate by Gender", y = "Proportion")
# Marital Status vs Attrition (Proportional Bar Chart)
# → Assess marital status influence on attrition. Again, proportion bar chart allows fair comparison across groups.
ggplot(attrition, aes(x = MaritalStatus, fill = Attrition)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Attrition Rate by Marital Status", y = "Proportion")
# Age by Attrition (Boxplot)
# → Compare age ranges across attrition groups. Boxplot efficiently shows medians, spread, and outliers.
ggplot(attrition, aes(x = Attrition, y = Age, fill = Attrition)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Age Distribution by Attrition")
# Years at Company by Attrition (Histogram with overlay)
# → Examine tenure patterns among employees. Histogram shows frequency, with transparency (alpha) to allow group comparison.
ggplot(attrition, aes(x = YearsAtCompany, fill = Attrition)) +
geom_histogram(position = "identity", alpha = 0.6, bins = 20) +
theme_minimal() +
labs(title = "Years at Company by Attrition")
# Job Satisfaction vs Attrition (Stacked Bar Chart)
# → Explore satisfaction levels related to attrition. Categorical x-axis, so bar chart is suitable. Proportions show attrition rate by rating.
ggplot(attrition, aes(x = factor(JobSatisfaction), fill = Attrition)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Attrition by Job Satisfaction", x = "Job Satisfaction (1–4)", y = "Proportion")
# Business Travel vs Attrition (Proportional Bar Chart)
# → Determine if frequent travel affects attrition. Bar chart visualizes proportion of leavers in each travel group.
ggplot(attrition, aes(x = BusinessTravel, fill = Attrition)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Attrition Rate by Business Travel", y = "Proportion")
# Total Working Years vs Monthly Income by Attrition (Scatter Plot)
# → Investigate relationship between experience and income, and how it differs for attrition status. Scatter plot best for continuous vs continuous.
ggplot(attrition, aes(x = TotalWorkingYears, y = MonthlyIncome, color = Attrition)) +
geom_point(alpha = 0.6) +
theme_minimal() +
labs(title = "Total Working Years vs. Monthly Income by Attrition")
# Remove non-information columns
attrition <- subset(attrition, select = -c(EmployeeCount, EmployeeNumber, Over18, StandardHours))
# Scaling numerical features
numeric_cols <- sapply(attrition, is.numeric)
hr_scaled <- attrition
hr_scaled[numeric_cols] <- scale(attrition[numeric_cols])
# Encode categorical features
dummies <- dummyVars(~ ., data = attrition)
hr_dummy <- predict(dummies, newdata = attrition)
hr_scaled <- as.data.frame(scale(hr_dummy))
# Train-test split
split <- createDataPartition(hr_scaled$MonthlyIncome, p = 0.8, list = FALSE)
train <- hr_scaled[split, ]
test <- hr_scaled[-split, ]
model_lm <- train(MonthlyIncome ~ ., data = train, method = "lm")
model_rf <- train(MonthlyIncome ~ ., data = train, method = "rf", trControl = trainControl(method = "cv", number = 5))
model_gbm <- train(MonthlyIncome ~ ., data = train, method = "gbm", verbose = FALSE, trControl = trainControl(method = "cv", number = 5))
pred_lm <- predict(model_lm, newdata = test)
pred_rf <- predict(model_rf, newdata = test)
pred_gbm <- predict(model_gbm, newdata = test)
postResample(pred_lm, obs = test$MonthlyIncome)
## RMSE Rsquared MAE
## 0.2329686 0.9484436 0.1823978
postResample(pred_rf, obs = test$MonthlyIncome)
## RMSE Rsquared MAE
## 0.2057515 0.9600827 0.1600847
postResample(pred_gbm, obs = test$MonthlyIncome)
## RMSE Rsquared MAE
## 0.2159831 0.9565028 0.1694184
ggplot(data.frame(Actual = test$MonthlyIncome, Predicted = pred_rf), aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_abline(color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Predicted vs Actual Monthly Income", x = "Actual", y = "Predicted")
varImp(model_rf)
## rf variable importance
##
## only 20 most important variables shown (out of 52)
##
## Overall
## JobLevel 100.0000
## `JobRole.Research Director` 1.2232
## TotalWorkingYears 1.0135
## DailyRate 0.4945
## HourlyRate 0.4451
## JobRole.Manager 0.4069
## MonthlyRate 0.4067
## `JobRole.Laboratory Technician` 0.3971
## DistanceFromHome 0.2575
## `JobRole.Sales Executive` 0.2528
## Age 0.2485
## PercentSalaryHike 0.2179
## YearsInCurrentRole 0.1875
## `JobRole.Healthcare Representative` 0.1870
## YearsAtCompany 0.1745
## `JobRole.Manufacturing Director` 0.1690
## NumCompaniesWorked 0.1641
## YearsWithCurrManager 0.1608
## Education 0.1463
## YearsSinceLastPromotion 0.1372
test_results <- data.frame(Actual = test$MonthlyIncome, Predicted = pred_rf)
test_results$Difference <- test_results$Predicted - test_results$Actual
# Show top 10 underpaid employees
head(arrange(test_results, Difference), 10)
## Actual Predicted Difference
## 213 0.73876394 -0.10221040 -0.8409743
## 1327 0.72304587 -0.02925755 -0.7523034
## 512 0.49789512 -0.17935339 -0.6772485
## 1449 0.51722410 -0.05330837 -0.5705325
## 503 0.39891375 -0.16199045 -0.5609042
## 817 0.05927597 -0.44773452 -0.5070105
## 1467 0.74088800 0.33371028 -0.4071777
## 151 -0.19072632 -0.58619350 -0.3954672
## 1443 -0.36447473 -0.73526274 -0.3707880
## 221 -0.12509276 -0.49496717 -0.3698744
This clustering analysis aims to uncover distinct patterns and groupings within the HR dataset. Since the dataset contains multiple continuous variables without predefined labels, clustering provides an effective way to segment observations into meaningful groups based on their similarities. To ensure robust and reliable findings, three different clustering algorithms are applied and compared:
K-Means is a fast and widely used clustering method that partitions data into k clusters by minimizing within-cluster variance. In this analysis, we removed any pre-existing cluster labels to avoid contamination, applied K-Means with 3 clusters and 25 random starts for stability, and added the resulting labels back to the dataset for profiling. The clusters were then visualized using the first two principal components to aid interpretation.
# Copy the scaled data to avoid issues
hr_input <- hr_scaled[, !names(hr_scaled) %in% "Cluster"]
# Run k-means on clean numeric data
kmeans_result <- kmeans(hr_input, centers = 3, nstart = 25)
# Add cluster labels to the original data
hr_scaled$Cluster <- as.factor(kmeans_result$cluster)
# Visualize
fviz_cluster(kmeans_result, data = hr_input, geom = "point", ellipse.type = "convex",
palette = "jco", ggtheme = theme_minimal())
Hierarchical clustering builds a tree-like structure of nested clusters using Ward’s linkage method, which minimizes within-cluster variance. After computing the distance matrix, a dendrogram is plotted to visualize the grouping structure. The tree is then cut to form 3 clusters, which are assigned to the data and visualized for interpretation.
# Compute distance and linkage
dist_matrix <- dist(hr_input)
hc_result <- hclust(dist_matrix, method = "ward.D2")
# Dendrogram
plot(hc_result, labels = FALSE, hang = -1, main = "Hierarchical Clustering Dendrogram")
# Cut into 3 clusters
hr_hclust <- hr_scaled
hr_hclust$Cluster <- as.factor(cutree(hc_result, k = 3))
# Visualize
fviz_cluster(list(data = hr_input, cluster = hr_hclust$Cluster),
geom = "point", ellipse.type = "convex", palette = "jco", ggtheme = theme_minimal())
PAM (Partitioning Around Medoids) is a robust clustering method that, unlike K-Means, uses medoids, making it less sensitive to outliers and better suited for datasets with noise or non-spherical shapes. In this analysis, PAM was applied with 3 clusters, and the resulting labels were added to the dataset and visualized for interpretation.
pam_result <- pam(hr_scaled, k = 3)
hr_pam <- hr_scaled
hr_pam$Cluster <- as.factor(pam_result$clustering)
fviz_cluster(pam_result, geom = "point", ellipse.type = "convex",
palette = "jco", ggtheme = theme_minimal())
Based on visual comparison, we selected K-Means Clusters as the most interpretable clustering solution due to its clear separation and minimal overlap between clusters. Cluster profiling was then performed by computing feature means within each cluster and identifying the top 30 features with the highest variance across groups. These key differentiators were visualized using bar plots to clearly highlight distinctions between clusters.
# KMeans
kmeans_summary <- aggregate(hr_scaled[, -which(names(hr_scaled) == "Cluster")],
by = list(Cluster = hr_scaled$Cluster), FUN = mean)
# Compute variance of mean feature values across clusters
feature_variances <- apply(kmeans_summary[,-1], 2, var)
# Select top 30 (10/10/10) features
features1 <- names(sort(feature_variances, decreasing = TRUE))[1:10]
features2 <- names(sort(feature_variances, decreasing = TRUE))[11:20]
features3 <- names(sort(feature_variances, decreasing = TRUE))[21:30]
melted_summary <- melt(kmeans_summary, id.vars = "Cluster")
melted1 <- melted_summary[melted_summary$variable %in% features1, ]
ggplot(melted1, aes(x = variable, y = value, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "First 10 Discriminative Features by Cluster",
x = "Feature", y = "Mean (scaled)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
melted2 <- melted_summary[melted_summary$variable %in% features2, ]
ggplot(melted2, aes(x = variable, y = value, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Second 10 Discriminative Features by Cluster",
x = "Feature", y = "Mean (scaled)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
melted3 <- melted_summary[melted_summary$variable %in% features3, ]
ggplot(melted3, aes(x = variable, y = value, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Third 10 Discriminative Features by Cluster",
x = "Feature", y = "Mean (scaled)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Key Findings
Cluster 1 (Red) Likely early-career employees in Sales or Research roles with lower tenure and income, potentially at risk of attrition.
Cluster 2 (Green) Likely mid-level sales/marketing professionals with moderate experience and lower attrition rates.
Cluster 3 (Blue) Senior professionals/managers, highly experienced, with strong tenure and income, and least likely to leave.
The clustering results reveal that tenure and income are key factors distinguishing employee groups. Attrition likelihood is strongly linked to both experience level and departmental placement, with job roles reflecting the employees’ career stages. Overall, Cluster 3 consists of the most experienced and stable employees, while Cluster 1 is characterized by junior staff who may be more at risk of leaving the organization.