library(MASS)
library(tidyr)
library(dplyr)
library(caret)
library(lattice)
library(ggplot2)
library(gam)
library(readr)
library(ROCR)
library(readxl)
library(e1071)
HR_dataset <- read.csv("Cleaned_HR_Data_Analysis.csv")
str(HR_dataset)
## 'data.frame': 2845 obs. of 28 variables:
## $ Employee.ID : int 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 ...
## $ StartDate : chr "20-Sep-19" "11-Feb-23" "10-Dec-18" "21-Jun-21" ...
## $ Title : chr "Production Technician I" "Production Technician I" "Area Sales Manager" "Area Sales Manager" ...
## $ BusinessUnit : chr "CCDR" "EW" "PL" "CCDR" ...
## $ EmployeeStatus : chr "Active" "Active" "Active" "Active" ...
## $ EmployeeType : chr "Contract" "Contract" "Full-Time" "Contract" ...
## $ PayZone : chr "Zone C" "Zone A" "Zone B" "Zone A" ...
## $ EmployeeClassificationType: chr "Temporary" "Part-Time" "Part-Time" "Full-Time" ...
## $ DepartmentType : chr "Production " "Production " "Sales" "Sales" ...
## $ Division : chr "Finance & Accounting" "Aerial" "General - Sga" "Finance & Accounting" ...
## $ DOB : chr "07-10-1969" "30-08-1965" "06-10-1991" "04-04-1998" ...
## $ State : chr "MA" "MA" "MA" "ND" ...
## $ GenderCode : chr "Female" "Male" "Male" "Male" ...
## $ RaceDesc : chr "White" "Hispanic" "Hispanic" "Other" ...
## $ MaritalDesc : chr "Widowed" "Widowed" "Widowed" "Single" ...
## $ Performance.Score : chr "Fully Meets" "Fully Meets" "Fully Meets" "Fully Meets" ...
## $ Current.Employee.Rating : int 4 3 4 2 3 3 4 2 3 5 ...
## $ Survey.Date : chr "14-01-2023" "09-09-2022" "27-05-2023" "16-06-2023" ...
## $ Engagement.Score : int 1 2 1 5 2 2 1 1 4 5 ...
## $ Satisfaction.Score : int 2 1 2 5 5 3 5 4 3 5 ...
## $ Work.Life.Balance.Score : int 3 5 1 4 3 5 2 2 3 2 ...
## $ Training.Date : chr "15-Jul-23" "12-Sep-22" "13-Aug-22" "15-Dec-22" ...
## $ Training.Program.Name : chr "Leadership Development" "Customer Service" "Leadership Development" "Project Management" ...
## $ Training.Type : chr "Internal" "External" "External" "External" ...
## $ Training.Outcome : chr "Failed" "Incomplete" "Failed" "Completed" ...
## $ Training.Duration.Days. : int 2 4 2 3 5 2 5 3 3 5 ...
## $ Training.Cost : num 606 673 413 664 399 ...
## $ Age : int 50 58 27 23 50 71 80 63 44 73 ...
summary(HR_dataset)
## Employee.ID StartDate Title BusinessUnit
## Min. :1001 Length:2845 Length:2845 Length:2845
## 1st Qu.:1736 Class :character Class :character Class :character
## Median :2456 Mode :character Mode :character Mode :character
## Mean :2471
## 3rd Qu.:3197
## Max. :4000
## EmployeeStatus EmployeeType PayZone
## Length:2845 Length:2845 Length:2845
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## EmployeeClassificationType DepartmentType Division
## Length:2845 Length:2845 Length:2845
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## DOB State GenderCode RaceDesc
## Length:2845 Length:2845 Length:2845 Length:2845
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## MaritalDesc Performance.Score Current.Employee.Rating
## Length:2845 Length:2845 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :3.000
## Mean :2.975
## 3rd Qu.:3.000
## Max. :5.000
## Survey.Date Engagement.Score Satisfaction.Score Work.Life.Balance.Score
## Length:2845 Min. :1.000 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :3.000 Median :3.000 Median :3.000
## Mean :2.942 Mean :3.028 Mean :2.989
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Training.Date Training.Program.Name Training.Type Training.Outcome
## Length:2845 Length:2845 Length:2845 Length:2845
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Training.Duration.Days. Training.Cost Age
## Min. :1.000 Min. : 100.0 Min. :17.00
## 1st Qu.:2.000 1st Qu.: 328.1 1st Qu.:34.00
## Median :3.000 Median : 571.8 Median :49.00
## Mean :2.974 Mean : 559.3 Mean :49.45
## 3rd Qu.:4.000 3rd Qu.: 788.3 3rd Qu.:65.00
## Max. :5.000 Max. :1000.0 Max. :82.00
# Check for missing values
colSums(is.na(HR_dataset))
## Employee.ID StartDate
## 0 0
## Title BusinessUnit
## 0 0
## EmployeeStatus EmployeeType
## 0 0
## PayZone EmployeeClassificationType
## 0 0
## DepartmentType Division
## 0 0
## DOB State
## 0 0
## GenderCode RaceDesc
## 0 0
## MaritalDesc Performance.Score
## 0 0
## Current.Employee.Rating Survey.Date
## 0 0
## Engagement.Score Satisfaction.Score
## 0 0
## Work.Life.Balance.Score Training.Date
## 0 0
## Training.Program.Name Training.Type
## 0 0
## Training.Outcome Training.Duration.Days.
## 0 0
## Training.Cost Age
## 0 0
This is already a cleaned dataset, but we double-check to confirm there are no missing observations.
levels(as.factor(HR_dataset$Title))
## [1] "Accountant I" "Administrative Assistant"
## [3] "Area Sales Manager" "BI Developer"
## [5] "BI Director" "CIO"
## [7] "Data Analyst" "Data Analyst "
## [9] "Data Architect" "Database Administrator"
## [11] "Director of Operations" "Director of Sales"
## [13] "Enterprise Architect" "IT Director"
## [15] "IT Manager - DB" "IT Manager - Infra"
## [17] "IT Manager - Support" "IT Support"
## [19] "Network Engineer" "President & CEO"
## [21] "Principal Data Architect" "Production Manager"
## [23] "Production Technician I" "Production Technician II"
## [25] "Sales Manager" "Senior BI Developer"
## [27] "Shared Services Manager" "Software Engineer"
## [29] "Software Engineering Manager" "Sr. Accountant"
## [31] "Sr. DBA" "Sr. Network Engineer"
levels(as.factor(HR_dataset$BusinessUnit))
## [1] "BPC" "CCDR" "EW" "MSC" "NEL" "PL" "PYZ" "SVG" "TNS" "WBL"
levels(as.factor(HR_dataset$EmployeeStatus))
## [1] "Active" "Terminated"
Only two levels, so EmployeeStatus can serve as a binary outcome.
levels(as.factor(HR_dataset$EmployeeType))
## [1] "Contract" "Full-Time" "Part-Time"
levels(as.factor(HR_dataset$PayZone))
## [1] "Zone A" "Zone B" "Zone C"
levels(as.factor(HR_dataset$EmployeeClassificationType))
## [1] "Full-Time" "Part-Time" "Temporary"
levels(as.factor(HR_dataset$DepartmentType))
## [1] "Admin Offices" "Executive Office" "IT/IS"
## [4] "Production " "Sales" "Software Engineering"
Note the level next to "Sales": "Production " carries a trailing space (as does the "Data Analyst " title above).
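A quick fix would be to trim the whitespace before factoring (shown commented out, since the analysis below keeps the raw values):
#HR_dataset$DepartmentType <- trimws(HR_dataset$DepartmentType)
#HR_dataset$Title <- trimws(HR_dataset$Title)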
levels(as.factor(HR_dataset$Division))
## [1] "Aerial" "Billable Consultants"
## [3] "Catv" "Corp Operations"
## [5] "Engineers" "Executive"
## [7] "Field Operations" "Fielders"
## [9] "Finance & Accounting" "General - Con"
## [11] "General - Eng" "General - Sga"
## [13] "Isp" "People Services"
## [15] "Project Management - Con" "Project Management - Eng"
## [17] "Safety" "Sales & Marketing"
## [19] "Shop (Fleet)" "Splicing"
## [21] "Technology / It" "Underground"
## [23] "Wireless" "Wireline Construction"
## [25] "Yard (Material Handling)"
levels(as.factor(HR_dataset$State))
## [1] "AL" "AZ" "CA" "CO" "CT" "FL" "GA" "ID" "IN" "KY" "MA" "ME" "MT" "NC" "ND"
## [16] "NH" "NV" "NY" "OH" "OR" "PA" "RI" "TN" "TX" "UT" "VA" "VT" "WA"
Only 28 states are represented.
levels(as.factor(HR_dataset$GenderCode))
## [1] "Female" "Male"
Only two levels, so this can be binary-coded.
levels(as.factor(HR_dataset$RaceDesc))
## [1] "Asian" "Black" "Hispanic" "Other" "White"
levels(as.factor(HR_dataset$MaritalDesc))
## [1] "Divorced" "Married" "Single" "Widowed"
levels(as.factor(HR_dataset$Performance.Score))
## [1] "Exceeds" "Fully Meets" "Needs Improvement"
## [4] "PIP"
levels(as.factor(HR_dataset$Training.Program.Name))
## [1] "Communication Skills" "Customer Service" "Leadership Development"
## [4] "Project Management" "Technical Skills"
levels(as.factor(HR_dataset$Training.Type))
## [1] "External" "Internal"
Only two levels, so this can be binary-coded as well.
levels(as.factor(HR_dataset$Training.Outcome))
## [1] "Completed" "Failed" "Incomplete" "Passed"
HR_dataset$EmployeeStatus <- as.factor(HR_dataset$EmployeeStatus)
HR_dataset$GenderCode <- as.factor(HR_dataset$GenderCode)
HR_dataset$Training.Type <- as.factor(HR_dataset$Training.Type)
HR_dataset$BusinessUnit <- as.factor(HR_dataset$BusinessUnit)
HR_dataset$EmployeeType <- as.factor(HR_dataset$EmployeeType)
HR_dataset$PayZone <- as.factor(HR_dataset$PayZone)
HR_dataset$EmployeeClassificationType <- as.factor(HR_dataset$EmployeeClassificationType)
HR_dataset$DepartmentType <- as.factor(HR_dataset$DepartmentType)
HR_dataset$Division <- as.factor(HR_dataset$Division)
HR_dataset$RaceDesc <- as.factor(HR_dataset$RaceDesc)
HR_dataset$MaritalDesc <- as.factor(HR_dataset$MaritalDesc)
HR_dataset$Performance.Score <- as.factor(HR_dataset$Performance.Score)
HR_dataset$Training.Program.Name <- as.factor(HR_dataset$Training.Program.Name)
HR_dataset$Training.Outcome <- as.factor(HR_dataset$Training.Outcome)
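An equivalent, more compact conversion using dplyr (a sketch; the explicit assignments above produce the same result):
# Convert all the categorical columns to factors in one call
factor_cols <- c("EmployeeStatus", "GenderCode", "Training.Type", "BusinessUnit",
                 "EmployeeType", "PayZone", "EmployeeClassificationType",
                 "DepartmentType", "Division", "RaceDesc", "MaritalDesc",
                 "Performance.Score", "Training.Program.Name", "Training.Outcome")
HR_dataset <- HR_dataset %>% mutate(across(all_of(factor_cols), as.factor))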
library(lubridate)
# Convert StartDate with month as characters (e.g., "06-Aug-2018")
HR_dataset$StartDate <- dmy(HR_dataset$StartDate)
# Convert DOB and Survey.Date and Training.Date
HR_dataset$DOB <- dmy(HR_dataset$DOB)
HR_dataset$Survey.Date <- dmy(HR_dataset$Survey.Date)
HR_dataset$Training.Date <- dmy(HR_dataset$Training.Date)
str(HR_dataset[, c("StartDate", "DOB", "Survey.Date", "Training.Date")])
## 'data.frame': 2845 obs. of 4 variables:
## $ StartDate : Date, format: "2019-09-20" "2023-02-11" ...
## $ DOB : Date, format: "1969-10-07" "1965-08-30" ...
## $ Survey.Date : Date, format: "2023-01-14" "2022-09-09" ...
## $ Training.Date: Date, format: "2023-07-15" "2022-09-12" ...
str(HR_dataset)
## 'data.frame': 2845 obs. of 28 variables:
## $ Employee.ID : int 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 ...
## $ StartDate : Date, format: "2019-09-20" "2023-02-11" ...
## $ Title : chr "Production Technician I" "Production Technician I" "Area Sales Manager" "Area Sales Manager" ...
## $ BusinessUnit : Factor w/ 10 levels "BPC","CCDR","EW",..: 2 3 6 2 9 1 10 2 5 1 ...
## $ EmployeeStatus : Factor w/ 2 levels "Active","Terminated": 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeType : Factor w/ 3 levels "Contract","Full-Time",..: 1 1 2 1 1 1 2 1 1 3 ...
## $ PayZone : Factor w/ 3 levels "Zone A","Zone B",..: 3 1 2 1 1 2 2 3 2 2 ...
## $ EmployeeClassificationType: Factor w/ 3 levels "Full-Time","Part-Time",..: 3 2 2 1 3 1 3 1 2 3 ...
## $ DepartmentType : Factor w/ 6 levels "Admin Offices",..: 4 4 5 5 5 5 5 5 5 5 ...
## $ Division : Factor w/ 25 levels "Aerial","Billable Consultants",..: 9 1 12 9 10 7 11 5 6 5 ...
## $ DOB : Date, format: "1969-10-07" "1965-08-30" ...
## $ State : chr "MA" "MA" "MA" "ND" ...
## $ GenderCode : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 1 1 2 2 ...
## $ RaceDesc : Factor w/ 5 levels "Asian","Black",..: 5 3 3 4 4 2 3 5 2 1 ...
## $ MaritalDesc : Factor w/ 4 levels "Divorced","Married",..: 4 4 4 3 2 2 1 1 4 4 ...
## $ Performance.Score : Factor w/ 4 levels "Exceeds","Fully Meets",..: 2 2 2 2 2 2 1 2 1 2 ...
## $ Current.Employee.Rating : int 4 3 4 2 3 3 4 2 3 5 ...
## $ Survey.Date : Date, format: "2023-01-14" "2022-09-09" ...
## $ Engagement.Score : int 1 2 1 5 2 2 1 1 4 5 ...
## $ Satisfaction.Score : int 2 1 2 5 5 3 5 4 3 5 ...
## $ Work.Life.Balance.Score : int 3 5 1 4 3 5 2 2 3 2 ...
## $ Training.Date : Date, format: "2023-07-15" "2022-09-12" ...
## $ Training.Program.Name : Factor w/ 5 levels "Communication Skills",..: 3 2 3 4 5 4 2 3 2 3 ...
## $ Training.Type : Factor w/ 2 levels "External","Internal": 2 1 1 1 1 2 1 2 2 2 ...
## $ Training.Outcome : Factor w/ 4 levels "Completed","Failed",..: 2 3 2 1 2 4 4 4 4 2 ...
## $ Training.Duration.Days. : int 2 4 2 3 5 2 5 3 3 5 ...
## $ Training.Cost : num 606 673 413 664 399 ...
## $ Age : int 50 58 27 23 50 71 80 63 44 73 ...
We don't necessarily have to change the factor levels from 1/2 to 0/1; R still treats them as categories internally, but for logistic regression they will need to be dummy-coded.
State doesn't need to be factored. We would more than likely drop it if it could skew the analysis, but it could be factored to create visuals.
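As a minimal illustration of the dummy coding (glm() performs this automatically for factor predictors; model.matrix() just makes the encoding explicit):
# Dummy-code a single factor; dropping the intercept column leaves
# a 0/1 indicator for Male (Female is the reference level)
gender_dummy <- model.matrix(~ GenderCode, data = HR_dataset)[, -1]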
# Select only numeric columns
numeric_columns <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_columns]
# Compute correlation matrix
corr_matrix <- cor(HR_numeric, use = "complete.obs")
# Print correlation matrix
print(corr_matrix)
## Employee.ID Current.Employee.Rating Engagement.Score
## Employee.ID 1.000000000 -0.01647101 0.014512208
## Current.Employee.Rating -0.016471015 1.00000000 0.022209785
## Engagement.Score 0.014512208 0.02220979 1.000000000
## Satisfaction.Score 0.018004462 -0.02461340 -0.004391166
## Work.Life.Balance.Score -0.020638867 0.02815133 0.025596264
## Training.Duration.Days. -0.003981504 0.00590126 0.002878806
## Training.Cost -0.019489386 0.01075746 0.020968191
## Age 0.004796900 -0.02011502 -0.033426978
## Satisfaction.Score Work.Life.Balance.Score
## Employee.ID 0.0180044621 -0.020638867
## Current.Employee.Rating -0.0246134010 0.028151331
## Engagement.Score -0.0043911665 0.025596264
## Satisfaction.Score 1.0000000000 -0.032766022
## Work.Life.Balance.Score -0.0327660215 1.000000000
## Training.Duration.Days. 0.0242579862 0.003022691
## Training.Cost 0.0006648727 0.008012574
## Age -0.0063905240 -0.011810926
## Training.Duration.Days. Training.Cost Age
## Employee.ID -0.003981504 -0.0194893862 0.004796900
## Current.Employee.Rating 0.005901260 0.0107574563 -0.020115019
## Engagement.Score 0.002878806 0.0209681914 -0.033426978
## Satisfaction.Score 0.024257986 0.0006648727 -0.006390524
## Work.Life.Balance.Score 0.003022691 0.0080125740 -0.011810926
## Training.Duration.Days. 1.000000000 -0.0158420517 0.003657019
## Training.Cost -0.015842052 1.0000000000 -0.001902620
## Age 0.003657019 -0.0019026200 1.000000000
# Select only numeric columns, excluding Employee ID
numeric_columns <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_columns]
# Remove Employee ID column if present
HR_numeric <- HR_numeric[, !colnames(HR_numeric) %in% c("Employee.ID")]
# Compute correlation matrix
corr_matrix <- cor(HR_numeric, use = "complete.obs")
# Print correlation matrix
print(corr_matrix)
## Current.Employee.Rating Engagement.Score
## Current.Employee.Rating 1.00000000 0.022209785
## Engagement.Score 0.02220979 1.000000000
## Satisfaction.Score -0.02461340 -0.004391166
## Work.Life.Balance.Score 0.02815133 0.025596264
## Training.Duration.Days. 0.00590126 0.002878806
## Training.Cost 0.01075746 0.020968191
## Age -0.02011502 -0.033426978
## Satisfaction.Score Work.Life.Balance.Score
## Current.Employee.Rating -0.0246134010 0.028151331
## Engagement.Score -0.0043911665 0.025596264
## Satisfaction.Score 1.0000000000 -0.032766022
## Work.Life.Balance.Score -0.0327660215 1.000000000
## Training.Duration.Days. 0.0242579862 0.003022691
## Training.Cost 0.0006648727 0.008012574
## Age -0.0063905240 -0.011810926
## Training.Duration.Days. Training.Cost Age
## Current.Employee.Rating 0.005901260 0.0107574563 -0.020115019
## Engagement.Score 0.002878806 0.0209681914 -0.033426978
## Satisfaction.Score 0.024257986 0.0006648727 -0.006390524
## Work.Life.Balance.Score 0.003022691 0.0080125740 -0.011810926
## Training.Duration.Days. 1.000000000 -0.0158420517 0.003657019
## Training.Cost -0.015842052 1.0000000000 -0.001902620
## Age 0.003657019 -0.0019026200 1.000000000
Next, redo this with the three new binary variables added (note: because the dummy-coding block below is left commented out, the recomputed matrix comes out unchanged).
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
ggcorrplot(corr_matrix, lab = TRUE, lab_size = 3, colors = c("blue", "white", "red"))
Initial analysis of the numeric variables alone shows that nothing correlates: every pairwise correlation is near zero, so none of these variables has a meaningful linear relationship with another.
#HR_dataset$GenderNumeric <- ifelse(HR_dataset$GenderCode == "Male", 1, 0)
#HR_dataset$EmployeeStatusNumeric <- ifelse(HR_dataset$EmployeeStatus == "Active", 1, 0)
#HR_dataset$TrainingTypeNumeric <- ifelse(HR_dataset$Training.Type == "Internal", 1, 0)
# Convert and bind new dummy variables to dataset
#dummy_vars <- model.matrix(~ BusinessUnit + EmployeeType + PayZone +
# EmployeeClassificationType + DepartmentType +
# Division + RaceDesc + MaritalDesc +
# Performance.Score + Training.Program.Name +
# Training.Outcome, data = HR_dataset)[, -1] # Remove intercept
# Combine with original dataset
#HR_dataset <- cbind(HR_dataset, dummy_vars)
# Select only numeric columns, excluding Employee ID
numeric_columns_2 <- sapply(HR_dataset, is.numeric)
HR_numeric_2 <- HR_dataset[, numeric_columns_2]
# Remove Employee ID column if present
HR_numeric_2 <- HR_numeric_2[, !colnames(HR_numeric_2) %in% c("Employee.ID")]
# Compute correlation matrix
corr_matrix_2 <- cor(HR_numeric_2, use = "complete.obs")
# Print correlation matrix
print(corr_matrix_2)
## Current.Employee.Rating Engagement.Score
## Current.Employee.Rating 1.00000000 0.022209785
## Engagement.Score 0.02220979 1.000000000
## Satisfaction.Score -0.02461340 -0.004391166
## Work.Life.Balance.Score 0.02815133 0.025596264
## Training.Duration.Days. 0.00590126 0.002878806
## Training.Cost 0.01075746 0.020968191
## Age -0.02011502 -0.033426978
## Satisfaction.Score Work.Life.Balance.Score
## Current.Employee.Rating -0.0246134010 0.028151331
## Engagement.Score -0.0043911665 0.025596264
## Satisfaction.Score 1.0000000000 -0.032766022
## Work.Life.Balance.Score -0.0327660215 1.000000000
## Training.Duration.Days. 0.0242579862 0.003022691
## Training.Cost 0.0006648727 0.008012574
## Age -0.0063905240 -0.011810926
## Training.Duration.Days. Training.Cost Age
## Current.Employee.Rating 0.005901260 0.0107574563 -0.020115019
## Engagement.Score 0.002878806 0.0209681914 -0.033426978
## Satisfaction.Score 0.024257986 0.0006648727 -0.006390524
## Work.Life.Balance.Score 0.003022691 0.0080125740 -0.011810926
## Training.Duration.Days. 1.000000000 -0.0158420517 0.003657019
## Training.Cost -0.015842052 1.0000000000 -0.001902620
## Age 0.003657019 -0.0019026200 1.000000000
ggplot(HR_dataset, aes(x = Age)) + geom_histogram(binwidth = 5, fill = "blue", color = "black")
ggplot(HR_dataset, aes(x = Engagement.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")
ggplot(HR_dataset, aes(x = Satisfaction.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")
ggplot(HR_dataset, aes(x = Work.Life.Balance.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")
ggplot(HR_dataset, aes(x = BusinessUnit)) +
geom_bar(fill = "forestgreen") +
geom_text(stat = "count", aes(label = ..count..), hjust = -0.1) +
coord_flip() +
theme_minimal() +
labs(title = "Employee Count by Business Unit", x = "Business Unit", y = "Count")
ggplot(HR_dataset, aes(x = Division)) +
geom_bar(fill = "steelblue") +
geom_text(stat = "count", aes(label = ..count..), hjust = -0.2) +
coord_flip() +
theme_minimal() +
labs(title = "Employee Count by Division", x = "Division", y = "Count")
ggplot(HR_dataset, aes(x = DepartmentType)) +
geom_bar(fill = "darkorange") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Employee Count by Department Type", x = "Department Type", y = "Count")
#install.packages("rcompanion")
library(tidyverse)
library(corrplot)
library(GGally)
#library(rcompanion)
library(vcd)
# ---- NUMERIC-ONLY CORRELATION MATRIX ----
numeric_cols <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_cols]
# Remove ID column if present
HR_numeric <- HR_numeric[, !colnames(HR_numeric) %in% c("Employee.ID")]
# Compute and plot correlation
cor_matrix <- cor(HR_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.8, tl.col = "black", addCoef.col = "black",
title = "Correlation Matrix for Numeric Variables", mar=c(0,0,1,0))
# ---- CATEGORICAL VS NUMERIC (ANOVA & Boxplots) ----
# Example: Gender vs Satisfaction.Score
ggplot(HR_dataset, aes(x = GenderCode, y = Satisfaction.Score)) +
geom_boxplot(fill = "skyblue") +
theme_minimal() +
labs(title = "Boxplot: Satisfaction by Gender")
# ANOVA example
anova_gender <- aov(Satisfaction.Score ~ GenderCode, data = HR_dataset)
summary(anova_gender)
## Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode 1 1 1.134 0.57 0.45
## Residuals 2843 5654 1.989
# ---- CATEGORICAL VS CATEGORICAL (Chi-square and Cramér’s V) ----
# Example: Gender vs Training Outcome
ctab <- table(HR_dataset$GenderCode, HR_dataset$Training.Outcome)
chisq_result <- chisq.test(ctab)
#cramerv_result <- cramerV(ctab)
result <- assocstats(ctab)
print("Chi-Square Test: Gender vs Training Outcome")
## [1] "Chi-Square Test: Gender vs Training Outcome"
print(chisq_result)
##
## Pearson's Chi-squared test
##
## data: ctab
## X-squared = 6.0666, df = 3, p-value = 0.1084
# Extract and print Cramér’s V specifically
cramer_v <- result$cramer
print(paste("Cramér’s V:", round(cramer_v, 3)))
## [1] "Cramér’s V: 0.046"
# Mosaic plot
mosaicplot(ctab, main="Mosaic: Gender vs Training Outcome", color=TRUE)
# ---- MIXED VARIABLE OVERVIEW (GGally ggpairs) ----
HR_subset <- HR_dataset %>%
select(Engagement.Score, Satisfaction.Score, GenderCode, Training.Outcome, Performance.Score)
GGally::ggpairs(HR_subset)
cat_vars <- c("GenderCode", "EmployeeStatus", "Training.Type", "BusinessUnit",
"EmployeeType", "PayZone", "EmployeeClassificationType")
num_outcomes <- c("Engagement.Score", "Satisfaction.Score",
"Work.Life.Balance.Score", "Current.Employee.Rating")
for (cat in cat_vars) {
for (num in num_outcomes) {
formula <- as.formula(paste(num, "~", cat))
model <- aov(formula, data = HR_dataset)
print(paste("ANOVA:", num, "vs", cat))
print(summary(model))
}
}
## [1] "ANOVA: Engagement.Score vs GenderCode"
## Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode 1 1 0.6493 0.315 0.575
## Residuals 2843 5858 2.0604
## [1] "ANOVA: Satisfaction.Score vs GenderCode"
## Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode 1 1 1.134 0.57 0.45
## Residuals 2843 5654 1.989
## [1] "ANOVA: Work.Life.Balance.Score vs GenderCode"
## Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode 1 1 0.6468 0.326 0.568
## Residuals 2843 5644 1.9852
## [1] "ANOVA: Current.Employee.Rating vs GenderCode"
## Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode 1 0.6 0.6399 0.624 0.43
## Residuals 2843 2915.5 1.0255
## [1] "ANOVA: Engagement.Score vs EmployeeStatus"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus 1 1 0.8063 0.391 0.532
## Residuals 2843 5858 2.0603
## [1] "ANOVA: Satisfaction.Score vs EmployeeStatus"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus 1 4 3.660 1.841 0.175
## Residuals 2843 5651 1.988
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeStatus"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus 1 4 3.830 1.93 0.165
## Residuals 2843 5641 1.984
## [1] "ANOVA: Current.Employee.Rating vs EmployeeStatus"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus 1 4.3 4.272 4.171 0.0412 *
## Residuals 2843 2911.9 1.024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Engagement.Score vs Training.Type"
## Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type 1 2 1.714 0.832 0.362
## Residuals 2843 5857 2.060
## [1] "ANOVA: Satisfaction.Score vs Training.Type"
## Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type 1 0 0.2212 0.111 0.739
## Residuals 2843 5654 1.9889
## [1] "ANOVA: Work.Life.Balance.Score vs Training.Type"
## Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type 1 0 0.1854 0.093 0.76
## Residuals 2843 5644 1.9854
## [1] "ANOVA: Current.Employee.Rating vs Training.Type"
## Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type 1 1.9 1.921 1.874 0.171
## Residuals 2843 2914.3 1.025
## [1] "ANOVA: Engagement.Score vs BusinessUnit"
## Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit 9 12 1.291 0.626 0.776
## Residuals 2835 5847 2.062
## [1] "ANOVA: Satisfaction.Score vs BusinessUnit"
## Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit 9 16 1.748 0.879 0.543
## Residuals 2835 5639 1.989
## [1] "ANOVA: Work.Life.Balance.Score vs BusinessUnit"
## Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit 9 25 2.795 1.41 0.178
## Residuals 2835 5620 1.982
## [1] "ANOVA: Current.Employee.Rating vs BusinessUnit"
## Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit 9 5.4 0.5956 0.58 0.815
## Residuals 2835 2910.8 1.0267
## [1] "ANOVA: Engagement.Score vs EmployeeType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType 2 1 0.3973 0.193 0.825
## Residuals 2842 5858 2.0611
## [1] "ANOVA: Satisfaction.Score vs EmployeeType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType 2 11 5.272 2.655 0.0705 .
## Residuals 2842 5644 1.986
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType 2 10 5.175 2.611 0.0737 .
## Residuals 2842 5634 1.983
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Current.Employee.Rating vs EmployeeType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType 2 1.2 0.5836 0.569 0.566
## Residuals 2842 2915.0 1.0257
## [1] "ANOVA: Engagement.Score vs PayZone"
## Df Sum Sq Mean Sq F value Pr(>F)
## PayZone 2 1 0.5468 0.265 0.767
## Residuals 2842 5857 2.0610
## [1] "ANOVA: Satisfaction.Score vs PayZone"
## Df Sum Sq Mean Sq F value Pr(>F)
## PayZone 2 2 0.9423 0.474 0.623
## Residuals 2842 5653 1.9890
## [1] "ANOVA: Work.Life.Balance.Score vs PayZone"
## Df Sum Sq Mean Sq F value Pr(>F)
## PayZone 2 0 0.1922 0.097 0.908
## Residuals 2842 5644 1.9860
## [1] "ANOVA: Current.Employee.Rating vs PayZone"
## Df Sum Sq Mean Sq F value Pr(>F)
## PayZone 2 1.7 0.8408 0.82 0.441
## Residuals 2842 2914.5 1.0255
## [1] "ANOVA: Engagement.Score vs EmployeeClassificationType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType 2 5 2.584 1.255 0.285
## Residuals 2842 5853 2.059
## [1] "ANOVA: Satisfaction.Score vs EmployeeClassificationType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType 2 16 7.890 3.977 0.0189 *
## Residuals 2842 5639 1.984
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeClassificationType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType 2 0 0.0752 0.038 0.963
## Residuals 2842 5645 1.9861
## [1] "ANOVA: Current.Employee.Rating vs EmployeeClassificationType"
## Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType 2 5.8 2.918 2.85 0.058 .
## Residuals 2842 2910.3 1.024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Variable | Significant on | p-value | Inference |
---|---|---|---|
EmployeeStatus | Current.Employee.Rating | 0.041 | There is a significant difference in performance ratings between Active and Terminated employees (the regression later in this analysis suggests terminated employees actually rated slightly higher). |
EmployeeType | Satisfaction.Score | 0.070 | Marginally significant: different employment types (e.g., full-time, contract) may experience different satisfaction levels. |
EmployeeType | Work.Life.Balance.Score | 0.074 | Marginal: employee type may slightly influence perceived work-life balance. |
EmployeeClassificationType | Satisfaction.Score | 0.019 | Employees with different contract durations (e.g., long-term vs. short-term) report significantly different satisfaction levels. |
EmployeeClassificationType | Current.Employee.Rating | 0.058 | Marginal: classification type may influence performance perception. |
Gender: No significant effect on engagement, satisfaction, work-life balance, or performance.
Training.Type: Internal vs. external training had no effect on any measured outcomes.
BusinessUnit: No notable differences in scores across business units.
PayZone: Salary band had no effect on engagement, satisfaction, etc.
Engagement Score: Did not vary significantly across any factor tested.
Performance Ratings do appear to be influenced by employment status and potentially by contract classifications — suggesting that employees with more stable or longer-term roles are rated higher.
Satisfaction Scores are more sensitive to contract structure (EmployeeClassificationType) and possibly employment type (e.g., full-time vs. contract).
Gender, Training Type, Pay Zone, and Business Unit do not significantly impact engagement, satisfaction, or performance — suggesting relative equality across these groups in how employees experience work.
Retention strategies should consider how contract types affect satisfaction and performance.
Training programs may not need to differ based on Training.Type alone — other factors may be more influential.
Focus performance management and engagement efforts more on structural or role-based factors than on demographics like gender or department.
library(vcd)
cat_pairs <- list(
c("GenderCode", "Training.Outcome"),
c("EmployeeStatus", "Training.Outcome"),
c("BusinessUnit", "Training.Outcome"),
c("EmployeeType", "Training.Outcome"),
c("EmployeeStatus", "Performance.Score"),
c("MaritalDesc", "EmployeeStatus"),
c("RaceDesc", "Training.Outcome")
)
for (pair in cat_pairs) {
var1 <- pair[1]
var2 <- pair[2]
tab <- table(HR_dataset[[var1]], HR_dataset[[var2]])
cat("\nChi-square:", var1, "vs", var2, "\n")
print(chisq.test(tab))
print(paste("Cramér’s V:", round(assocstats(tab)$cramer, 3)))
}
##
## Chi-square: GenderCode vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 6.0666, df = 3, p-value = 0.1084
##
## [1] "Cramér’s V: 0.046"
##
## Chi-square: EmployeeStatus vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 6.464, df = 3, p-value = 0.09109
##
## [1] "Cramér’s V: 0.048"
##
## Chi-square: BusinessUnit vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 31.702, df = 27, p-value = 0.2433
##
## [1] "Cramér’s V: 0.061"
##
## Chi-square: EmployeeType vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 12.464, df = 6, p-value = 0.05238
##
## [1] "Cramér’s V: 0.047"
##
## Chi-square: EmployeeStatus vs Performance.Score
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 5.4042, df = 3, p-value = 0.1445
##
## [1] "Cramér’s V: 0.044"
##
## Chi-square: MaritalDesc vs EmployeeStatus
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 1.2508, df = 3, p-value = 0.7408
##
## [1] "Cramér’s V: 0.021"
##
## Chi-square: RaceDesc vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 15.153, df = 12, p-value = 0.2332
##
## [1] "Cramér’s V: 0.042"
Here is a summary and interpretation of the Chi-square test and Cramér's V results, which explore associations between categorical variables such as gender, employment status, and training outcomes.
Overview: What This Analysis Does
Chi-square tests evaluate whether two categorical variables are statistically associated (i.e., not independent).
Cramér's V measures the strength of that association:
< 0.10 → very weak
0.10–0.30 → weak
> 0.30 → moderate to strong
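For reference, Cramér's V follows directly from the chi-square statistic; a manual sketch (using the ctab and chisq_result objects computed above) that should agree with what assocstats() reports:
# V = sqrt(X^2 / (n * (min(rows, cols) - 1)))
n <- sum(ctab)
V_manual <- sqrt(unname(chisq_result$statistic) / (n * (min(dim(ctab)) - 1)))
print(paste("Manual Cramér's V:", round(V_manual, 3)))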
Key Results and Interpretation:
Variable Pair | p-value | Cramér's V | Interpretation |
---|---|---|---|
Gender vs Training Outcome | 0.108 | 0.046 | No statistically significant association. Gender has a very weak relationship with training results. |
Employee Status vs Training Outcome | 0.091 | 0.048 | Borderline non-significant, with a very weak effect. Terminated and active employees may differ slightly in training outcomes. |
Business Unit vs Training Outcome | 0.243 | 0.061 | No significant difference across business units in training outcomes. Weak association. |
Employee Type vs Training Outcome | 0.052 | 0.047 | Borderline significant. Training outcomes may differ slightly by employment type (e.g., full-time, contract), but the effect is very weak. |
Employee Status vs Performance Score | 0.145 | 0.044 | No statistically significant association. Performance scores are largely independent of employment status (though see the earlier ANOVA, where ratings differed). |
Marital Status vs Employee Status | 0.741 | 0.021 | No relationship. Marital status does not appear to influence employment status. |
Race vs Training Outcome | 0.233 | 0.042 | No significant difference in training outcomes by race. The relationship is very weak. |
The next block uses all of the categorical variables and compares a broader set of pairs to each other.
cat_pairs_2 <- list(
c("GenderCode", "Training.Outcome"),
c("GenderCode", "EmployeeStatus"),
c("EmployeeStatus", "Training.Outcome"),
c("EmployeeStatus", "Performance.Score"),
c("EmployeeType", "Training.Outcome"),
c("EmployeeType", "EmployeeStatus"),
c("EmployeeClassificationType", "Training.Outcome"),
c("EmployeeClassificationType", "Performance.Score"),
c("DepartmentType", "Training.Outcome"),
c("Division", "Training.Outcome"),
c("PayZone", "EmployeeStatus"),
c("RaceDesc", "Training.Outcome"),
c("RaceDesc", "Performance.Score"),
c("MaritalDesc", "EmployeeStatus"),
c("Training.Type", "Training.Outcome")
)
for (pair in cat_pairs_2) {
var1 <- pair[1]
var2 <- pair[2]
tab <- table(HR_dataset[[var1]], HR_dataset[[var2]])
cat("\nChi-square:", var1, "vs", var2, "\n")
print(chisq.test(tab))
print(paste("Cramér’s V:", round(assocstats(tab)$cramer, 3)))
}
##
## Chi-square: GenderCode vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 6.0666, df = 3, p-value = 0.1084
##
## [1] "Cramér’s V: 0.046"
##
## Chi-square: GenderCode vs EmployeeStatus
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab
## X-squared = 3.7087, df = 1, p-value = 0.05413
##
## [1] "Cramér’s V: 0.037"
##
## Chi-square: EmployeeStatus vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 6.464, df = 3, p-value = 0.09109
##
## [1] "Cramér’s V: 0.048"
##
## Chi-square: EmployeeStatus vs Performance.Score
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 5.4042, df = 3, p-value = 0.1445
##
## [1] "Cramér’s V: 0.044"
##
## Chi-square: EmployeeType vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 12.464, df = 6, p-value = 0.05238
##
## [1] "Cramér’s V: 0.047"
##
## Chi-square: EmployeeType vs EmployeeStatus
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 1.7434, df = 2, p-value = 0.4182
##
## [1] "Cramér’s V: 0.025"
##
## Chi-square: EmployeeClassificationType vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 5.5969, df = 6, p-value = 0.4698
##
## [1] "Cramér’s V: 0.031"
##
## Chi-square: EmployeeClassificationType vs Performance.Score
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 1.0415, df = 6, p-value = 0.984
##
## [1] "Cramér’s V: 0.014"
##
## Chi-square: DepartmentType vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 18.503, df = 15, p-value = 0.2371
##
## [1] "Cramér’s V: 0.047"
##
## Chi-square: Division vs Training.Outcome
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 64.232, df = 72, p-value = 0.7311
##
## [1] "Cramér’s V: 0.087"
##
## Chi-square: PayZone vs EmployeeStatus
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 2.7739, df = 2, p-value = 0.2498
##
## [1] "Cramér’s V: 0.031"
##
## Chi-square: RaceDesc vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 15.153, df = 12, p-value = 0.2332
##
## [1] "Cramér’s V: 0.042"
##
## Chi-square: RaceDesc vs Performance.Score
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 17.44, df = 12, p-value = 0.1338
##
## [1] "Cramér’s V: 0.045"
##
## Chi-square: MaritalDesc vs EmployeeStatus
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 1.2508, df = 3, p-value = 0.7408
##
## [1] "Cramér’s V: 0.021"
##
## Chi-square: Training.Type vs Training.Outcome
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 5.0166, df = 3, p-value = 0.1706
##
## [1] "Cramér’s V: 0.042"
Key Results
Marginally Significant Results (p close to 0.05)
Variable Pair | p-value | Cramér's V | Interpretation |
---|---|---|---|
Gender vs EmployeeStatus | 0.054 | 0.037 | Just above the significance threshold. Gender may slightly affect employment status, but the effect size is very weak. |
EmployeeType vs Training.Outcome | 0.052 | 0.047 | Borderline. Training outcomes may differ slightly by employment type. |
EmployeeStatus vs Training.Outcome | 0.091 | 0.048 | Borderline. Employment status may have a small influence on training results. |
All of these have very low Cramér's V values, suggesting that even if statistical significance existed, the practical effect would be minimal.
Non-Significant Results (p > 0.05)
All remaining pairs show no statistically significant relationship. An example with a typical effect size:
Variable Pair | p-value | Cramér’s V | Interpretation |
---|---|---|---|
Gender vs Training.Outcome | 0.108 | 0.046 | No significant difference in training outcomes between genders. |
Across all of these, Cramér’s V is consistently under 0.10, meaning extremely weak associations — regardless of statistical significance.
Overall Inference: No strong or moderate associations were found between any pair of categorical variables.
A few pairs showed borderline significance (e.g., Gender vs EmployeeStatus, EmployeeType vs Training.Outcome), but effect sizes were minimal.
Most relationships — including those involving race, marital status, training type, division, and department — showed no significant differences in outcomes.
Strategic Takeaways: Categorical attributes like gender, race, marital status, and job division do not appear to meaningfully affect training or performance outcomes in this dataset.
Slight differences in outcomes based on employment type or classification suggest contractual status may have small but notable influence — a possible area for deeper HR review.
Training program equity appears strong — outcomes are not significantly influenced by demographic or structural employee categories.
Integrated Summary of HR Data Analysis
1. Employee Demographics and Distribution (Descriptive Insights)
The age distribution of employees is relatively even between 30 and 60 years, peaking around ages 55–60.
Younger employees (under 25) and older employees (over 70) are underrepresented.
Business units and departments are diverse, with some divisions clearly employing more staff than others — suggesting varied workforce distribution across company segments.
Significant or Marginal Findings:
Variable | Outcome | p-value | Interpretation |
---|---|---|---|
EmployeeStatus | Current.Employee.Rating | 0.041 | Terminated vs. active employees have significantly different performance ratings. |
EmployeeClassificationType | Satisfaction.Score | 0.019 | Different contract types influence employee satisfaction. |
EmployeeType | Satisfaction & Work-Life | ~0.07 | Marginally significant — different employee types (full-time, contract) may affect work-life experience and satisfaction. |
No Significant Impact: Gender, BusinessUnit, PayZone, and Training.Type did not significantly affect engagement, satisfaction, or performance.
Engagement Score, across all factors, was not significantly different.
Inference: Employee performance and satisfaction are more affected by employment structure than by demographic or departmental characteristics.
Marginal Associations Found:
Variable Pair | p-value | Cramér’s V | Interpretation |
---|---|---|---|
Gender vs EmployeeStatus | 0.054 | 0.037 | Very weak potential association. |
EmployeeType vs Training.Outcome | 0.052 | 0.047 | Slight difference in training outcomes across employment types. |
EmployeeStatus vs Training.Outcome | 0.091 | 0.048 | Small influence of employment status on training result. |
All Other Pairs: Showed no statistically significant association.
Cramér’s V values were < 0.10 across the board — indicating very weak relationships.
Examples: Race, Marital Status, PayZone, Training Type, and DepartmentType had no meaningful effect on training outcomes, employee status, or performance.
Inference: Categorical employee attributes (e.g., gender, race, department) are not strong drivers of training or employment outcomes in this dataset. Structural factors like job type and status matter more.
Overall Insights & Strategic Implications
The organization shows a fairly equitable environment with respect to gender, race, and department; these variables did not significantly influence training or performance outcomes.
Performance and satisfaction are most affected by employment structure — contract length, employee type, and employment status. This suggests:
Focus on contractual equity and better onboarding/training of short-term or contract employees.
Consider if employee classification impacts perceived support or growth.
Training outcomes appear consistent across demographics, which reflects well on program accessibility and fairness.
Significant Results
From the ANOVA and Chi-square analyses, we know:
EmployeeStatus significantly affects Current.Employee.Rating.
EmployeeClassificationType significantly affects Satisfaction.Score.
EmployeeType marginally affects Satisfaction and Work-Life Balance.
Training outcomes are slightly associated with EmployeeStatus and EmployeeType (borderline Chi-square p-values).
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
# Binary outcome for classification (if not already created)
HR_dataset$EmployeeStatusBin <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)
# Stratified sampling
set.seed(2025) # for reproducibility
split <- sample.split(HR_dataset$EmployeeStatusBin, SplitRatio = 0.7)
# Create training and testing sets
train_data <- subset(HR_dataset, split == TRUE)
test_data <- subset(HR_dataset, split == FALSE)
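As a sanity check on the stratified split, the class proportions in the training and testing sets should match the full dataset (a quick sketch):
# Proportion terminated (1) vs. active (0) in each partition
prop.table(table(train_data$EmployeeStatusBin))
prop.table(table(test_data$EmployeeStatusBin))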
Recommended Follow-Up Tests / Models
1. Logistic Regression: Predicting Employee Status (Terminated vs. Active)
Use it to model the probability of attrition, incorporating:
Satisfaction.Score
Current.Employee.Rating
EmployeeType
EmployeeClassificationType
Age
Training.Outcome (as a categorical predictor)
logit_model <- glm(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
data = train_data, family = "binomial")
summary(logit_model)
##
## Call:
## glm(formula = EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
## EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
## family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.65479 0.37383 -7.102 1.23e-12 ***
## Satisfaction.Score 0.08045 0.04695 1.713 0.0866 .
## Current.Employee.Rating 0.11441 0.06508 1.758 0.0788 .
## EmployeeTypeFull-Time 0.17907 0.15932 1.124 0.2610
## EmployeeTypePart-Time 0.06258 0.16767 0.373 0.7090
## EmployeeClassificationTypePart-Time 0.03878 0.16348 0.237 0.8125
## EmployeeClassificationTypeTemporary 0.12204 0.16025 0.762 0.4463
## Age -0.00223 0.00371 -0.601 0.5478
## Training.OutcomeFailed 0.21350 0.19265 1.108 0.2678
## Training.OutcomeIncomplete 0.16590 0.19054 0.871 0.3839
## Training.OutcomePassed 0.33261 0.18346 1.813 0.0698 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1584.5 on 1991 degrees of freedom
## Residual deviance: 1573.5 on 1981 degrees of freedom
## AIC: 1595.5
##
## Number of Fisher Scoring iterations: 4
Model Purpose
This logistic regression model predicts the probability that an employee is terminated (EmployeeStatusBin = 1) based on: Satisfaction Score, Performance Rating, Employee Type & Classification, Age, and Training Outcome.
Inferences
No predictor was statistically significant at the 0.05 level, but a few are marginally significant (p < 0.10):
Satisfaction.Score (+)
Current.Employee.Rating (+)
Training.OutcomePassed (+)
These marginal effects suggest a potentially weak link between performance/satisfaction/training success and attrition, but the direction of effect is unexpected (e.g., higher satisfaction → higher termination odds), which may indicate:
Data coding issues
Other unmeasured variables influencing termination
Voluntary turnover (resignations) being lumped in with terminations
Variables like Age, EmployeeType, and Training Outcome (Incomplete/Failed) had no significant effect.
# Predict probabilities
logit_probs <- predict(logit_model, newdata = test_data, type = "response")
# Convert to binary prediction (threshold = 0.5)
logit_preds <- ifelse(logit_probs > 0.5, 1, 0)
# Confusion matrix
table(Predicted = logit_preds, Actual = test_data$EmployeeStatusBin)
## Actual
## Predicted 0 1
## 0 737 116
# Accuracy
logit_accuracy <- mean(logit_preds == test_data$EmployeeStatusBin)
print(paste("Logistic Regression Accuracy:", round(logit_accuracy, 3)))
## [1] "Logistic Regression Accuracy: 0.864"
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Create ROC object
roc_obj <- roc(test_data$EmployeeStatusBin, logit_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Logistic Regression")
abline(a = 0, b = 1, lty = 2, col = "gray") # reference line (random guess)
# Print AUC value
auc_value <- auc(roc_obj)
print(paste("AUC:", round(auc_value, 3)))
## [1] "AUC: 0.554"
The AUC measures how well the model can distinguish terminated from active employees; here it is 0.554. A value of 0.5 is no better than random guessing, and this curve hugs the diagonal, so logistic regression is a poor model for this task. The next step compares it against a decision tree.
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Train decision tree
tree_model <- rpart(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
data = train_data, method = "class")
# Visualize tree
rpart.plot(tree_model, type = 2, extra = 104)
# Predict on test data
tree_preds <- predict(tree_model, newdata = test_data, type = "class")
# Confusion matrix
table(Predicted = tree_preds, Actual = test_data$EmployeeStatusBin)
## Actual
## Predicted 0 1
## 0 737 116
## 1 0 0
# Accuracy
tree_accuracy <- mean(tree_preds == test_data$EmployeeStatusBin)
print(paste("Decision Tree Accuracy:", round(tree_accuracy, 3)))
## [1] "Decision Tree Accuracy: 0.864"
tree_model <- rpart(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
data = train_data,
method = "class",
control = rpart.control(cp = 0.001, minsplit = 20))
rpart.plot(tree_model, type = 2, extra = 104)
Each node in the tree:
Splits the data based on a predictor variable and threshold
Shows the predicted class (0 = Active, 1 = Terminated)
Displays the proportions of each class and sample size
Root Split: The first and most important split is on Current.Employee.Rating < 3
Employees rated below 3 are more likely to be terminated.
This aligns with expectations — low ratings are a strong predictor of attrition.
Left Subtree (Low rating < 3): Satisfaction.Score < 4, Age < 32, and EmployeeType are also used to refine predictions.
Termination likelihood increases for low satisfaction, younger age, and part-time workers.
Right Subtree (Rating ≥ 3): Splits on Training Outcome, Satisfaction.Score, and Age.
Interestingly, even some employees who passed training and are older (e.g., over 64) are predicted as more likely to leave — may indicate voluntary retirement patterns.
Overall Inferences from the Tree
Performance rating is the strongest predictor of termination risk.
Low satisfaction scores and younger age contribute to higher attrition risk.
EmployeeType (Part-Time) plays a minor role, especially among low-satisfaction employees.
Training outcome matters: those who failed training, even if they have decent performance, can be at higher risk.
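The same splits can also be printed as plain-text rules, which are often easier to audit than the plot (rpart.rules() comes from the rpart.plot package loaded above):
# One row per leaf: predicted class, probability, coverage, and the rule path
rpart.rules(tree_model, cover = TRUE)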
# Example for decision tree
tree_probs <- predict(tree_model, newdata = test_data, type = "prob")[, 2]
roc_tree <- roc(test_data$EmployeeStatusBin, tree_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot together
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curves")
lines(roc_tree, col = "red", lwd = 2)
legend("bottomright", legend = c("Logistic", "Tree"), col = c("blue", "red"), lwd = 2)
# AUC values
auc(roc_obj)
## Area under the curve: 0.5535
auc(roc_tree)
## Area under the curve: 0.5353
This visualization compares the decision tree and logistic regression models by their ROC curves; the AUC values are nearly identical and close to 0.5, meaning both models perform poorly.
Evaluating the deeper tree on the testing data:
tree_preds <- predict(tree_model, newdata = test_data, type = "class")
table(Predicted = tree_preds, Actual = test_data$EmployeeStatusBin)
## Actual
## Predicted 0 1
## 0 722 113
## 1 15 3
mean(tree_preds == test_data$EmployeeStatusBin)
## [1] 0.8499414
Variable importance from the tree:
tree_model$variable.importance
## Age EmployeeType
## 11.619480 4.898906
## Satisfaction.Score Training.Outcome
## 4.221799 3.365957
## Current.Employee.Rating EmployeeClassificationType
## 3.335093 1.936801
Next, fit a random forest to see whether an ensemble performs better:
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf_model <- randomForest(as.factor(EmployeeStatusBin) ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
data = train_data, ntree = 500)
print(rf_model)
##
## Call:
## randomForest(formula = as.factor(EmployeeStatusBin) ~ Satisfaction.Score + Current.Employee.Rating + EmployeeType + EmployeeClassificationType + Age + Training.Outcome, data = train_data, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 13.76%
## Confusion matrix:
## 0 1 class.error
## 0 1718 3 0.001743173
## 1 271 0 1.000000000
Interpretation: The decision tree correctly classified most active employees.
However, it struggled to detect terminated employees (only 3 out of 116 were correctly identified).
Recall for termination (sensitivity) is low → the tree is biased toward the majority class (active).
Interpretation: The random forest achieves high accuracy overall, but it’s completely failing to detect terminated employees.
This is a sign of class imbalance — the model is overwhelmed by the majority class and does not learn termination patterns effectively.
Interpretation: Age is the most influential variable in the forest model, suggesting possible patterns related to retirement or early-career turnover.
EmployeeType and Satisfaction.Score also play notable roles.
Performance rating and training outcome matter, but less than expected.
EmployeeClassificationType has the least predictive power.
Overall Inference
The models (decision tree and random forest) are very good at identifying active employees but fail to detect terminations due to class imbalance.
Important predictors include Age, EmployeeType, and Satisfaction Score, confirming earlier ANOVA and tree results.
Performance and training outcome are weaker predictors than initially expected.
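Given the imbalance noted above, one common mitigation is to rebalance what each tree sees. A sketch (not tuned) using randomForest's stratified sampling, so each bootstrap draws equal numbers from both classes:
# With a vector sampsize, classification sampling is stratified by the
# response: draw n_min cases from each class for every tree
n_min <- min(table(train_data$EmployeeStatusBin))
rf_balanced <- randomForest(as.factor(EmployeeStatusBin) ~ Satisfaction.Score +
                              Current.Employee.Rating + EmployeeType +
                              EmployeeClassificationType + Age + Training.Outcome,
                            data = train_data, ntree = 500,
                            sampsize = c(n_min, n_min))
print(rf_balanced)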
Two follow-up linear models are specified:
Satisfaction.Score ~ EmployeeClassificationType + EmployeeType + Age + Work.Life.Balance.Score + GenderCode
Current.Employee.Rating ~ EmployeeStatus + Age + Training.Outcome + Performance.Score
satisfaction_model <- lm(Satisfaction.Score ~ EmployeeClassificationType + EmployeeType +
Work.Life.Balance.Score + GenderCode + Age, data = train_data)
summary(satisfaction_model)
##
## Call:
## lm(formula = Satisfaction.Score ~ EmployeeClassificationType +
## EmployeeType + Work.Life.Balance.Score + GenderCode + Age,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.38996 -1.10782 0.00999 1.09803 2.30095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3885746 0.1328357 25.510 < 2e-16 ***
## EmployeeClassificationTypePart-Time -0.2153532 0.0777866 -2.769 0.005684 **
## EmployeeClassificationTypeTemporary -0.2917540 0.0769515 -3.791 0.000154 ***
## EmployeeTypeFull-Time -0.0172106 0.0764854 -0.225 0.821988
## EmployeeTypePart-Time -0.1479744 0.0788964 -1.876 0.060863 .
## Work.Life.Balance.Score -0.0479663 0.0224525 -2.136 0.032774 *
## GenderCodeMale 0.0588794 0.0634104 0.929 0.353238
## Age -0.0004333 0.0017816 -0.243 0.807880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.407 on 1984 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: 0.009122
## F-statistic: 3.618 on 7 and 1984 DF, p-value: 0.0006994
Variable | Estimate | p-value | Interpretation |
---|---|---|---|
EmployeeClassificationType: Part-Time | -0.215 | 0.0057 ** | Significant decrease in satisfaction. |
EmployeeClassificationType: Temporary | -0.292 | <0.001 *** | Even greater drop. |
EmployeeType: Part-Time | -0.148 | 0.061 † | Marginal effect; may lower satisfaction. |
EmployeeType: Full-Time | -0.017 | 0.822 | No effect. |
Work.Life.Balance.Score | -0.048 | 0.033 * | Small negative relationship, which is surprising; could be multicollinearity or reverse coding. |
Gender (Male) | +0.059 | 0.353 | Not significant. |
Age | -0.0004 | 0.808 | No meaningful effect. |
R² = 0.0126 → Model explains only 1.3% of the variance in Satisfaction Score.
F-test p = 0.0007 → The model is statistically significant, but weak in predictive power.
Contract type matters: Temporary and part-time employees report significantly lower satisfaction.
Work-life balance shows an unexpected negative effect — this may reflect a confounding variable or reversed scale (you should check coding direction).
Demographics (Gender, Age) and Employment Type (full vs part-time) do not meaningfully impact satisfaction.
satisfaction_model_2 <- lm(Current.Employee.Rating ~ EmployeeStatus + Age + Training.Outcome + Performance.Score, data = train_data)
summary(satisfaction_model_2)
##
## Call:
## lm(formula = Current.Employee.Rating ~ EmployeeStatus + Age +
## Training.Outcome + Performance.Score, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11673 -0.89647 0.03568 0.10359 2.14753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.067919 0.098112 31.269 <2e-16 ***
## EmployeeStatusTerminated 0.113303 0.066198 1.712 0.0871 .
## Age -0.001330 0.001276 -1.043 0.2972
## Training.OutcomeFailed -0.046067 0.064834 -0.711 0.4775
## Training.OutcomeIncomplete -0.059108 0.063575 -0.930 0.3526
## Training.OutcomePassed -0.068353 0.062759 -1.089 0.2762
## Performance.ScoreFully Meets -0.009976 0.070239 -0.142 0.8871
## Performance.ScoreNeeds Improvement 0.084732 0.114348 0.741 0.4588
## Performance.ScorePIP -0.105794 0.151502 -0.698 0.4851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01 on 1983 degrees of freedom
## Multiple R-squared: 0.003331, Adjusted R-squared: -0.0006898
## F-statistic: 0.8285 on 8 and 1983 DF, p-value: 0.5774
None of the predictors significantly influence performance ratings in this model.
Surprisingly, performance scores themselves have no statistical impact on the current employee rating. This might reflect redundancy between variables or poor measurement structure.
Terminated employees having slightly higher ratings could imply voluntary exits, or performance not being the only factor in separation.
Satisfaction Score Model: Has some predictive value, especially for employment classification types.
Suggests contract stability impacts satisfaction more than demographic traits.
Performance Rating Model: Performs poorly in predicting ratings.
Variables chosen do not adequately explain how employees are rated — perhaps other unmeasured internal factors (manager reviews, tenure, etc.) are more relevant.
# Predict on test set
satisfaction_preds <- predict(satisfaction_model, newdata = test_data)
# Actual values
actual <- test_data$Satisfaction.Score
# Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((satisfaction_preds - actual)^2))
# Mean Absolute Error (MAE)
mae <- mean(abs(satisfaction_preds - actual))
print(paste("RMSE:", round(rmse, 3)))
## [1] "RMSE: 1.413"
print(paste("MAE:", round(mae, 3)))
## [1] "MAE: 1.223"
# Predict on test set
satisfaction_preds_2 <- predict(satisfaction_model_2, newdata = test_data)
# Actual values
actual_2 <- test_data$Current.Employee.Rating
# Root Mean Squared Error (RMSE)
rmse_2 <- sqrt(mean((satisfaction_preds_2 - actual_2)^2))
# Mean Absolute Error (MAE)
mae_2 <- mean(abs(satisfaction_preds_2 - actual_2))
print(paste("RMSE:", round(rmse_2, 3)))
## [1] "RMSE: 1.019"
print(paste("MAE:", round(mae_2, 3)))
## [1] "MAE: 0.706"
plot(actual, satisfaction_preds,
xlab = "Actual Satisfaction Score",
ylab = "Predicted Satisfaction Score",
main = "Actual vs. Predicted Satisfaction",
col = "darkblue", pch = 16)
abline(0, 1, col = "red", lwd = 2) # ideal prediction line
plot(actual_2, satisfaction_preds_2,
xlab = "Actual Employee Rating",
ylab = "Predicted Employee Rating",
main = "Actual vs. Predicted Rating",
col = "darkblue", pch = 16)
abline(0, 1, col = "red", lwd = 2) # ideal prediction line
plot(satisfaction_model)
plot(satisfaction_model_2)
This will show:
Residuals vs Fitted (check for nonlinearity)
Normal Q-Q (residual normality)
Scale-Location (homoscedasticity)
Residuals vs Leverage (influential points)
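By default R draws these four diagnostics one at a time; a small sketch to view them together in a 2x2 grid:
# Show all four diagnostic panels at once
par(mfrow = c(2, 2))
plot(satisfaction_model)
par(mfrow = c(1, 1)) # restore the default layout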
#install.packages("xgboost")
library(xgboost)
library(Matrix)
# Prepare data: binary target, 1 = Terminated, 0 = Active
# (created on each split directly, so the column exists on train and test data downstream)
HR_dataset$EmployeeStatusBin <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)
train_data$EmployeeStatusBin <- ifelse(train_data$EmployeeStatus == "Terminated", 1, 0)
test_data$EmployeeStatusBin <- ifelse(test_data$EmployeeStatus == "Terminated", 1, 0)
# Create model matrix
x_train <- model.matrix(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome, data = train_data)[,-1]
y_train <- train_data$EmployeeStatusBin
# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_train, label = y_train)
# Train XGBoost
xgb_model <- xgboost(data = dtrain, nrounds = 100, objective = "binary:logistic", eval_metric = "auc")
## [1] train-auc:0.557041
## [2] train-auc:0.670265
## [3] train-auc:0.741944
## [4] train-auc:0.769025
## [5] train-auc:0.783156
## [6] train-auc:0.800938
## [7] train-auc:0.808209
## [8] train-auc:0.817299
## [9] train-auc:0.824921
## [10] train-auc:0.835835
## [11] train-auc:0.853494
## [12] train-auc:0.865078
## [13] train-auc:0.871060
## [14] train-auc:0.879662
## [15] train-auc:0.883414
## [16] train-auc:0.885735
## [17] train-auc:0.893350
## [18] train-auc:0.904249
## [19] train-auc:0.907475
## [20] train-auc:0.916510
## [21] train-auc:0.917776
## [22] train-auc:0.924036
## [23] train-auc:0.928948
## [24] train-auc:0.932194
## [25] train-auc:0.935041
## [26] train-auc:0.938716
## [27] train-auc:0.940138
## [28] train-auc:0.942453
## [29] train-auc:0.945724
## [30] train-auc:0.947981
## [31] train-auc:0.949121
## [32] train-auc:0.950314
## [33] train-auc:0.951307
## [34] train-auc:0.953208
## [35] train-auc:0.955465
## [36] train-auc:0.955603
## [37] train-auc:0.957522
## [38] train-auc:0.958450
## [39] train-auc:0.958615
## [40] train-auc:0.959473
## [41] train-auc:0.959632
## [42] train-auc:0.960157
## [43] train-auc:0.960980
## [44] train-auc:0.961529
## [45] train-auc:0.962346
## [46] train-auc:0.964353
## [47] train-auc:0.966336
## [48] train-auc:0.967421
## [49] train-auc:0.968167
## [50] train-auc:0.968778
## [51] train-auc:0.970397
## [52] train-auc:0.970794
## [53] train-auc:0.971178
## [54] train-auc:0.971787
## [55] train-auc:0.972657
## [56] train-auc:0.973723
## [57] train-auc:0.974263
## [58] train-auc:0.975151
## [59] train-auc:0.976701
## [60] train-auc:0.976967
## [61] train-auc:0.978435
## [62] train-auc:0.978686
## [63] train-auc:0.978963
## [64] train-auc:0.979662
## [65] train-auc:0.980125
## [66] train-auc:0.980318
## [67] train-auc:0.981935
## [68] train-auc:0.981851
## [69] train-auc:0.982436
## [70] train-auc:0.983318
## [71] train-auc:0.983581
## [72] train-auc:0.983999
## [73] train-auc:0.985211
## [74] train-auc:0.985535
## [75] train-auc:0.985809
## [76] train-auc:0.986193
## [77] train-auc:0.986450
## [78] train-auc:0.987424
## [79] train-auc:0.987820
## [80] train-auc:0.987859
## [81] train-auc:0.988277
## [82] train-auc:0.988560
## [83] train-auc:0.988918
## [84] train-auc:0.989304
## [85] train-auc:0.989994
## [86] train-auc:0.990288
## [87] train-auc:0.990530
## [88] train-auc:0.990681
## [89] train-auc:0.991137
## [90] train-auc:0.991531
## [91] train-auc:0.991782
## [92] train-auc:0.992135
## [93] train-auc:0.992367
## [94] train-auc:0.992470
## [95] train-auc:0.992594
## [96] train-auc:0.992545
## [97] train-auc:0.992622
## [98] train-auc:0.992633
## [99] train-auc:0.992624
## [100] train-auc:0.992596
# Variable importance
xgb.importance(model = xgb_model) |> xgb.plot.importance()
Final train-auc: 0.9926. This AUC is very high, indicating excellent fit on the training data (AUC = 1.0 means perfect prediction; AUC = 0.5 means no better than random).
Caution: this is training AUC; we should still check the test set to confirm the model is not overfitting.
Feature importance (plot): your plot shows the most influential features in predicting attrition.
Age dominates, possibly because younger employees are more likely to leave (early churn) or older employees retire.
Satisfaction matters, confirming the earlier ANOVA and logistic findings.
Training outcome has a relatively weak direct impact; this is a valuable insight for your training program evaluation.
Testing
# Prepare test data
x_test <- model.matrix(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
data = test_data)[, -1]
dtest <- xgb.DMatrix(data = x_test)
# Predict probabilities and classes
xgb_preds <- predict(xgb_model, newdata = dtest)
xgb_classes <- ifelse(xgb_preds > 0.5, 1, 0)
# Confusion matrix
table(Predicted = xgb_classes, Actual = test_data$EmployeeStatusBin)
## Actual
## Predicted 0 1
## 0 708 109
## 1 29 7
# Accuracy
mean(xgb_classes == test_data$EmployeeStatusBin)
## [1] 0.8382181
# AUC
library(pROC)
roc_xgb <- roc(test_data$EmployeeStatusBin, xgb_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_xgb, col = "blue", main = "XGBoost ROC - Test Set")
auc(roc_xgb)
## Area under the curve: 0.5328
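The gap between training AUC (0.9926) and test AUC (0.5328) indicates severe overfitting; note also that the 0.838 accuracy is below the majority-class baseline of 737/853 ≈ 0.864 on this test set. One standard remedy is early stopping against a held-out validation set. A minimal sketch, reusing x_train and y_train from above; the max_depth and eta values are illustrative, not tuned:
# Hold out 20% of the training rows for validation
set.seed(2025)
val_idx <- sample(nrow(x_train), floor(0.2 * nrow(x_train)))
dtr <- xgb.DMatrix(x_train[-val_idx, ], label = y_train[-val_idx])
dval <- xgb.DMatrix(x_train[val_idx, ], label = y_train[val_idx])
# Stop once validation AUC has not improved for 10 rounds
xgb_es <- xgb.train(params = list(objective = "binary:logistic", eval_metric = "auc",
                                  max_depth = 3, eta = 0.1),
                    data = dtr, nrounds = 200,
                    watchlist = list(train = dtr, val = dval),
                    early_stopping_rounds = 10, verbose = 0)
xgb_es$best_iteration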
library(mgcv)
## Warning: package 'mgcv' was built under R version 4.3.3
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following objects are masked from 'package:gam':
##
## gam, gam.control, gam.fit, s
gam_model <- gam(Satisfaction.Score ~ s(Age, k = 5) +
s(Work.Life.Balance.Score, k = 5) +
EmployeeType + EmployeeClassificationType + GenderCode,
data = train_data)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Satisfaction.Score ~ s(Age, k = 5) + s(Work.Life.Balance.Score,
## k = 5) + EmployeeType + EmployeeClassificationType + GenderCode
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.22393 0.07597 42.438 < 2e-16 ***
## EmployeeTypeFull-Time -0.01644 0.07646 -0.215 0.829779
## EmployeeTypePart-Time -0.14805 0.07886 -1.877 0.060625 .
## EmployeeClassificationTypePart-Time -0.21776 0.07778 -2.800 0.005165 **
## EmployeeClassificationTypeTemporary -0.28955 0.07694 -3.763 0.000173 ***
## GenderCodeMale 0.05901 0.06338 0.931 0.352010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Age) 1.381 1.662 0.331 0.7539
## s(Work.Life.Balance.Score) 1.568 1.923 2.449 0.0632 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00996 Deviance explained = 1.39%
## GCV = 1.9877 Scale est. = 1.9788 n = 1992
plot(gam_model, pages = 1, se = TRUE)
Model fit: adjusted R² = 0.00996, so the model explains ~1% of the variance in satisfaction (very weak).
Deviance explained: 1.39%.
This confirms what you've seen with the linear models: Satisfaction.Score is not well explained by your current predictors.
Conclusions for the parametric terms:
Part-time and temporary classifications are associated with significantly lower satisfaction.
Gender and full-time/part-time employee type alone are not significant.
Conclusions for the smooth terms:
Work-life balance may have a nonlinear effect, but it is only marginally significant (p = 0.063).
Age has no predictive relationship with satisfaction in this model.
Category | Variable | Effect on Satisfaction |
---|---|---|
Employment type | Temporary or Part-Time classification | Strongly negative |
Work-life balance | s(Work.Life.Balance.Score) | Weak, marginal nonlinear trend |
Demographics | Age, GenderCode | Not significant, no effect |
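Since the linear model hinted that the odd work-life-balance coefficient might reflect multicollinearity, it is worth checking concurvity, the GAM analogue of collinearity for smooth terms. A minimal sketch using mgcv's concurvity() on the model above:
# Concurvity: values near 1 mean a smooth term is largely reproducible
# from the other predictors, making its estimate unstable
concurvity(gam_model, full = TRUE)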
gam_enhanced <- gam(Satisfaction.Score ~
s(Age, k = 5) +
s(Work.Life.Balance.Score, k = 5) +
EmployeeType * EmployeeClassificationType +
GenderCode,
data = train_data)
#This model:
#Adds an interaction between EmployeeType and EmployeeClassificationType
#Keeps non-linear smoothing for Age and Work.Life.Balance.Score
summary(gam_enhanced)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Satisfaction.Score ~ s(Age, k = 5) + s(Work.Life.Balance.Score,
## k = 5) + EmployeeType * EmployeeClassificationType + GenderCode
##
## Parametric coefficients:
## Estimate Std. Error
## (Intercept) 3.20068 0.09848
## EmployeeTypeFull-Time -0.04986 0.13227
## EmployeeTypePart-Time -0.03989 0.13483
## EmployeeClassificationTypePart-Time -0.23538 0.13591
## EmployeeClassificationTypeTemporary -0.20271 0.13520
## GenderCodeMale 0.05890 0.06341
## EmployeeTypeFull-Time:EmployeeClassificationTypePart-Time 0.06517 0.18823
## EmployeeTypePart-Time:EmployeeClassificationTypePart-Time -0.01447 0.19358
## EmployeeTypeFull-Time:EmployeeClassificationTypeTemporary 0.03161 0.18623
## EmployeeTypePart-Time:EmployeeClassificationTypeTemporary -0.30722 0.19139
## t value Pr(>|t|)
## (Intercept) 32.502 <2e-16 ***
## EmployeeTypeFull-Time -0.377 0.7062
## EmployeeTypePart-Time -0.296 0.7674
## EmployeeClassificationTypePart-Time -1.732 0.0834 .
## EmployeeClassificationTypeTemporary -1.499 0.1339
## GenderCodeMale 0.929 0.3531
## EmployeeTypeFull-Time:EmployeeClassificationTypePart-Time 0.346 0.7292
## EmployeeTypePart-Time:EmployeeClassificationTypePart-Time -0.075 0.9404
## EmployeeTypeFull-Time:EmployeeClassificationTypeTemporary 0.170 0.8652
## EmployeeTypePart-Time:EmployeeClassificationTypeTemporary -1.605 0.1086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Age) 1.396 1.685 0.350 0.7408
## s(Work.Life.Balance.Score) 1.566 1.921 2.457 0.0623 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0103 Deviance explained = 1.62%
## GCV = 1.9911 Scale est. = 1.9781 n = 1992
plot(gam_enhanced, pages = 1, se = TRUE, shade = TRUE)
#This shows the shapes and uncertainty of s(Age) and s(Work.Life.Balance.Score).
Metric | Value | Meaning |
---|---|---|
Adjusted R² | 0.0103 | Model explains just 1% of satisfaction variance — very weak |
Deviance Explained | 1.62% | Confirms low explanatory power |
GCV (generalized cross-validation) | 1.99 | Criterion mgcv minimizes when selecting smoothness; lower is better |
Interpretation: Even with added interaction terms and smooth functions, the model still struggles to predict satisfaction.
Key takeaways: Work-life balance has the only near-significant nonlinear influence on satisfaction.
Employment classification (e.g., part-time/temporary) may slightly lower satisfaction, but evidence is weak.
Gender, Age, and interactions have no meaningful effect.
Despite improvements, most of the variation in satisfaction remains unexplained.
preds <- predict(gam_enhanced, newdata = test_data)
actuals <- test_data$Satisfaction.Score
# RMSE and MAE
rmse <- sqrt(mean((preds - actuals)^2))
mae <- mean(abs(preds - actuals))
cat("RMSE:", round(rmse, 3), "\n")
## RMSE: 1.417
cat("MAE:", round(mae, 3), "\n")
## MAE: 1.228
Out of sample, the enhanced GAM performs no better than the original linear model (RMSE 1.417 vs. 1.413; MAE 1.228 vs. 1.223), so the added flexibility does not generalize.
# Scale relevant variables
cluster_data <- scale(HR_dataset[, c("Satisfaction.Score", "Engagement.Score", "Current.Employee.Rating")])
# Run K-Means
set.seed(2025)
kmeans_model <- kmeans(cluster_data, centers = 3, nstart = 25)
# Add cluster labels
HR_dataset$Cluster <- as.factor(kmeans_model$cluster)
# Visualize with ggplot
library(ggplot2)
library(ggfortify)
autoplot(kmeans_model, data = cluster_data, frame = TRUE)
# 1. Select relevant columns and remove missing values
cluster_vars <- HR_dataset[, c("Satisfaction.Score", "Engagement.Score", "Current.Employee.Rating")]
cluster_vars <- na.omit(cluster_vars)
# 2. Scale the variables (very important for K-means)
cluster_scaled <- scale(cluster_vars)
# 3. Choose number of clusters (try k = 3 to start)
set.seed(2025)
kmeans_model <- kmeans(cluster_scaled, centers = 3, nstart = 25)
# 4. Add cluster labels to the original data
HR_dataset$Cluster <- factor(kmeans_model$cluster)
# If not installed:
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# 2D cluster plot
fviz_cluster(kmeans_model, data = cluster_scaled,
ellipse.type = "norm",
geom = "point",
main = "K-Means Clusters of Employees")
aggregate(cbind(Satisfaction.Score, Engagement.Score, Current.Employee.Rating) ~ Cluster, data = HR_dataset, FUN = mean)
## Cluster Satisfaction.Score Engagement.Score Current.Employee.Rating
## 1 1 1.411538 2.237179 2.979487
## 2 2 3.231123 4.430939 3.080110
## 3 3 4.091931 1.850868 2.853933
This helps answer:
Are there clusters with low satisfaction but high performance?
Is there a group that is disengaged across the board?
Can you link any cluster back to termination rates, departments, or training types?
Cluster | Satisfaction | Engagement | Rating | Interpretation |
---|---|---|---|---|
1 | 1.41 (very low) | 2.24 (low) | 2.98 | ❌ At-risk & disengaged — unhappy and underengaged, likely to leave |
2 | 3.23 (moderate) | 4.43 (high) | 3.08 | ✅ High potential — strong engagement, moderate satisfaction and rating |
3 | 4.09 (high) | 1.85 (very low) | 2.85 | ⚠️ Complacent or disconnected — satisfied, but unengaged — may coast or be isolated |
Interpretation Cluster 1 is a classic attrition risk group — very low satisfaction + low engagement.
Cluster 2 looks like your high-performing, committed core — engaged and solidly rated.
Cluster 3 may represent content but disconnected employees — they’re satisfied but not engaged. This group might lack challenge or connection.
table(HR_dataset$Cluster, HR_dataset$EmployeeStatus)
##
## Active Terminated
## 1 676 104
## 2 940 146
## 3 842 137
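Raw counts are hard to compare across clusters of different sizes; row proportions make the termination rates directly comparable. A minimal sketch:
# Termination rate within each cluster (row proportions)
round(prop.table(table(HR_dataset$Cluster, HR_dataset$EmployeeStatus), margin = 1), 3)
From the counts above the rates are 104/780 ≈ 13.3%, 146/1086 ≈ 13.4%, and 137/979 ≈ 14.0%: nearly identical, so cluster membership barely relates to recorded terminations.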
table(HR_dataset$Cluster, HR_dataset$Training.Outcome)
##
## Completed Failed Incomplete Passed
## 1 200 189 198 193
## 2 283 258 280 265
## 3 254 221 253 251
library(ggplot2)
ggplot(HR_dataset, aes(x = Engagement.Score, y = Satisfaction.Score, color = Cluster)) +
geom_point(alpha = 0.6) +
labs(title = "Employee Clusters", x = "Engagement", y = "Satisfaction") +
theme_minimal()
These clusters give you actionable insights beyond what your regression models could. They’re great for:
Targeting retention strategies
Tailoring engagement initiatives
Customizing training approaches
fviz_nbclust(cluster_scaled, kmeans, method = "wss") +
labs(subtitle = "Elbow Method for Choosing k")
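The elbow plot can be ambiguous; the average silhouette width gives a second opinion on k (higher is better). A minimal sketch with the same factoextra helper:
fviz_nbclust(cluster_scaled, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette Method for Choosing k")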
pca_data <- prcomp(cluster_data, scale. = TRUE)
summary(pca_data) # Proportion of variance explained
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.0176 0.9978 0.9844
## Proportion of Variance 0.3451 0.3319 0.3230
## Cumulative Proportion 0.3451 0.6770 1.0000
# Plot PCA
biplot(pca_data, scale = 0)
Interpretation: no single component dominates; all three variables contribute almost equally, which also means they are only weakly correlated with one another.
Two PCs explain ~68% of total variance, which is adequate for plotting.
You can visualize in 2D using PC1 and PC2 with modest information loss.
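A scree plot makes the near-equal split of variance visible at a glance; a minimal sketch with factoextra, which is already loaded:
# Scree plot: percentage of variance explained by each component
fviz_eig(pca_data, addlabels = TRUE)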
# Compute PCA
pca_res <- prcomp(cluster_scaled, scale. = TRUE)
# Add cluster labels (assumes na.omit dropped no rows above, so lengths align with HR_dataset)
pca_df <- as.data.frame(pca_res$x)
pca_df$Cluster <- HR_dataset$Cluster
# Plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "PCA Plot of Employee Clusters",
x = "Principal Component 1",
y = "Principal Component 2") +
theme_minimal()
This PCA plot will:
Visually confirm how well-separated your clusters are
Reveal which clusters are more compact vs scattered
Help you present intuitive groupings to stakeholders
#install.packages("survival")
library(survival)
# Parse dates; two-digit years like "20-Sep-19" need %y, not %Y
HR_dataset$StartDate <- as.Date(HR_dataset$StartDate, format = "%d-%b-%y")
# Proxy: use the last training date as the separation date for terminated employees;
# active employees are censored at today's date
HR_dataset$EndDate <- as.Date(HR_dataset$Training.Date, format = "%d-%b-%y")
HR_dataset$EndDate[HR_dataset$EmployeeStatus != "Terminated"] <- Sys.Date()
# Time to event
HR_dataset$Time <- as.numeric(difftime(HR_dataset$EndDate, HR_dataset$StartDate, units = "days"))
HR_dataset$Status <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)
# Cox model
cox_model <- coxph(Surv(Time, Status) ~ Satisfaction.Score + EmployeeType + Age, data = HR_dataset)
summary(cox_model)
## Call:
## coxph(formula = Surv(Time, Status) ~ Satisfaction.Score + EmployeeType +
## Age, data = HR_dataset)
##
## n= 2845, number of events= 387
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Satisfaction.Score 0.0534818 1.0549378 0.0362471 1.475 0.140
## EmployeeTypeFull-Time 0.1689881 1.1841060 0.1225053 1.379 0.168
## EmployeeTypePart-Time 0.0394723 1.0402617 0.1293064 0.305 0.760
## Age 0.0006256 1.0006257 0.0028652 0.218 0.827
##
## exp(coef) exp(-coef) lower .95 upper .95
## Satisfaction.Score 1.055 0.9479 0.9826 1.133
## EmployeeTypeFull-Time 1.184 0.8445 0.9314 1.505
## EmployeeTypePart-Time 1.040 0.9613 0.8074 1.340
## Age 1.001 0.9994 0.9950 1.006
##
## Concordance= 0.537 (se = 0.015 )
## Likelihood ratio test= 4.41 on 4 df, p=0.4
## Wald test = 4.44 on 4 df, p=0.4
## Score (logrank) test = 4.44 on 4 df, p=0.3
Metric | Value |
---|---|
Sample size (n) | 2,845 employees |
Number of events | 387 terminations |
Concordance | 0.537 (se = 0.015) |
Log-rank p-value | 0.3–0.4 (not significant) |
Predictor | Coef | HR (exp(coef)) | p-value | Interpretation |
---|---|---|---|---|
Satisfaction.Score | 0.0535 | 1.055 | 0.140 | Weak, non-significant ↑ in termination hazard as satisfaction ↑ (unexpected) |
EmployeeType: Full-Time | 0.169 | 1.18 | 0.168 | Small, non-significant ↑ in risk |
EmployeeType: Part-Time | 0.0395 | 1.04 | 0.76 | Very small effect, not significant |
Age | 0.0006 | 1.0006 | 0.827 | No effect |
Interpretation:
None of the variables are statistically significant (all p > 0.1).
Concordance = 0.537 → the model predicts slightly better than random, but not reliably.
Hazard Ratios (HR) close to 1 → very little influence on time to termination.
cox_model2 <- coxph(Surv(Time, Status) ~
Satisfaction.Score +
Current.Employee.Rating +
Engagement.Score +
Training.Outcome +
Performance.Score +
EmployeeClassificationType +
Age,
data = HR_dataset)
summary(cox_model2)
## Call:
## coxph(formula = Surv(Time, Status) ~ Satisfaction.Score + Current.Employee.Rating +
## Engagement.Score + Training.Outcome + Performance.Score +
## EmployeeClassificationType + Age, data = HR_dataset)
##
## n= 2845, number of events= 387
##
## coef exp(coef) se(coef) z
## Satisfaction.Score 0.0618255 1.0637767 0.0364200 1.698
## Current.Employee.Rating 0.1119695 1.1184788 0.0499818 2.240
## Engagement.Score -0.0292633 0.9711608 0.0354678 -0.825
## Training.OutcomeFailed 0.0983209 1.1033168 0.1522914 0.646
## Training.OutcomeIncomplete 0.1373032 1.1471760 0.1471421 0.933
## Training.OutcomePassed 0.3431868 1.4094320 0.1422915 2.412
## Performance.ScoreFully Meets 0.2242329 1.2513625 0.1682340 1.333
## Performance.ScoreNeeds Improvement 0.0790807 1.0822916 0.2743003 0.288
## Performance.ScorePIP 0.7293462 2.0737244 0.2841865 2.566
## EmployeeClassificationTypePart-Time 0.0408242 1.0416689 0.1277977 0.319
## EmployeeClassificationTypeTemporary 0.1512907 1.1633348 0.1229377 1.231
## Age 0.0006774 1.0006776 0.0028774 0.235
## Pr(>|z|)
## Satisfaction.Score 0.0896 .
## Current.Employee.Rating 0.0251 *
## Engagement.Score 0.4093
## Training.OutcomeFailed 0.5185
## Training.OutcomeIncomplete 0.3508
## Training.OutcomePassed 0.0159 *
## Performance.ScoreFully Meets 0.1826
## Performance.ScoreNeeds Improvement 0.7731
## Performance.ScorePIP 0.0103 *
## EmployeeClassificationTypePart-Time 0.7494
## EmployeeClassificationTypeTemporary 0.2185
## Age 0.8139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Satisfaction.Score 1.0638 0.9400 0.9905 1.142
## Current.Employee.Rating 1.1185 0.8941 1.0141 1.234
## Engagement.Score 0.9712 1.0297 0.9059 1.041
## Training.OutcomeFailed 1.1033 0.9064 0.8186 1.487
## Training.OutcomeIncomplete 1.1472 0.8717 0.8598 1.531
## Training.OutcomePassed 1.4094 0.7095 1.0664 1.863
## Performance.ScoreFully Meets 1.2514 0.7991 0.8999 1.740
## Performance.ScoreNeeds Improvement 1.0823 0.9240 0.6322 1.853
## Performance.ScorePIP 2.0737 0.4822 1.1881 3.620
## EmployeeClassificationTypePart-Time 1.0417 0.9600 0.8109 1.338
## EmployeeClassificationTypeTemporary 1.1633 0.8596 0.9142 1.480
## Age 1.0007 0.9993 0.9951 1.006
##
## Concordance= 0.559 (se = 0.015 )
## Likelihood ratio test= 21.38 on 12 df, p=0.05
## Wald test = 21.89 on 12 df, p=0.04
## Score (logrank) test = 22.07 on 12 df, p=0.04
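Unlike the first Cox model, this richer specification is borderline significant overall (Wald p = 0.04, concordance 0.559). Individually, Current.Employee.Rating (HR 1.12, p = 0.025), Training.OutcomePassed (HR 1.41, p = 0.016), and Performance.ScorePIP (HR 2.07, p = 0.010) are associated with a higher termination hazard. Before trusting these hazard ratios, check the proportional hazards assumption; a minimal sketch with survival's cox.zph():
# Test the proportional hazards assumption for each term
ph_check <- cox.zph(cox_model2)
print(ph_check) # small p-values flag terms whose effect varies over time
plot(ph_check) # Schoenfeld residuals vs. time, one panel per term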
#install.packages("survival")
#install.packages("survminer") # includes ggsurvplot()
library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.3.3
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.3.3
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
fit <- survfit(Surv(Time, Status) ~ EmployeeClassificationType, data = HR_dataset)
ggsurvplot(fit, data = HR_dataset, conf.int = TRUE, pval = TRUE)
Interpretation of the plot:
Each line is a classification group (e.g., full-time, part-time, temporary).
A steeper drop means faster attrition.
The p-value tests whether the survival curves are significantly different between groups.
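The p-value printed on the plot comes from a log-rank test, which can also be computed directly. A minimal sketch:
# Log-rank test: do survival curves differ across classification types?
survdiff(Surv(Time, Status) ~ EmployeeClassificationType, data = HR_dataset)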