library(MASS)
library(tidyr)
library(dplyr)
library(caret)
library(lattice)
library(ggplot2)
library(gam)
library(readr)
library(ROCR)
library(readxl)
library(e1071)
HR_dataset = read.csv("Cleaned_HR_Data_Analysis.csv")
str(HR_dataset)
## 'data.frame':    2845 obs. of  28 variables:
##  $ Employee.ID               : int  3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 ...
##  $ StartDate                 : chr  "20-Sep-19" "11-Feb-23" "10-Dec-18" "21-Jun-21" ...
##  $ Title                     : chr  "Production Technician I" "Production Technician I" "Area Sales Manager" "Area Sales Manager" ...
##  $ BusinessUnit              : chr  "CCDR" "EW" "PL" "CCDR" ...
##  $ EmployeeStatus            : chr  "Active" "Active" "Active" "Active" ...
##  $ EmployeeType              : chr  "Contract" "Contract" "Full-Time" "Contract" ...
##  $ PayZone                   : chr  "Zone C" "Zone A" "Zone B" "Zone A" ...
##  $ EmployeeClassificationType: chr  "Temporary" "Part-Time" "Part-Time" "Full-Time" ...
##  $ DepartmentType            : chr  "Production       " "Production       " "Sales" "Sales" ...
##  $ Division                  : chr  "Finance & Accounting" "Aerial" "General - Sga" "Finance & Accounting" ...
##  $ DOB                       : chr  "07-10-1969" "30-08-1965" "06-10-1991" "04-04-1998" ...
##  $ State                     : chr  "MA" "MA" "MA" "ND" ...
##  $ GenderCode                : chr  "Female" "Male" "Male" "Male" ...
##  $ RaceDesc                  : chr  "White" "Hispanic" "Hispanic" "Other" ...
##  $ MaritalDesc               : chr  "Widowed" "Widowed" "Widowed" "Single" ...
##  $ Performance.Score         : chr  "Fully Meets" "Fully Meets" "Fully Meets" "Fully Meets" ...
##  $ Current.Employee.Rating   : int  4 3 4 2 3 3 4 2 3 5 ...
##  $ Survey.Date               : chr  "14-01-2023" "09-09-2022" "27-05-2023" "16-06-2023" ...
##  $ Engagement.Score          : int  1 2 1 5 2 2 1 1 4 5 ...
##  $ Satisfaction.Score        : int  2 1 2 5 5 3 5 4 3 5 ...
##  $ Work.Life.Balance.Score   : int  3 5 1 4 3 5 2 2 3 2 ...
##  $ Training.Date             : chr  "15-Jul-23" "12-Sep-22" "13-Aug-22" "15-Dec-22" ...
##  $ Training.Program.Name     : chr  "Leadership Development" "Customer Service" "Leadership Development" "Project Management" ...
##  $ Training.Type             : chr  "Internal" "External" "External" "External" ...
##  $ Training.Outcome          : chr  "Failed" "Incomplete" "Failed" "Completed" ...
##  $ Training.Duration.Days.   : int  2 4 2 3 5 2 5 3 3 5 ...
##  $ Training.Cost             : num  606 673 413 664 399 ...
##  $ Age                       : int  50 58 27 23 50 71 80 63 44 73 ...
summary(HR_dataset)
##   Employee.ID    StartDate            Title           BusinessUnit      
##  Min.   :1001   Length:2845        Length:2845        Length:2845       
##  1st Qu.:1736   Class :character   Class :character   Class :character  
##  Median :2456   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2471                                                           
##  3rd Qu.:3197                                                           
##  Max.   :4000                                                           
##  EmployeeStatus     EmployeeType         PayZone         
##  Length:2845        Length:2845        Length:2845       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  EmployeeClassificationType DepartmentType       Division        
##  Length:2845                Length:2845        Length:2845       
##  Class :character           Class :character   Class :character  
##  Mode  :character           Mode  :character   Mode  :character  
##                                                                  
##                                                                  
##                                                                  
##      DOB               State            GenderCode          RaceDesc        
##  Length:2845        Length:2845        Length:2845        Length:2845       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  MaritalDesc        Performance.Score  Current.Employee.Rating
##  Length:2845        Length:2845        Min.   :1.000          
##  Class :character   Class :character   1st Qu.:2.000          
##  Mode  :character   Mode  :character   Median :3.000          
##                                        Mean   :2.975          
##                                        3rd Qu.:3.000          
##                                        Max.   :5.000          
##  Survey.Date        Engagement.Score Satisfaction.Score Work.Life.Balance.Score
##  Length:2845        Min.   :1.000    Min.   :1.000      Min.   :1.000          
##  Class :character   1st Qu.:2.000    1st Qu.:2.000      1st Qu.:2.000          
##  Mode  :character   Median :3.000    Median :3.000      Median :3.000          
##                     Mean   :2.942    Mean   :3.028      Mean   :2.989          
##                     3rd Qu.:4.000    3rd Qu.:4.000      3rd Qu.:4.000          
##                     Max.   :5.000    Max.   :5.000      Max.   :5.000          
##  Training.Date      Training.Program.Name Training.Type      Training.Outcome  
##  Length:2845        Length:2845           Length:2845        Length:2845       
##  Class :character   Class :character      Class :character   Class :character  
##  Mode  :character   Mode  :character      Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##  Training.Duration.Days. Training.Cost         Age       
##  Min.   :1.000           Min.   : 100.0   Min.   :17.00  
##  1st Qu.:2.000           1st Qu.: 328.1   1st Qu.:34.00  
##  Median :3.000           Median : 571.8   Median :49.00  
##  Mean   :2.974           Mean   : 559.3   Mean   :49.45  
##  3rd Qu.:4.000           3rd Qu.: 788.3   3rd Qu.:65.00  
##  Max.   :5.000           Max.   :1000.0   Max.   :82.00
# Check for missing values
colSums(is.na(HR_dataset))
##                Employee.ID                  StartDate 
##                          0                          0 
##                      Title               BusinessUnit 
##                          0                          0 
##             EmployeeStatus               EmployeeType 
##                          0                          0 
##                    PayZone EmployeeClassificationType 
##                          0                          0 
##             DepartmentType                   Division 
##                          0                          0 
##                        DOB                      State 
##                          0                          0 
##                 GenderCode                   RaceDesc 
##                          0                          0 
##                MaritalDesc          Performance.Score 
##                          0                          0 
##    Current.Employee.Rating                Survey.Date 
##                          0                          0 
##           Engagement.Score         Satisfaction.Score 
##                          0                          0 
##    Work.Life.Balance.Score              Training.Date 
##                          0                          0 
##      Training.Program.Name              Training.Type 
##                          0                          0 
##           Training.Outcome    Training.Duration.Days. 
##                          0                          0 
##              Training.Cost                        Age 
##                          0                          0

This is already a cleaned data set, but it is worth double-checking that there are no missing observations.
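As one more integrity check (a quick sketch), duplicated employee records can be ruled out as well:

# Sketch: count duplicated Employee.IDs; a cleaned file should report 0
sum(duplicated(HR_dataset$Employee.ID))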

Next, look at all the character variables to see which have only two levels (and can be binary coded) and which have more than two.
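One way to survey this in a single pass (a sketch using only base R) is to count the distinct values of every character column:

# Sketch: number of distinct values per character column;
# columns with 2 are candidates for binary coding
char_cols <- sapply(HR_dataset, is.character)
sapply(HR_dataset[, char_cols], function(x) length(unique(x)))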

levels(as.factor(HR_dataset$Title))
##  [1] "Accountant I"                 "Administrative Assistant"    
##  [3] "Area Sales Manager"           "BI Developer"                
##  [5] "BI Director"                  "CIO"                         
##  [7] "Data Analyst"                 "Data Analyst "               
##  [9] "Data Architect"               "Database Administrator"      
## [11] "Director of Operations"       "Director of Sales"           
## [13] "Enterprise Architect"         "IT Director"                 
## [15] "IT Manager - DB"              "IT Manager - Infra"          
## [17] "IT Manager - Support"         "IT Support"                  
## [19] "Network Engineer"             "President & CEO"             
## [21] "Principal Data Architect"     "Production Manager"          
## [23] "Production Technician I"      "Production Technician II"    
## [25] "Sales Manager"                "Senior BI Developer"         
## [27] "Shared Services Manager"      "Software Engineer"           
## [29] "Software Engineering Manager" "Sr. Accountant"              
## [31] "Sr. DBA"                      "Sr. Network Engineer"
levels(as.factor(HR_dataset$BusinessUnit))
##  [1] "BPC"  "CCDR" "EW"   "MSC"  "NEL"  "PL"   "PYZ"  "SVG"  "TNS"  "WBL"
levels(as.factor(HR_dataset$EmployeeStatus))
## [1] "Active"     "Terminated"

Only 2 levels, so EmployeeStatus can be binary coded.

levels(as.factor(HR_dataset$EmployeeType))
## [1] "Contract"  "Full-Time" "Part-Time"
levels(as.factor(HR_dataset$PayZone))
## [1] "Zone A" "Zone B" "Zone C"
levels(as.factor(HR_dataset$EmployeeClassificationType))
## [1] "Full-Time" "Part-Time" "Temporary"
levels(as.factor(HR_dataset$DepartmentType))
## [1] "Admin Offices"        "Executive Office"     "IT/IS"               
## [4] "Production       "    "Sales"                "Software Engineering"

Note the level next to Sales: "Production       " carries trailing whitespace, which should be trimmed before the column is treated as a factor.
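A minimal cleanup sketch (not applied in the analysis below, which keeps the original levels):

# Sketch: trim stray whitespace in levels such as "Production       "
# (and "Data Analyst " in Title) before factoring
# HR_dataset$DepartmentType <- trimws(HR_dataset$DepartmentType)
# HR_dataset$Title          <- trimws(HR_dataset$Title)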

levels(as.factor(HR_dataset$Division))
##  [1] "Aerial"                   "Billable Consultants"    
##  [3] "Catv"                     "Corp Operations"         
##  [5] "Engineers"                "Executive"               
##  [7] "Field Operations"         "Fielders"                
##  [9] "Finance & Accounting"     "General - Con"           
## [11] "General - Eng"            "General - Sga"           
## [13] "Isp"                      "People Services"         
## [15] "Project Management - Con" "Project Management - Eng"
## [17] "Safety"                   "Sales & Marketing"       
## [19] "Shop (Fleet)"             "Splicing"                
## [21] "Technology / It"          "Underground"             
## [23] "Wireless"                 "Wireline Construction"   
## [25] "Yard (Material Handling)"
levels(as.factor(HR_dataset$State))
##  [1] "AL" "AZ" "CA" "CO" "CT" "FL" "GA" "ID" "IN" "KY" "MA" "ME" "MT" "NC" "ND"
## [16] "NH" "NV" "NY" "OH" "OR" "PA" "RI" "TN" "TX" "UT" "VA" "VT" "WA"

The data set only covers 28 states.

levels(as.factor(HR_dataset$GenderCode))
## [1] "Female" "Male"

Only 2 levels, so GenderCode can be binary coded.

levels(as.factor(HR_dataset$RaceDesc))
## [1] "Asian"    "Black"    "Hispanic" "Other"    "White"
levels(as.factor(HR_dataset$MaritalDesc))
## [1] "Divorced" "Married"  "Single"   "Widowed"
levels(as.factor(HR_dataset$Performance.Score))
## [1] "Exceeds"           "Fully Meets"       "Needs Improvement"
## [4] "PIP"
levels(as.factor(HR_dataset$Training.Program.Name))
## [1] "Communication Skills"   "Customer Service"       "Leadership Development"
## [4] "Project Management"     "Technical Skills"
levels(as.factor(HR_dataset$Training.Type))
## [1] "External" "Internal"

Only 2 levels, so Training.Type can be binary coded.

levels(as.factor(HR_dataset$Training.Outcome))
## [1] "Completed"  "Failed"     "Incomplete" "Passed"
HR_dataset$EmployeeStatus <- as.factor(HR_dataset$EmployeeStatus)
HR_dataset$GenderCode <- as.factor(HR_dataset$GenderCode)
HR_dataset$Training.Type <- as.factor(HR_dataset$Training.Type)
HR_dataset$BusinessUnit <- as.factor(HR_dataset$BusinessUnit)
HR_dataset$EmployeeType <- as.factor(HR_dataset$EmployeeType)
HR_dataset$PayZone <- as.factor(HR_dataset$PayZone)
HR_dataset$EmployeeClassificationType <- as.factor(HR_dataset$EmployeeClassificationType)
HR_dataset$DepartmentType <- as.factor(HR_dataset$DepartmentType)
HR_dataset$Division <- as.factor(HR_dataset$Division)
HR_dataset$RaceDesc <- as.factor(HR_dataset$RaceDesc)
HR_dataset$MaritalDesc <- as.factor(HR_dataset$MaritalDesc)
HR_dataset$Performance.Score <- as.factor(HR_dataset$Performance.Score)
HR_dataset$Training.Program.Name <- as.factor(HR_dataset$Training.Program.Name)
HR_dataset$Training.Outcome <- as.factor(HR_dataset$Training.Outcome)

Fixing the date variables

library(lubridate)

# Convert StartDate with month as characters (e.g., "06-Aug-2018")
HR_dataset$StartDate <- dmy(HR_dataset$StartDate)

# Convert DOB and Survey.Date and Training.Date
HR_dataset$DOB <- dmy(HR_dataset$DOB)
HR_dataset$Survey.Date <- dmy(HR_dataset$Survey.Date)
HR_dataset$Training.Date <- dmy(HR_dataset$Training.Date)
str(HR_dataset[, c("StartDate", "DOB", "Survey.Date", "Training.Date")])
## 'data.frame':    2845 obs. of  4 variables:
##  $ StartDate    : Date, format: "2019-09-20" "2023-02-11" ...
##  $ DOB          : Date, format: "1969-10-07" "1965-08-30" ...
##  $ Survey.Date  : Date, format: "2023-01-14" "2022-09-09" ...
##  $ Training.Date: Date, format: "2023-07-15" "2022-09-12" ...
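As a sanity check (a sketch; small offsets are expected since the exact reference date used when the file was built is unknown), the Age column can be compared against an age recomputed from DOB:

# Sketch: recompute age from DOB (lubridate is loaded above) and
# compare with the provided Age column
age_from_dob <- floor(interval(HR_dataset$DOB, Sys.Date()) / years(1))
summary(age_from_dob - HR_dataset$Age)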
str(HR_dataset)
## 'data.frame':    2845 obs. of  28 variables:
##  $ Employee.ID               : int  3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 ...
##  $ StartDate                 : Date, format: "2019-09-20" "2023-02-11" ...
##  $ Title                     : chr  "Production Technician I" "Production Technician I" "Area Sales Manager" "Area Sales Manager" ...
##  $ BusinessUnit              : Factor w/ 10 levels "BPC","CCDR","EW",..: 2 3 6 2 9 1 10 2 5 1 ...
##  $ EmployeeStatus            : Factor w/ 2 levels "Active","Terminated": 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeType              : Factor w/ 3 levels "Contract","Full-Time",..: 1 1 2 1 1 1 2 1 1 3 ...
##  $ PayZone                   : Factor w/ 3 levels "Zone A","Zone B",..: 3 1 2 1 1 2 2 3 2 2 ...
##  $ EmployeeClassificationType: Factor w/ 3 levels "Full-Time","Part-Time",..: 3 2 2 1 3 1 3 1 2 3 ...
##  $ DepartmentType            : Factor w/ 6 levels "Admin Offices",..: 4 4 5 5 5 5 5 5 5 5 ...
##  $ Division                  : Factor w/ 25 levels "Aerial","Billable Consultants",..: 9 1 12 9 10 7 11 5 6 5 ...
##  $ DOB                       : Date, format: "1969-10-07" "1965-08-30" ...
##  $ State                     : chr  "MA" "MA" "MA" "ND" ...
##  $ GenderCode                : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 1 1 2 2 ...
##  $ RaceDesc                  : Factor w/ 5 levels "Asian","Black",..: 5 3 3 4 4 2 3 5 2 1 ...
##  $ MaritalDesc               : Factor w/ 4 levels "Divorced","Married",..: 4 4 4 3 2 2 1 1 4 4 ...
##  $ Performance.Score         : Factor w/ 4 levels "Exceeds","Fully Meets",..: 2 2 2 2 2 2 1 2 1 2 ...
##  $ Current.Employee.Rating   : int  4 3 4 2 3 3 4 2 3 5 ...
##  $ Survey.Date               : Date, format: "2023-01-14" "2022-09-09" ...
##  $ Engagement.Score          : int  1 2 1 5 2 2 1 1 4 5 ...
##  $ Satisfaction.Score        : int  2 1 2 5 5 3 5 4 3 5 ...
##  $ Work.Life.Balance.Score   : int  3 5 1 4 3 5 2 2 3 2 ...
##  $ Training.Date             : Date, format: "2023-07-15" "2022-09-12" ...
##  $ Training.Program.Name     : Factor w/ 5 levels "Communication Skills",..: 3 2 3 4 5 4 2 3 2 3 ...
##  $ Training.Type             : Factor w/ 2 levels "External","Internal": 2 1 1 1 1 2 1 2 2 2 ...
##  $ Training.Outcome          : Factor w/ 4 levels "Completed","Failed",..: 2 3 2 1 2 4 4 4 4 2 ...
##  $ Training.Duration.Days.   : int  2 4 2 3 5 2 5 3 3 5 ...
##  $ Training.Cost             : num  606 673 413 664 399 ...
##  $ Age                       : int  50 58 27 23 50 71 80 63 44 73 ...

The factor levels don’t necessarily have to be recoded from 1/2 to 0/1, since R still treats them as categories internally, but for logistic regression they will need to be dummy coded.
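R’s default treatment contrasts show exactly how a factor will be dummy coded inside a model; for example (a sketch, using the already-factored EmployeeType):

# Sketch: k factor levels become k-1 indicator columns in a model matrix;
# the first level (Contract) serves as the baseline
contrasts(HR_dataset$EmployeeType)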

State doesn’t need to be converted to a factor, and it would more than likely be dropped if it could skew the analysis, but it could be factored to create visuals.

# Select only numeric columns
numeric_columns <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_columns]

# Compute correlation matrix
corr_matrix <- cor(HR_numeric, use = "complete.obs")

# Print correlation matrix
print(corr_matrix)
##                          Employee.ID Current.Employee.Rating Engagement.Score
## Employee.ID              1.000000000             -0.01647101      0.014512208
## Current.Employee.Rating -0.016471015              1.00000000      0.022209785
## Engagement.Score         0.014512208              0.02220979      1.000000000
## Satisfaction.Score       0.018004462             -0.02461340     -0.004391166
## Work.Life.Balance.Score -0.020638867              0.02815133      0.025596264
## Training.Duration.Days. -0.003981504              0.00590126      0.002878806
## Training.Cost           -0.019489386              0.01075746      0.020968191
## Age                      0.004796900             -0.02011502     -0.033426978
##                         Satisfaction.Score Work.Life.Balance.Score
## Employee.ID                   0.0180044621            -0.020638867
## Current.Employee.Rating      -0.0246134010             0.028151331
## Engagement.Score             -0.0043911665             0.025596264
## Satisfaction.Score            1.0000000000            -0.032766022
## Work.Life.Balance.Score      -0.0327660215             1.000000000
## Training.Duration.Days.       0.0242579862             0.003022691
## Training.Cost                 0.0006648727             0.008012574
## Age                          -0.0063905240            -0.011810926
##                         Training.Duration.Days. Training.Cost          Age
## Employee.ID                        -0.003981504 -0.0194893862  0.004796900
## Current.Employee.Rating             0.005901260  0.0107574563 -0.020115019
## Engagement.Score                    0.002878806  0.0209681914 -0.033426978
## Satisfaction.Score                  0.024257986  0.0006648727 -0.006390524
## Work.Life.Balance.Score             0.003022691  0.0080125740 -0.011810926
## Training.Duration.Days.             1.000000000 -0.0158420517  0.003657019
## Training.Cost                      -0.015842052  1.0000000000 -0.001902620
## Age                                 0.003657019 -0.0019026200  1.000000000
# Select only numeric columns, excluding Employee ID
numeric_columns <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_columns]

# Remove Employee ID column if present
HR_numeric <- HR_numeric[, !colnames(HR_numeric) %in% c("Employee.ID")]

# Compute correlation matrix
corr_matrix <- cor(HR_numeric, use = "complete.obs")

# Print correlation matrix
print(corr_matrix)
##                         Current.Employee.Rating Engagement.Score
## Current.Employee.Rating              1.00000000      0.022209785
## Engagement.Score                     0.02220979      1.000000000
## Satisfaction.Score                  -0.02461340     -0.004391166
## Work.Life.Balance.Score              0.02815133      0.025596264
## Training.Duration.Days.              0.00590126      0.002878806
## Training.Cost                        0.01075746      0.020968191
## Age                                 -0.02011502     -0.033426978
##                         Satisfaction.Score Work.Life.Balance.Score
## Current.Employee.Rating      -0.0246134010             0.028151331
## Engagement.Score             -0.0043911665             0.025596264
## Satisfaction.Score            1.0000000000            -0.032766022
## Work.Life.Balance.Score      -0.0327660215             1.000000000
## Training.Duration.Days.       0.0242579862             0.003022691
## Training.Cost                 0.0006648727             0.008012574
## Age                          -0.0063905240            -0.011810926
##                         Training.Duration.Days. Training.Cost          Age
## Current.Employee.Rating             0.005901260  0.0107574563 -0.020115019
## Engagement.Score                    0.002878806  0.0209681914 -0.033426978
## Satisfaction.Score                  0.024257986  0.0006648727 -0.006390524
## Work.Life.Balance.Score             0.003022691  0.0080125740 -0.011810926
## Training.Duration.Days.             1.000000000 -0.0158420517  0.003657019
## Training.Cost                      -0.015842052  1.0000000000 -0.001902620
## Age                                 0.003657019 -0.0019026200  1.000000000

Redo this next, adding the three new binary variables (gender, employee status, and training type). Note that because the dummy-coding step further below is commented out, the second correlation matrix comes out identical to the first.

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
ggcorrplot(corr_matrix, lab = TRUE, lab_size = 3, colors = c("blue", "white", "red"))

Initial analysis of only the numerical variables shows that nothing correlates (every pairwise |r| is below 0.04), so no numeric variable has a meaningful linear relationship with any other.
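A quick programmatic confirmation (sketch) of that reading:

# Sketch: largest absolute off-diagonal correlation in the matrix above;
# every pairwise value is below 0.04 in magnitude
max(abs(corr_matrix[upper.tri(corr_matrix)]))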

#HR_dataset$GenderNumeric <- ifelse(HR_dataset$GenderCode == "Male", 1, 0)
#HR_dataset$EmployeeStatusNumeric <- ifelse(HR_dataset$EmployeeStatus == "Active", 1, 0)
#HR_dataset$TrainingTypeNumeric <- ifelse(HR_dataset$Training.Type == "Internal", 1, 0)  # levels are External/Internal

# Convert and bind new dummy variables to dataset
# dummy_vars <- model.matrix(~ BusinessUnit + EmployeeType + PayZone +
#                              EmployeeClassificationType + DepartmentType +
#                              Division + RaceDesc + MaritalDesc +
#                              Performance.Score + Training.Program.Name +
#                              Training.Outcome, data = HR_dataset)[, -1]  # Remove intercept

# Combine with original dataset
# HR_dataset <- cbind(HR_dataset, dummy_vars)
# Select only numeric columns, excluding Employee ID
numeric_columns_2 <- sapply(HR_dataset, is.numeric)
HR_numeric_2 <- HR_dataset[, numeric_columns_2]

# Remove Employee ID column if present
HR_numeric_2 <- HR_numeric_2[, !colnames(HR_numeric_2) %in% c("Employee.ID")]

# Compute correlation matrix
corr_matrix_2 <- cor(HR_numeric_2, use = "complete.obs")

# Print correlation matrix
print(corr_matrix_2)
##                         Current.Employee.Rating Engagement.Score
## Current.Employee.Rating              1.00000000      0.022209785
## Engagement.Score                     0.02220979      1.000000000
## Satisfaction.Score                  -0.02461340     -0.004391166
## Work.Life.Balance.Score              0.02815133      0.025596264
## Training.Duration.Days.              0.00590126      0.002878806
## Training.Cost                        0.01075746      0.020968191
## Age                                 -0.02011502     -0.033426978
##                         Satisfaction.Score Work.Life.Balance.Score
## Current.Employee.Rating      -0.0246134010             0.028151331
## Engagement.Score             -0.0043911665             0.025596264
## Satisfaction.Score            1.0000000000            -0.032766022
## Work.Life.Balance.Score      -0.0327660215             1.000000000
## Training.Duration.Days.       0.0242579862             0.003022691
## Training.Cost                 0.0006648727             0.008012574
## Age                          -0.0063905240            -0.011810926
##                         Training.Duration.Days. Training.Cost          Age
## Current.Employee.Rating             0.005901260  0.0107574563 -0.020115019
## Engagement.Score                    0.002878806  0.0209681914 -0.033426978
## Satisfaction.Score                  0.024257986  0.0006648727 -0.006390524
## Work.Life.Balance.Score             0.003022691  0.0080125740 -0.011810926
## Training.Duration.Days.             1.000000000 -0.0158420517  0.003657019
## Training.Cost                      -0.015842052  1.0000000000 -0.001902620
## Age                                 0.003657019 -0.0019026200  1.000000000

Bar graphs/histograms based on the integer/numeric variables

ggplot(HR_dataset, aes(x = Age)) + geom_histogram(binwidth = 5, fill = "blue", color = "black")

ggplot(HR_dataset, aes(x = Engagement.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")

ggplot(HR_dataset, aes(x = Satisfaction.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")

ggplot(HR_dataset, aes(x = Work.Life.Balance.Score)) + geom_histogram(binwidth = 1, fill = "blue", color = "black")

ggplot(HR_dataset, aes(x = BusinessUnit)) +
  geom_bar(fill = "forestgreen") +
  geom_text(stat = "count", aes(label = after_stat(count)), hjust = -0.1) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Employee Count by Business Unit", x = "Business Unit", y = "Count")

ggplot(HR_dataset, aes(x = Division)) +
  geom_bar(fill = "steelblue") +
  geom_text(stat = "count", aes(label = after_stat(count)), hjust = -0.2) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Employee Count by Division", x = "Division", y = "Count")

ggplot(HR_dataset, aes(x = DepartmentType)) +
  geom_bar(fill = "darkorange") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Employee Count by Department Type", x = "Department Type", y = "Count")

#install.packages("rcompanion")
library(tidyverse)
library(corrplot)
library(GGally)
#library(rcompanion)
library(vcd)

# ---- NUMERIC-ONLY CORRELATION MATRIX ----
numeric_cols <- sapply(HR_dataset, is.numeric)
HR_numeric <- HR_dataset[, numeric_cols]

# Remove ID column if present
HR_numeric <- HR_numeric[, !colnames(HR_numeric) %in% c("Employee.ID")]

# Compute and plot correlation
cor_matrix <- cor(HR_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.cex = 0.8, tl.col = "black", addCoef.col = "black", 
         title = "Correlation Matrix for Numeric Variables", mar=c(0,0,1,0))

# ---- CATEGORICAL VS NUMERIC (ANOVA & Boxplots) ----
# Example: Gender vs Satisfaction.Score
ggplot(HR_dataset, aes(x = GenderCode, y = Satisfaction.Score)) +
  geom_boxplot(fill = "skyblue") +
  theme_minimal() +
  labs(title = "Boxplot: Satisfaction by Gender")

# ANOVA example
anova_gender <- aov(Satisfaction.Score ~ GenderCode, data = HR_dataset)
summary(anova_gender)
##               Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode     1      1   1.134    0.57   0.45
## Residuals   2843   5654   1.989
# ---- CATEGORICAL VS CATEGORICAL (Chi-square and Cramér’s V) ----
# Example: Gender vs Training Outcome
ctab <- table(HR_dataset$GenderCode, HR_dataset$Training.Outcome)
chisq_result <- chisq.test(ctab)
#cramerv_result <- cramerV(ctab)
result <- assocstats(ctab)

print("Chi-Square Test: Gender vs Training Outcome")
## [1] "Chi-Square Test: Gender vs Training Outcome"
print(chisq_result)
## 
##  Pearson's Chi-squared test
## 
## data:  ctab
## X-squared = 6.0666, df = 3, p-value = 0.1084
# Extract and print Cramér’s V specifically
cramer_v <- result$cramer
print(paste("Cramér’s V:", round(cramer_v, 3)))
## [1] "Cramér’s V: 0.046"
# Mosaic plot
mosaicplot(ctab, main="Mosaic: Gender vs Training Outcome", color=TRUE)

# ---- MIXED VARIABLE OVERVIEW (GGally ggpairs) ----
HR_subset <- HR_dataset %>% 
  select(Engagement.Score, Satisfaction.Score, GenderCode, Training.Outcome, Performance.Score)

GGally::ggpairs(HR_subset)

  1. Correlation Matrix for all numeric variables.
  2. Boxplot + ANOVA between a factor (e.g., Gender) and a score.
  3. Chi-square + Cramér’s V for relationship strength between two categorical variables (in the output above, the ANOVA table appears first, then the chi-square test, then Cramér’s V).
  4. Mosaic plot for visualizing categorical distributions.
  5. GGally ggpairs: A matrix that shows pairwise relationships across mixed variable types.

The analysis below demonstrates which variables contribute most heavily to the analysis, so that less informative ones can be dropped to lower overall computation costs.

cat_vars <- c("GenderCode", "EmployeeStatus", "Training.Type", "BusinessUnit",
              "EmployeeType", "PayZone", "EmployeeClassificationType")

num_outcomes <- c("Engagement.Score", "Satisfaction.Score", 
                  "Work.Life.Balance.Score", "Current.Employee.Rating")

for (cat in cat_vars) {
  for (num in num_outcomes) {
    formula <- as.formula(paste(num, "~", cat))
    model <- aov(formula, data = HR_dataset)
    print(paste("ANOVA:", num, "vs", cat))
    print(summary(model))
  }
}
## [1] "ANOVA: Engagement.Score vs GenderCode"
##               Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode     1      1  0.6493   0.315  0.575
## Residuals   2843   5858  2.0604               
## [1] "ANOVA: Satisfaction.Score vs GenderCode"
##               Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode     1      1   1.134    0.57   0.45
## Residuals   2843   5654   1.989               
## [1] "ANOVA: Work.Life.Balance.Score vs GenderCode"
##               Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode     1      1  0.6468   0.326  0.568
## Residuals   2843   5644  1.9852               
## [1] "ANOVA: Current.Employee.Rating vs GenderCode"
##               Df Sum Sq Mean Sq F value Pr(>F)
## GenderCode     1    0.6  0.6399   0.624   0.43
## Residuals   2843 2915.5  1.0255               
## [1] "ANOVA: Engagement.Score vs EmployeeStatus"
##                  Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus    1      1  0.8063   0.391  0.532
## Residuals      2843   5858  2.0603               
## [1] "ANOVA: Satisfaction.Score vs EmployeeStatus"
##                  Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus    1      4   3.660   1.841  0.175
## Residuals      2843   5651   1.988               
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeStatus"
##                  Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeStatus    1      4   3.830    1.93  0.165
## Residuals      2843   5641   1.984               
## [1] "ANOVA: Current.Employee.Rating vs EmployeeStatus"
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## EmployeeStatus    1    4.3   4.272   4.171 0.0412 *
## Residuals      2843 2911.9   1.024                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Engagement.Score vs Training.Type"
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type    1      2   1.714   0.832  0.362
## Residuals     2843   5857   2.060               
## [1] "ANOVA: Satisfaction.Score vs Training.Type"
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type    1      0  0.2212   0.111  0.739
## Residuals     2843   5654  1.9889               
## [1] "ANOVA: Work.Life.Balance.Score vs Training.Type"
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type    1      0  0.1854   0.093   0.76
## Residuals     2843   5644  1.9854               
## [1] "ANOVA: Current.Employee.Rating vs Training.Type"
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Training.Type    1    1.9   1.921   1.874  0.171
## Residuals     2843 2914.3   1.025               
## [1] "ANOVA: Engagement.Score vs BusinessUnit"
##                Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit    9     12   1.291   0.626  0.776
## Residuals    2835   5847   2.062               
## [1] "ANOVA: Satisfaction.Score vs BusinessUnit"
##                Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit    9     16   1.748   0.879  0.543
## Residuals    2835   5639   1.989               
## [1] "ANOVA: Work.Life.Balance.Score vs BusinessUnit"
##                Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit    9     25   2.795    1.41  0.178
## Residuals    2835   5620   1.982               
## [1] "ANOVA: Current.Employee.Rating vs BusinessUnit"
##                Df Sum Sq Mean Sq F value Pr(>F)
## BusinessUnit    9    5.4  0.5956    0.58  0.815
## Residuals    2835 2910.8  1.0267               
## [1] "ANOVA: Engagement.Score vs EmployeeType"
##                Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType    2      1  0.3973   0.193  0.825
## Residuals    2842   5858  2.0611               
## [1] "ANOVA: Satisfaction.Score vs EmployeeType"
##                Df Sum Sq Mean Sq F value Pr(>F)  
## EmployeeType    2     11   5.272   2.655 0.0705 .
## Residuals    2842   5644   1.986                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeType"
##                Df Sum Sq Mean Sq F value Pr(>F)  
## EmployeeType    2     10   5.175   2.611 0.0737 .
## Residuals    2842   5634   1.983                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Current.Employee.Rating vs EmployeeType"
##                Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeType    2    1.2  0.5836   0.569  0.566
## Residuals    2842 2915.0  1.0257               
## [1] "ANOVA: Engagement.Score vs PayZone"
##               Df Sum Sq Mean Sq F value Pr(>F)
## PayZone        2      1  0.5468   0.265  0.767
## Residuals   2842   5857  2.0610               
## [1] "ANOVA: Satisfaction.Score vs PayZone"
##               Df Sum Sq Mean Sq F value Pr(>F)
## PayZone        2      2  0.9423   0.474  0.623
## Residuals   2842   5653  1.9890               
## [1] "ANOVA: Work.Life.Balance.Score vs PayZone"
##               Df Sum Sq Mean Sq F value Pr(>F)
## PayZone        2      0  0.1922   0.097  0.908
## Residuals   2842   5644  1.9860               
## [1] "ANOVA: Current.Employee.Rating vs PayZone"
##               Df Sum Sq Mean Sq F value Pr(>F)
## PayZone        2    1.7  0.8408    0.82  0.441
## Residuals   2842 2914.5  1.0255               
## [1] "ANOVA: Engagement.Score vs EmployeeClassificationType"
##                              Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType    2      5   2.584   1.255  0.285
## Residuals                  2842   5853   2.059               
## [1] "ANOVA: Satisfaction.Score vs EmployeeClassificationType"
##                              Df Sum Sq Mean Sq F value Pr(>F)  
## EmployeeClassificationType    2     16   7.890   3.977 0.0189 *
## Residuals                  2842   5639   1.984                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "ANOVA: Work.Life.Balance.Score vs EmployeeClassificationType"
##                              Df Sum Sq Mean Sq F value Pr(>F)
## EmployeeClassificationType    2      0  0.0752   0.038  0.963
## Residuals                  2842   5645  1.9861               
## [1] "ANOVA: Current.Employee.Rating vs EmployeeClassificationType"
##                              Df Sum Sq Mean Sq F value Pr(>F)  
## EmployeeClassificationType    2    5.8   2.918    2.85  0.058 .
## Residuals                  2842 2910.3   1.024                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overview: What This Analysis Does

The ANOVA tests whether mean scores for engagement, satisfaction, etc. differ significantly between groups (e.g., Male vs. Female, Full-Time vs. Contract). A p-value < 0.05 indicates a statistically significant difference.
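Rather than scanning the printed summaries by hand, the same loop can collect every p-value into one tidy data frame (a sketch reusing cat_vars and num_outcomes from above):

# Sketch: gather each ANOVA p-value into a single sortable data frame
anova_results <- data.frame()
for (cat in cat_vars) {
  for (num in num_outcomes) {
    model <- aov(as.formula(paste(num, "~", cat)), data = HR_dataset)
    p_val <- summary(model)[[1]][["Pr(>F)"]][1]
    anova_results <- rbind(anova_results,
                           data.frame(predictor = cat, outcome = num,
                                      p_value = round(p_val, 4)))
  }
}
anova_results[order(anova_results$p_value), ]  # significant rows float to the top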

Significant or marginal values:

| Variable | Significant on | p-value | Inference |
|---|---|---|---|
| EmployeeStatus | Current.Employee.Rating | 0.041 | There is a significant difference in performance ratings between Active and Terminated employees. Likely, terminated employees had lower ratings. |
| EmployeeType | Satisfaction.Score | 0.070 | Marginally significant — different employment types (e.g., full-time, contract) may experience different satisfaction levels. |
| EmployeeType | Work.Life.Balance.Score | 0.074 | Marginal — employee type may slightly influence perceived work-life balance. |
| EmployeeClassificationType | Satisfaction.Score | 0.019 | Employees with different contract durations (e.g., long-term vs. short-term) report significantly different satisfaction levels. |
| EmployeeClassificationType | Current.Employee.Rating | 0.058 | Marginal — classification type may influence performance perception. |

No Significant Differences (p > 0.05):

Gender: No significant effect on engagement, satisfaction, work-life balance, or performance.

Training.Type: Internal vs. external training had no effect on any measured outcomes.

BusinessUnit: No notable differences in scores across business units.

PayZone: Salary band had no effect on engagement, satisfaction, etc.

Engagement Score: Did not vary significantly across any factor tested.

Overall Inference

Performance Ratings do appear to be influenced by employment status and potentially by contract classifications — suggesting that employees with more stable or longer-term roles are rated higher.

Satisfaction Scores are more sensitive to contract structure (EmployeeClassificationType) and possibly employment type (e.g., full-time vs. contract).

Gender, Training Type, Pay Zone, and Business Unit do not significantly impact engagement, satisfaction, or performance — suggesting relative equality across these groups in how employees experience work.

Strategic Implications

Retention strategies should consider how contract types affect satisfaction and performance.

Training programs may not need to differ based on Training.Type alone — other factors may be more influential.

Focus performance management and engagement efforts more on structural or role-based factors than on demographics like gender or department.

library(vcd)

cat_pairs <- list(
  c("GenderCode", "Training.Outcome"),
  c("EmployeeStatus", "Training.Outcome"),
  c("BusinessUnit", "Training.Outcome"),
  c("EmployeeType", "Training.Outcome"),
  c("EmployeeStatus", "Performance.Score"),
  c("MaritalDesc", "EmployeeStatus"),
  c("RaceDesc", "Training.Outcome")
)

for (pair in cat_pairs) {
  var1 <- pair[1]
  var2 <- pair[2]
  tab <- table(HR_dataset[[var1]], HR_dataset[[var2]])
  cat("\nChi-square:", var1, "vs", var2, "\n")
  print(chisq.test(tab))
  print(paste("Cramér’s V:", round(assocstats(tab)$cramer, 3)))
}
## 
## Chi-square: GenderCode vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.0666, df = 3, p-value = 0.1084
## 
## [1] "Cramér’s V: 0.046"
## 
## Chi-square: EmployeeStatus vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.464, df = 3, p-value = 0.09109
## 
## [1] "Cramér’s V: 0.048"
## 
## Chi-square: BusinessUnit vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 31.702, df = 27, p-value = 0.2433
## 
## [1] "Cramér’s V: 0.061"
## 
## Chi-square: EmployeeType vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 12.464, df = 6, p-value = 0.05238
## 
## [1] "Cramér’s V: 0.047"
## 
## Chi-square: EmployeeStatus vs Performance.Score 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 5.4042, df = 3, p-value = 0.1445
## 
## [1] "Cramér’s V: 0.044"
## 
## Chi-square: MaritalDesc vs EmployeeStatus 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 1.2508, df = 3, p-value = 0.7408
## 
## [1] "Cramér’s V: 0.021"
## 
## Chi-square: RaceDesc vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 15.153, df = 12, p-value = 0.2332
## 
## [1] "Cramér’s V: 0.042"

Here’s a summary and interpretation of the Chi-square test and Cramér’s V results, which explore associations between categorical variables like gender, employment status, and training outcomes.

Overview: What This Analysis Does

Chi-square tests evaluate whether two categorical variables are statistically associated (i.e., not independent).

Cramér’s V measures the strength of that association:

< 0.10 → very weak

0.10–0.30 → weak

> 0.30 → moderate or strong
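For reference, Cramér’s V can also be computed directly from the chi-square statistic (a sketch that matches what assocstats() reports):

# Sketch: V = sqrt(X^2 / (n * (min(rows, cols) - 1)))
cramers_v <- function(tab) {
  chi2 <- suppressWarnings(chisq.test(tab, correct = FALSE)$statistic)
  sqrt(as.numeric(chi2) / (sum(tab) * (min(dim(tab)) - 1)))
}
cramers_v(table(HR_dataset$GenderCode, HR_dataset$Training.Outcome))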

Key Results and Interpretation:

| Variable pair | p-value | Cramér’s V | Interpretation |
|---|---|---|---|
| Gender vs Training Outcome | 0.108 | 0.046 | No statistically significant association. Gender has a very weak relationship with training results. |
| Employee Status vs Training Outcome | 0.091 | 0.048 | Borderline non-significant, with a very weak effect. Terminated/active employees may slightly differ in training outcomes. |
| Business Unit vs Training Outcome | 0.243 | 0.061 | No significant difference across business units in training outcomes. Weak association. |
| Employee Type vs Training Outcome | 0.052 | 0.047 | Borderline significant. Training outcomes may slightly differ by employment type (e.g., full-time, contract), but the effect is very weak. |
| Employee Status vs Performance Score | 0.145 | 0.044 | No statistically significant association. Performance scores are largely independent of employment status (though see the earlier ANOVA, where the rating differed). |
| Marital Status vs Employee Status | 0.741 | 0.021 | No relationship. Marital status does not appear to influence employment status. |
| Race vs Training Outcome | 0.233 | 0.042 | No significant difference in training outcomes by race. The relationship is very weak. |


This next pass uses all of the categorical variables and compares each one against the others.

cat_pairs_2 <- list(
  c("GenderCode", "Training.Outcome"),
  c("GenderCode", "EmployeeStatus"),
  c("EmployeeStatus", "Training.Outcome"),
  c("EmployeeStatus", "Performance.Score"),
  c("EmployeeType", "Training.Outcome"),
  c("EmployeeType", "EmployeeStatus"),
  c("EmployeeClassificationType", "Training.Outcome"),
  c("EmployeeClassificationType", "Performance.Score"),
  c("DepartmentType", "Training.Outcome"),
  c("Division", "Training.Outcome"),
  c("PayZone", "EmployeeStatus"),
  c("RaceDesc", "Training.Outcome"),
  c("RaceDesc", "Performance.Score"),
  c("MaritalDesc", "EmployeeStatus"),
  c("Training.Type", "Training.Outcome")
)

for (pair in cat_pairs_2) {
  var1 <- pair[1]
  var2 <- pair[2]
  tab <- table(HR_dataset[[var1]], HR_dataset[[var2]])
  cat("\nChi-square:", var1, "vs", var2, "\n")
  print(chisq.test(tab))
  print(paste("Cramér’s V:", round(assocstats(tab)$cramer, 3)))
}
## 
## Chi-square: GenderCode vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.0666, df = 3, p-value = 0.1084
## 
## [1] "Cramér’s V: 0.046"
## 
## Chi-square: GenderCode vs EmployeeStatus 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 3.7087, df = 1, p-value = 0.05413
## 
## [1] "Cramér’s V: 0.037"
## 
## Chi-square: EmployeeStatus vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 6.464, df = 3, p-value = 0.09109
## 
## [1] "Cramér’s V: 0.048"
## 
## Chi-square: EmployeeStatus vs Performance.Score 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 5.4042, df = 3, p-value = 0.1445
## 
## [1] "Cramér’s V: 0.044"
## 
## Chi-square: EmployeeType vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 12.464, df = 6, p-value = 0.05238
## 
## [1] "Cramér’s V: 0.047"
## 
## Chi-square: EmployeeType vs EmployeeStatus 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 1.7434, df = 2, p-value = 0.4182
## 
## [1] "Cramér’s V: 0.025"
## 
## Chi-square: EmployeeClassificationType vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 5.5969, df = 6, p-value = 0.4698
## 
## [1] "Cramér’s V: 0.031"
## 
## Chi-square: EmployeeClassificationType vs Performance.Score 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 1.0415, df = 6, p-value = 0.984
## 
## [1] "Cramér’s V: 0.014"
## 
## Chi-square: DepartmentType vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 18.503, df = 15, p-value = 0.2371
## 
## [1] "Cramér’s V: 0.047"
## 
## Chi-square: Division vs Training.Outcome
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 64.232, df = 72, p-value = 0.7311
## 
## [1] "Cramér’s V: 0.087"
## 
## Chi-square: PayZone vs EmployeeStatus 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 2.7739, df = 2, p-value = 0.2498
## 
## [1] "Cramér’s V: 0.031"
## 
## Chi-square: RaceDesc vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 15.153, df = 12, p-value = 0.2332
## 
## [1] "Cramér’s V: 0.042"
## 
## Chi-square: RaceDesc vs Performance.Score 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 17.44, df = 12, p-value = 0.1338
## 
## [1] "Cramér’s V: 0.045"
## 
## Chi-square: MaritalDesc vs EmployeeStatus 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 1.2508, df = 3, p-value = 0.7408
## 
## [1] "Cramér’s V: 0.021"
## 
## Chi-square: Training.Type vs Training.Outcome 
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 5.0166, df = 3, p-value = 0.1706
## 
## [1] "Cramér’s V: 0.042"

Key Results

Marginally Significant Results (p close to 0.05)

| Variable pair | p-value | Cramér’s V | Interpretation |
|---|---|---|---|
| Gender vs EmployeeStatus | 0.054 | 0.037 | Just above the significance threshold. Gender may slightly affect employment status, but the effect size is very weak. |
| EmployeeType vs Training.Outcome | 0.052 | 0.047 | Borderline significant. Slight variation in training outcome by employment type (e.g., full-time vs. contract). |
| EmployeeStatus vs Training.Outcome | 0.091 | 0.048 | Approaches significance. Terminated vs. active employees may differ slightly in training outcomes. |

All of these have very low Cramér’s V, suggesting that even if statistical significance existed, the practical effect is minimal.

Non-Significant Results (p > 0.05)

All remaining pairs show no statistically significant relationship. Here are examples with typical effect sizes:

| Variable pair | p-value | Cramér’s V | Interpretation |
|---|---|---|---|
| Gender vs Training.Outcome | 0.108 | 0.046 | No significant difference in training outcomes between genders. |
| EmployeeStatus vs Performance.Score | 0.145 | 0.044 | Performance ratings do not vary significantly with employment status (though the ANOVA showed a small effect). |
| MaritalDesc vs EmployeeStatus | 0.741 | 0.021 | No relationship at all. |
| RaceDesc vs Training.Outcome | 0.233 | 0.042 | No relationship. |
| RaceDesc vs Performance.Score | 0.134 | 0.045 | No relationship. |
| DepartmentType vs Training.Outcome | 0.237 | 0.047 | No difference in training outcome by department. |
| Division vs Training.Outcome | 0.731 | 0.087 | No reliable relationship (and a warning that the chi-squared approximation may be incorrect). |
| EmployeeClassificationType vs Performance.Score | 0.984 | 0.014 | Completely independent. |
| Training.Type vs Training.Outcome | 0.171 | 0.042 | No significant variation between internal and external training. |

Across all of these, Cramér’s V is consistently under 0.10, meaning extremely weak associations — regardless of statistical significance.

Overall Inference: No strong or moderate associations were found between any pair of categorical variables.

A few pairs showed borderline significance (e.g., Gender vs EmployeeStatus, EmployeeType vs Training.Outcome), but effect sizes were minimal.

Most relationships — including those involving race, marital status, training type, division, and department — showed no significant differences in outcomes.

Strategic Takeaways: Categorical attributes like gender, race, marital status, and job division do not appear to meaningfully affect training or performance outcomes in this dataset.

Slight differences in outcomes based on employment type or classification suggest contractual status may have small but notable influence — a possible area for deeper HR review.

Training program equity appears strong — outcomes are not significantly influenced by demographic or structural employee categories.

Integrated Summary of HR Data Analysis

1. Employee Demographics and Distribution (Descriptive Insights)

The age distribution of employees is relatively even between 30 and 60 years, peaking around age 55–60.

Younger employees (under 25) and older employees (over 70) are underrepresented.

Business units and departments are diverse, with some divisions clearly employing more staff than others — suggesting varied workforce distribution across company segments.

2. ANOVA Results — Impact of Categorical Variables on Engagement, Satisfaction, Work-Life Balance, and Performance

ANOVA tests revealed very limited statistically significant differences between groups:

Significant or Marginal Findings:

| Variable | Outcome | p-value | Interpretation |
|---|---|---|---|
| EmployeeStatus | Current.Employee.Rating | 0.041 | Terminated vs. active employees have significantly different performance ratings. |
| EmployeeClassificationType | Satisfaction.Score | 0.019 | Different contract types influence employee satisfaction. |
| EmployeeType | Satisfaction & Work-Life | ~0.07 | Marginally significant — different employee types (full-time, contract) may affect work-life experience and satisfaction. |

No Significant Impact: Gender, BusinessUnit, PayZone, and Training.Type did not significantly affect engagement, satisfaction, or performance.

Engagement Score, across all factors, was not significantly different.

Inference: Employee performance and satisfaction are more affected by employment structure than by demographic or departmental characteristics.

3. Chi-Square Tests — Relationships Between Categorical Variables

Chi-square tests were used to detect associations between categories such as training outcomes, employee status, gender, race, and job structure.

Marginal Associations Found:

| Variable pair | p-value | Cramér’s V | Interpretation |
|---|---|---|---|
| Gender vs EmployeeStatus | 0.054 | 0.037 | Very weak potential association. |
| EmployeeType vs Training.Outcome | 0.052 | 0.047 | Slight difference in training outcomes across employment types. |
| EmployeeStatus vs Training.Outcome | 0.091 | 0.048 | Small influence of employment status on training result. |

All Other Pairs: Showed no statistically significant association.

Cramér’s V values were < 0.10 across the board — indicating very weak relationships.

Examples: Race, Marital Status, PayZone, Training Type, and DepartmentType had no meaningful effect on training outcomes, employee status, or performance.

Inference: Categorical employee attributes (e.g., gender, race, department) are not strong drivers of training or employment outcomes in this dataset. Structural factors like job type and status matter more.

Overall Insights & Strategic Implications

The organization shows a fairly equitable environment with respect to gender, race, and department — these variables did not significantly influence training or performance outcomes.

Performance and satisfaction are most affected by employment structure — contract length, employee type, and employment status. This suggests:

Focus on contractual equity and better onboarding/training of short-term or contract employees.

Consider if employee classification impacts perceived support or growth.

Training outcomes appear consistent across demographics, which reflects well on program accessibility and fairness.

Significant Results So Far

From the ANOVA and Chi-square tests, we know:

EmployeeStatus significantly affects Current.Employee.Rating.

EmployeeClassificationType significantly affects Satisfaction.Score.

EmployeeType marginally affects Satisfaction and Work-Life Balance.

Training outcomes are slightly associated with EmployeeStatus and EmployeeType (borderline Chi-square p-values).

library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
# Binary outcome for classification (if not already created)
HR_dataset$EmployeeStatusBin <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)

# Stratified sampling
set.seed(2025)  # for reproducibility
split <- sample.split(HR_dataset$EmployeeStatusBin, SplitRatio = 0.7)

# Create training and testing sets
train_data <- subset(HR_dataset, split == TRUE)
test_data  <- subset(HR_dataset, split == FALSE)
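A quick check (sketch) that the stratified split preserved the outcome balance in both partitions:

# Sketch: termination rate should be nearly identical in train and test,
# since sample.split stratifies on EmployeeStatusBin
prop.table(table(train_data$EmployeeStatusBin))
prop.table(table(test_data$EmployeeStatusBin))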

Recommended Follow-Up Tests / Models

1. Logistic Regression — Predicting Employee Status (Terminated vs. Active)

Use it to model the probability of attrition, incorporating:

Satisfaction.Score

Current.Employee.Rating

EmployeeType

EmployeeClassificationType

Age

Training.Outcome (as a categorical predictor)

logit_model <- glm(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
                   EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                   data = train_data, family = "binomial")
summary(logit_model)
## 
## Call:
## glm(formula = EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating + 
##     EmployeeType + EmployeeClassificationType + Age + Training.Outcome, 
##     family = "binomial", data = train_data)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -2.65479    0.37383  -7.102 1.23e-12 ***
## Satisfaction.Score                   0.08045    0.04695   1.713   0.0866 .  
## Current.Employee.Rating              0.11441    0.06508   1.758   0.0788 .  
## EmployeeTypeFull-Time                0.17907    0.15932   1.124   0.2610    
## EmployeeTypePart-Time                0.06258    0.16767   0.373   0.7090    
## EmployeeClassificationTypePart-Time  0.03878    0.16348   0.237   0.8125    
## EmployeeClassificationTypeTemporary  0.12204    0.16025   0.762   0.4463    
## Age                                 -0.00223    0.00371  -0.601   0.5478    
## Training.OutcomeFailed               0.21350    0.19265   1.108   0.2678    
## Training.OutcomeIncomplete           0.16590    0.19054   0.871   0.3839    
## Training.OutcomePassed               0.33261    0.18346   1.813   0.0698 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1584.5  on 1991  degrees of freedom
## Residual deviance: 1573.5  on 1981  degrees of freedom
## AIC: 1595.5
## 
## Number of Fisher Scoring iterations: 4

Model Purpose

This logistic regression model predicts the probability that an employee is terminated (EmployeeStatusBin = 1) based on: Satisfaction Score, Performance Rating, Employee Type & Classification, Age, and Training Outcome.

Inferences

No predictor was statistically significant at the 0.05 level, but a few are marginally significant (p < 0.10):

Satisfaction.Score (+)

Current.Employee.Rating (+)

Training.OutcomePassed (+)

These marginal effects suggest a potentially weak link between performance/satisfaction/training success and attrition, but the direction of effect is unexpected (e.g., higher satisfaction → higher termination odds), which may indicate:

Data coding issues

Other unmeasured variables influencing termination

Voluntary turnover (resignations) being lumped in with terminations

Variables like Age, EmployeeType, and Training Outcome (Incomplete/Failed) had no significant effect.

# Predict probabilities
logit_probs <- predict(logit_model, newdata = test_data, type = "response")

# Convert to binary prediction (threshold = 0.5)
logit_preds <- ifelse(logit_probs > 0.5, 1, 0)

# Confusion matrix
table(Predicted = logit_preds, Actual = test_data$EmployeeStatusBin)
##          Actual
## Predicted   0   1
##         0 737 116
# Accuracy
logit_accuracy <- mean(logit_preds == test_data$EmployeeStatusBin)
print(paste("Logistic Regression Accuracy:", round(logit_accuracy, 3)))
## [1] "Logistic Regression Accuracy: 0.864"
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Create ROC object
roc_obj <- roc(test_data$EmployeeStatusBin, logit_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Logistic Regression")
abline(a = 0, b = 1, lty = 2, col = "gray")  # reference line (random guess)

# Print AUC value
auc_value <- auc(roc_obj)
print(paste("AUC:", round(auc_value, 3)))
## [1] "AUC: 0.554"

The AUC, which measures how well the model separates terminated from active employees, is 0.554; visually, the ROC curve barely rises above the diagonal. Since 0.5 is no better than random guessing, this logistic regression model discriminates poorly.

For comparison, we next fit a decision tree on the same predictors.

library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Train decision tree
tree_model <- rpart(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
                    EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                    data = train_data, method = "class")

# Visualize tree
rpart.plot(tree_model, type = 2, extra = 104)

# Predict on test data
tree_preds <- predict(tree_model, newdata = test_data, type = "class")

# Confusion matrix
table(Predicted = tree_preds, Actual = test_data$EmployeeStatusBin)
##          Actual
## Predicted   0   1
##         0 737 116
##         1   0   0
# Accuracy
tree_accuracy <- mean(tree_preds == test_data$EmployeeStatusBin)
print(paste("Decision Tree Accuracy:", round(tree_accuracy, 3)))
## [1] "Decision Tree Accuracy: 0.864"
tree_model <- rpart(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
                    EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                    data = train_data,
                    method = "class",
                    control = rpart.control(cp = 0.001, minsplit = 20))

rpart.plot(tree_model, type = 2, extra = 104)

Each node in the tree:

Splits the data based on a predictor variable and threshold

Shows the predicted class (0 = Active, 1 = Terminated)

Displays the proportions of each class and sample size

Root Split: The first and most important split is on Current.Employee.Rating < 3

Employees rated below 3 are more likely to be terminated.

This aligns with expectations — low ratings are a strong predictor of attrition.

Left Subtree (Low rating < 3): Satisfaction.Score < 4, Age < 32, and EmployeeType are also used to refine predictions.

Termination likelihood increases for low satisfaction, younger age, and part-time workers.

Right Subtree (Rating ≥ 3): Splits on Training Outcome, Satisfaction.Score, and Age.

Interestingly, even some employees who passed training and are older (e.g., over 64) are predicted as more likely to leave — may indicate voluntary retirement patterns.

Overall Inferences from Tree Performance rating is the strongest predictor of termination risk.

Low satisfaction scores and younger age contribute to higher attrition risk.

EmployeeType (Part-Time) plays a minor role, especially among low-satisfaction employees.

Training outcome matters: those who failed training, even if they have decent performance, can be at higher risk.

# Example for decision tree
tree_probs <- predict(tree_model, newdata = test_data, type = "prob")[, 2]
roc_tree <- roc(test_data$EmployeeStatusBin, tree_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot together
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curves")
lines(roc_tree, col = "red", lwd = 2)
legend("bottomright", legend = c("Logistic", "Tree"), col = c("blue", "red"), lwd = 2)

# AUC values
auc(roc_obj)
## Area under the curve: 0.5535
auc(roc_tree)
## Area under the curve: 0.5353

This visualization compares the decision tree and logistic regression models by AUC. The values are nearly identical (0.554 vs. 0.535), and both models discriminate poorly.

On the test data:

tree_preds <- predict(tree_model, newdata = test_data, type = "class")
table(Predicted = tree_preds, Actual = test_data$EmployeeStatusBin)
##          Actual
## Predicted   0   1
##         0 722 113
##         1  15   3
mean(tree_preds == test_data$EmployeeStatusBin)
## [1] 0.8499414

Variable importance:

tree_model$variable.importance
##                        Age               EmployeeType 
##                  11.619480                   4.898906 
##         Satisfaction.Score           Training.Outcome 
##                   4.221799                   3.365957 
##    Current.Employee.Rating EmployeeClassificationType 
##                   3.335093                   1.936801

Next, a random forest, to see whether an ensemble improves performance:

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf_model <- randomForest(as.factor(EmployeeStatusBin) ~ Satisfaction.Score + Current.Employee.Rating +
                         EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                         data = train_data, ntree = 500)
print(rf_model)
## 
## Call:
##  randomForest(formula = as.factor(EmployeeStatusBin) ~ Satisfaction.Score +      Current.Employee.Rating + EmployeeType + EmployeeClassificationType +      Age + Training.Outcome, data = train_data, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 13.76%
## Confusion matrix:
##      0 1 class.error
## 0 1718 3 0.001743173
## 1  271 0 1.000000000

Interpretation: The decision tree correctly classified most active employees.

However, it struggled to detect terminated employees (only 3 out of 116 were correctly identified).

Recall for termination (sensitivity) is low → the tree is biased toward the majority class (active).

Interpretation: The random forest achieves high accuracy overall, but it completely fails to detect terminated employees (0 of 271 in the OOB confusion matrix).

This is a sign of class imbalance — the model is overwhelmed by the majority class and does not learn termination patterns effectively.
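
The variable ranking interpreted below was not shown in code; randomForest's built-in importance tools can produce it:

# Mean decrease in Gini for each predictor, plus the standard dot plot
importance(rf_model)
varImpPlot(rf_model, main = "Random Forest Variable Importance")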

Interpretation: Age is the most influential variable in the forest model, suggesting possible patterns related to retirement or early-career turnover.

EmployeeType and Satisfaction.Score also play notable roles.

Performance rating and training outcome matter, but less than expected.

EmployeeClassificationType has the least predictive power.

Overall Inference

Your models (decision tree & random forest) are very good at identifying active employees but fail to detect terminations due to class imbalance.

Important predictors include Age, EmployeeType, and Satisfaction Score, confirming earlier ANOVA and tree results.

Performance and training outcome are weaker predictors than initially expected.
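
Given the imbalance, one common remedy is stratified downsampling within each tree; the following is a sketch using randomForest's strata and sampsize arguments, where 271 is the minority-class count read off the OOB confusion matrix above and rf_balanced is a hypothetical name:

# Hypothetical re-fit: each tree draws 271 actives and 271 terminations
set.seed(2025)
rf_balanced <- randomForest(as.factor(EmployeeStatusBin) ~ Satisfaction.Score + Current.Employee.Rating +
                            EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                            data = train_data, ntree = 500,
                            strata = as.factor(train_data$EmployeeStatusBin),
                            sampsize = c(271, 271))
print(rf_balanced)

This trades some overall accuracy for recall on terminations, which is usually the right trade in attrition work.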

  2. Multiple Linear Regression — Predicting Satisfaction or Rating

Models:

Satisfaction.Score ~ EmployeeClassificationType + EmployeeType + Age + Work.Life.Balance.Score + GenderCode

Current.Employee.Rating ~ EmployeeStatus + Age + Training.Outcome + Performance.Score

satisfaction_model <- lm(Satisfaction.Score ~ EmployeeClassificationType + EmployeeType + 
                         Work.Life.Balance.Score + GenderCode + Age, data = train_data)
summary(satisfaction_model)
## 
## Call:
## lm(formula = Satisfaction.Score ~ EmployeeClassificationType + 
##     EmployeeType + Work.Life.Balance.Score + GenderCode + Age, 
##     data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38996 -1.10782  0.00999  1.09803  2.30095 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          3.3885746  0.1328357  25.510  < 2e-16 ***
## EmployeeClassificationTypePart-Time -0.2153532  0.0777866  -2.769 0.005684 ** 
## EmployeeClassificationTypeTemporary -0.2917540  0.0769515  -3.791 0.000154 ***
## EmployeeTypeFull-Time               -0.0172106  0.0764854  -0.225 0.821988    
## EmployeeTypePart-Time               -0.1479744  0.0788964  -1.876 0.060863 .  
## Work.Life.Balance.Score             -0.0479663  0.0224525  -2.136 0.032774 *  
## GenderCodeMale                       0.0588794  0.0634104   0.929 0.353238    
## Age                                 -0.0004333  0.0017816  -0.243 0.807880    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.407 on 1984 degrees of freedom
## Multiple R-squared:  0.01261,    Adjusted R-squared:  0.009122 
## F-statistic: 3.618 on 7 and 1984 DF,  p-value: 0.0006994

Predictor | Estimate | p-value | Interpretation
EmployeeClassificationType: Part-Time | -0.215 | 0.0057 ** | Significant decrease in satisfaction.
EmployeeClassificationType: Temporary | -0.292 | <0.001 *** | Even greater drop.
EmployeeType: Part-Time | -0.148 | 0.061 † | Marginal effect, may lower satisfaction.
EmployeeType: Full-Time | -0.017 | 0.822 | No effect.
Work.Life.Balance.Score | -0.048 | 0.033 * | Small negative relationship; surprising. Could be multicollinearity or reverse coding.
Gender (Male) | +0.059 | 0.353 | Not significant.
Age | -0.0004 | 0.808 | No meaningful effect.

R² = 0.0126 → Model explains only 1.3% of the variance in Satisfaction Score.

F-test p = 0.0007 → The model is statistically significant, but weak in predictive power.

Contract type matters: Temporary and part-time employees report significantly lower satisfaction.

Work-life balance shows an unexpected negative effect; this may reflect a confounding variable or a reversed scale (a quick check of the coding direction is sketched below).

Demographics (Gender, Age) and Employment Type (full vs part-time) do not meaningfully impact satisfaction.
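
As flagged above, the negative work-life-balance coefficient deserves a coding check; a minimal sketch using the raw scores:

# Raw (unadjusted) association between the two scores
cor(train_data$Work.Life.Balance.Score, train_data$Satisfaction.Score, use = "complete.obs")

# Cross-tab to eyeball whether high balance really pairs with low satisfaction
table(train_data$Work.Life.Balance.Score, train_data$Satisfaction.Score)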

satisfaction_model_2 <- lm(Current.Employee.Rating ~ EmployeeStatus + Age + Training.Outcome + Performance.Score, data = train_data)
summary(satisfaction_model_2)
## 
## Call:
## lm(formula = Current.Employee.Rating ~ EmployeeStatus + Age + 
##     Training.Outcome + Performance.Score, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.11673 -0.89647  0.03568  0.10359  2.14753 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         3.067919   0.098112  31.269   <2e-16 ***
## EmployeeStatusTerminated            0.113303   0.066198   1.712   0.0871 .  
## Age                                -0.001330   0.001276  -1.043   0.2972    
## Training.OutcomeFailed             -0.046067   0.064834  -0.711   0.4775    
## Training.OutcomeIncomplete         -0.059108   0.063575  -0.930   0.3526    
## Training.OutcomePassed             -0.068353   0.062759  -1.089   0.2762    
## Performance.ScoreFully Meets       -0.009976   0.070239  -0.142   0.8871    
## Performance.ScoreNeeds Improvement  0.084732   0.114348   0.741   0.4588    
## Performance.ScorePIP               -0.105794   0.151502  -0.698   0.4851    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 1983 degrees of freedom
## Multiple R-squared:  0.003331,   Adjusted R-squared:  -0.0006898 
## F-statistic: 0.8285 on 8 and 1983 DF,  p-value: 0.5774

None of the predictors significantly influence performance ratings in this model.

Surprisingly, performance scores themselves have no statistical impact on the current employee rating. This might reflect redundancy between variables or poor measurement structure.

Terminated employees having slightly higher ratings could imply voluntary exits, or performance not being the only factor in separation.

Satisfaction Score Model: Has some predictive value, especially for employment classification types.

Suggests contract stability impacts satisfaction more than demographic traits.

Performance Rating Model: Performs poorly in predicting ratings.

Variables chosen do not adequately explain how employees are rated — perhaps other unmeasured internal factors (manager reviews, tenure, etc.) are more relevant.
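
Tenure, at least, is recoverable from StartDate; a hypothetical extension (the %d-%b-%y format matches the str() output, an English locale is assumed for month abbreviations, and TenureDays / rating_model_tenure are illustrative names):

# Hypothetical: derive tenure in days and add it to the rating model
train_data$TenureDays <- as.numeric(Sys.Date() - as.Date(train_data$StartDate, format = "%d-%b-%y"))
rating_model_tenure <- lm(Current.Employee.Rating ~ EmployeeStatus + Age + TenureDays + Training.Outcome,
                          data = train_data)
summary(rating_model_tenure)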

# Predict on test set
satisfaction_preds <- predict(satisfaction_model, newdata = test_data)

# Actual values
actual <- test_data$Satisfaction.Score

# Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((satisfaction_preds - actual)^2))

# Mean Absolute Error (MAE)
mae <- mean(abs(satisfaction_preds - actual))

print(paste("RMSE:", round(rmse, 3)))
## [1] "RMSE: 1.413"
print(paste("MAE:", round(mae, 3)))
## [1] "MAE: 1.223"
# Predict on test set
satisfaction_preds_2 <- predict(satisfaction_model_2, newdata = test_data)

# Actual values
actual_2 <- test_data$Current.Employee.Rating

# Root Mean Squared Error (RMSE)
rmse_2 <- sqrt(mean((satisfaction_preds_2 - actual_2)^2))

# Mean Absolute Error (MAE)
mae_2 <- mean(abs(satisfaction_preds_2 - actual_2))

print(paste("RMSE:", round(rmse_2, 3)))
## [1] "RMSE: 1.019"
print(paste("MAE:", round(mae_2, 3)))
## [1] "MAE: 0.706"
plot(actual, satisfaction_preds,
     xlab = "Actual Satisfaction Score",
     ylab = "Predicted Satisfaction Score",
     main = "Actual vs. Predicted Satisfaction",
     col = "darkblue", pch = 16)
abline(0, 1, col = "red", lwd = 2)  # ideal prediction line

plot(actual_2, satisfaction_preds_2,
     xlab = "Actual Employee Rating",
     ylab = "Predicted Employee Rating",
     main = "Actual vs. Predicted Rating",
     col = "darkblue", pch = 16)
abline(0, 1, col = "red", lwd = 2)  # ideal prediction line

plot(satisfaction_model)

plot(satisfaction_model_2)

This will show:

Residuals vs Fitted (check for nonlinearity)

Normal Q-Q (residual normality)

Scale-Location (homoscedasticity)

Residuals vs Leverage (influential points)

  3. XGBoost for Attrition Prediction
#install.packages("xgboost")
library(xgboost)
library(Matrix)

# Prepare data
HR_dataset$EmployeeStatusBin <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)
train_data$EmployeeStatusBin <- as.numeric(as.character(train_data$EmployeeStatusBin))

# Create model matrix
x_train <- model.matrix(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
                        EmployeeType + EmployeeClassificationType + Age + Training.Outcome, data = train_data)[,-1]
y_train <- train_data$EmployeeStatusBin

# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_train, label = y_train)

# Train XGBoost
xgb_model <- xgboost(data = dtrain, nrounds = 100, objective = "binary:logistic", eval_metric = "auc")
## [1]  train-auc:0.557041 
## [2]  train-auc:0.670265 
## [3]  train-auc:0.741944 
## [4]  train-auc:0.769025 
## [5]  train-auc:0.783156 
## [6]  train-auc:0.800938 
## [7]  train-auc:0.808209 
## [8]  train-auc:0.817299 
## [9]  train-auc:0.824921 
## [10] train-auc:0.835835 
## [11] train-auc:0.853494 
## [12] train-auc:0.865078 
## [13] train-auc:0.871060 
## [14] train-auc:0.879662 
## [15] train-auc:0.883414 
## [16] train-auc:0.885735 
## [17] train-auc:0.893350 
## [18] train-auc:0.904249 
## [19] train-auc:0.907475 
## [20] train-auc:0.916510 
## [21] train-auc:0.917776 
## [22] train-auc:0.924036 
## [23] train-auc:0.928948 
## [24] train-auc:0.932194 
## [25] train-auc:0.935041 
## [26] train-auc:0.938716 
## [27] train-auc:0.940138 
## [28] train-auc:0.942453 
## [29] train-auc:0.945724 
## [30] train-auc:0.947981 
## [31] train-auc:0.949121 
## [32] train-auc:0.950314 
## [33] train-auc:0.951307 
## [34] train-auc:0.953208 
## [35] train-auc:0.955465 
## [36] train-auc:0.955603 
## [37] train-auc:0.957522 
## [38] train-auc:0.958450 
## [39] train-auc:0.958615 
## [40] train-auc:0.959473 
## [41] train-auc:0.959632 
## [42] train-auc:0.960157 
## [43] train-auc:0.960980 
## [44] train-auc:0.961529 
## [45] train-auc:0.962346 
## [46] train-auc:0.964353 
## [47] train-auc:0.966336 
## [48] train-auc:0.967421 
## [49] train-auc:0.968167 
## [50] train-auc:0.968778 
## [51] train-auc:0.970397 
## [52] train-auc:0.970794 
## [53] train-auc:0.971178 
## [54] train-auc:0.971787 
## [55] train-auc:0.972657 
## [56] train-auc:0.973723 
## [57] train-auc:0.974263 
## [58] train-auc:0.975151 
## [59] train-auc:0.976701 
## [60] train-auc:0.976967 
## [61] train-auc:0.978435 
## [62] train-auc:0.978686 
## [63] train-auc:0.978963 
## [64] train-auc:0.979662 
## [65] train-auc:0.980125 
## [66] train-auc:0.980318 
## [67] train-auc:0.981935 
## [68] train-auc:0.981851 
## [69] train-auc:0.982436 
## [70] train-auc:0.983318 
## [71] train-auc:0.983581 
## [72] train-auc:0.983999 
## [73] train-auc:0.985211 
## [74] train-auc:0.985535 
## [75] train-auc:0.985809 
## [76] train-auc:0.986193 
## [77] train-auc:0.986450 
## [78] train-auc:0.987424 
## [79] train-auc:0.987820 
## [80] train-auc:0.987859 
## [81] train-auc:0.988277 
## [82] train-auc:0.988560 
## [83] train-auc:0.988918 
## [84] train-auc:0.989304 
## [85] train-auc:0.989994 
## [86] train-auc:0.990288 
## [87] train-auc:0.990530 
## [88] train-auc:0.990681 
## [89] train-auc:0.991137 
## [90] train-auc:0.991531 
## [91] train-auc:0.991782 
## [92] train-auc:0.992135 
## [93] train-auc:0.992367 
## [94] train-auc:0.992470 
## [95] train-auc:0.992594 
## [96] train-auc:0.992545 
## [97] train-auc:0.992622 
## [98] train-auc:0.992633 
## [99] train-auc:0.992624 
## [100]    train-auc:0.992596
# Variable importance
xgb.importance(model = xgb_model) |> xgb.plot.importance()

The final train-auc of 0.9926 is very high, indicating excellent model performance on the training data.

AUC = 1.0 means perfect prediction; AUC = 0.5 means no better than random.

Caution: This is training AUC — we should still check on the test set to confirm it’s not overfitting.

Feature Importance

The importance plot shows the most influential features in predicting attrition. Age dominates, possibly because younger employees are more likely to leave (early churn) or older employees retire.

Satisfaction matters, confirming earlier ANOVA and logistic findings.

Training outcome has relatively weak direct impact — this is valuable insight for your training program evaluation.

Testing

# Prepare test data
x_test <- model.matrix(EmployeeStatusBin ~ Satisfaction.Score + Current.Employee.Rating +
                       EmployeeType + EmployeeClassificationType + Age + Training.Outcome,
                       data = test_data)[, -1]
dtest <- xgb.DMatrix(data = x_test)

# Predict probabilities and classes
xgb_preds <- predict(xgb_model, newdata = dtest)
xgb_classes <- ifelse(xgb_preds > 0.5, 1, 0)

# Confusion matrix
table(Predicted = xgb_classes, Actual = test_data$EmployeeStatusBin)
##          Actual
## Predicted   0   1
##         0 708 109
##         1  29   7
# Accuracy
mean(xgb_classes == test_data$EmployeeStatusBin)
## [1] 0.8382181
# AUC
library(pROC)
roc_xgb <- roc(test_data$EmployeeStatusBin, xgb_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_xgb, col = "blue", main = "XGBoost ROC - Test Set")

auc(roc_xgb)
## Area under the curve: 0.5328

The test-set AUC of 0.533, against a training AUC of 0.993, confirms the caution above: the model is severely overfit and generalizes no better than the simpler models.
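
A standard remedy is cross-validated early stopping; a minimal sketch on the dtrain object built above (max_depth and eta are illustrative, untuned values):

# 5-fold CV with early stopping: stop adding trees once held-out AUC stalls
set.seed(2025)
cv <- xgb.cv(params = list(objective = "binary:logistic", eval_metric = "auc",
                           max_depth = 3, eta = 0.1),
             data = dtrain, nrounds = 200, nfold = 5,
             early_stopping_rounds = 10, verbose = 0)
cv$best_iteration  # retrain the final model with this many rounds
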
  4. GAM for Satisfaction Score
library(mgcv)
## Warning: package 'mgcv' was built under R version 4.3.3
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following objects are masked from 'package:gam':
## 
##     gam, gam.control, gam.fit, s
gam_model <- gam(Satisfaction.Score ~ s(Age, k = 5) + 
                 s(Work.Life.Balance.Score, k = 5) + 
                 EmployeeType + EmployeeClassificationType + GenderCode,
                 data = train_data)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Satisfaction.Score ~ s(Age, k = 5) + s(Work.Life.Balance.Score, 
##     k = 5) + EmployeeType + EmployeeClassificationType + GenderCode
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          3.22393    0.07597  42.438  < 2e-16 ***
## EmployeeTypeFull-Time               -0.01644    0.07646  -0.215 0.829779    
## EmployeeTypePart-Time               -0.14805    0.07886  -1.877 0.060625 .  
## EmployeeClassificationTypePart-Time -0.21776    0.07778  -2.800 0.005165 ** 
## EmployeeClassificationTypeTemporary -0.28955    0.07694  -3.763 0.000173 ***
## GenderCodeMale                       0.05901    0.06338   0.931 0.352010    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F p-value  
## s(Age)                     1.381  1.662 0.331  0.7539  
## s(Work.Life.Balance.Score) 1.568  1.923 2.449  0.0632 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00996   Deviance explained = 1.39%
## GCV = 1.9877  Scale est. = 1.9788    n = 1992
plot(gam_model, pages = 1, se = TRUE)

Model Fit

Adjusted R²: 0.00996 → the model explains ~1% of the variance in satisfaction (very weak)

Deviance explained: 1.39%

This confirms what you’ve seen with linear models: Satisfaction.Score is not well explained by your current predictors

Conclusions (parametric terms):

Part-time and temporary classifications are associated with significantly lower satisfaction

Gender and full-time/part-time status alone are not significant

Conclusions (smooth terms):

Work-life balance may have a nonlinear effect, but it’s only marginally significant.

Age has no predictive relationship with satisfaction in this model.

Category | Variable | Effect on Satisfaction
Employment type | Temporary or Part-Time | Strongly negative
Work-life balance | Slight trend (nonlinear) | Weak, marginal
Age, Gender | Not significant | No effect

gam_enhanced <- gam(Satisfaction.Score ~ 
                    s(Age, k = 5) + 
                    s(Work.Life.Balance.Score, k = 5) +
                    EmployeeType * EmployeeClassificationType +
                    GenderCode,
                    data = train_data)
#This model:

#Adds an interaction between EmployeeType and EmployeeClassificationType

#Keeps non-linear smoothing for Age and Work.Life.Balance.Score
summary(gam_enhanced)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Satisfaction.Score ~ s(Age, k = 5) + s(Work.Life.Balance.Score, 
##     k = 5) + EmployeeType * EmployeeClassificationType + GenderCode
## 
## Parametric coefficients:
##                                                           Estimate Std. Error
## (Intercept)                                                3.20068    0.09848
## EmployeeTypeFull-Time                                     -0.04986    0.13227
## EmployeeTypePart-Time                                     -0.03989    0.13483
## EmployeeClassificationTypePart-Time                       -0.23538    0.13591
## EmployeeClassificationTypeTemporary                       -0.20271    0.13520
## GenderCodeMale                                             0.05890    0.06341
## EmployeeTypeFull-Time:EmployeeClassificationTypePart-Time  0.06517    0.18823
## EmployeeTypePart-Time:EmployeeClassificationTypePart-Time -0.01447    0.19358
## EmployeeTypeFull-Time:EmployeeClassificationTypeTemporary  0.03161    0.18623
## EmployeeTypePart-Time:EmployeeClassificationTypeTemporary -0.30722    0.19139
##                                                           t value Pr(>|t|)    
## (Intercept)                                                32.502   <2e-16 ***
## EmployeeTypeFull-Time                                      -0.377   0.7062    
## EmployeeTypePart-Time                                      -0.296   0.7674    
## EmployeeClassificationTypePart-Time                        -1.732   0.0834 .  
## EmployeeClassificationTypeTemporary                        -1.499   0.1339    
## GenderCodeMale                                              0.929   0.3531    
## EmployeeTypeFull-Time:EmployeeClassificationTypePart-Time   0.346   0.7292    
## EmployeeTypePart-Time:EmployeeClassificationTypePart-Time  -0.075   0.9404    
## EmployeeTypeFull-Time:EmployeeClassificationTypeTemporary   0.170   0.8652    
## EmployeeTypePart-Time:EmployeeClassificationTypeTemporary  -1.605   0.1086    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F p-value  
## s(Age)                     1.396  1.685 0.350  0.7408  
## s(Work.Life.Balance.Score) 1.566  1.921 2.457  0.0623 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0103   Deviance explained = 1.62%
## GCV = 1.9911  Scale est. = 1.9781    n = 1992
plot(gam_enhanced, pages = 1, se = TRUE, shade = TRUE)

#This shows the shapes and uncertainty of s(Age) and s(Work.Life.Balance.Score).
Metric | Value | Meaning
Adjusted R² | 0.0103 | Model explains just 1% of satisfaction variance; very weak
Deviance explained | 1.62% | Confirms low explanatory power
GCV (generalized CV) | 1.99 | Used internally for smoothness selection; lower is better

Interpretation: Even with added interaction terms and smooth functions, the model still struggles to predict satisfaction.

Key Takeaways

Work-Life Balance has the only near-significant nonlinear influence on satisfaction.

Employment classification (e.g., part-time/temporary) may slightly lower satisfaction, but evidence is weak.

Gender, Age, and interactions have no meaningful effect.

Despite improvements, most of the variation in satisfaction remains unexplained.
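
A quick in-sample comparison confirms that the interaction terms buy little; AIC works directly on mgcv fits:

# Lower AIC is better; expect the two models to be nearly indistinguishable
AIC(gam_model, gam_enhanced)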

preds <- predict(gam_enhanced, newdata = test_data)
actuals <- test_data$Satisfaction.Score

# RMSE and MAE
rmse <- sqrt(mean((preds - actuals)^2))
mae <- mean(abs(preds - actuals))

cat("RMSE:", round(rmse, 3), "\n")
## RMSE: 1.417
cat("MAE:", round(mae, 3), "\n")
## MAE: 1.228
#Model performance
  5. K-Means Clustering (Satisfaction + Engagement + Rating)
# Scale relevant variables
cluster_data <- scale(HR_dataset[, c("Satisfaction.Score", "Engagement.Score", "Current.Employee.Rating")])

# Run K-Means
set.seed(2025)
kmeans_model <- kmeans(cluster_data, centers = 3, nstart = 25)

# Add cluster labels
HR_dataset$Cluster <- as.factor(kmeans_model$cluster)

# Visualize with ggplot
library(ggplot2)
library(ggfortify)
autoplot(kmeans_model, data = cluster_data, frame = TRUE)

# 1. Select relevant columns and remove missing values
cluster_vars <- HR_dataset[, c("Satisfaction.Score", "Engagement.Score", "Current.Employee.Rating")]
cluster_vars <- na.omit(cluster_vars)

# 2. Scale the variables (very important for K-means)
cluster_scaled <- scale(cluster_vars)

# 3. Choose number of clusters (try k = 3 to start)
set.seed(2025)
kmeans_model <- kmeans(cluster_scaled, centers = 3, nstart = 25)

# 4. Add cluster labels to the original data
HR_dataset$Cluster <- factor(kmeans_model$cluster)
# If not installed:
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# 2D cluster plot
fviz_cluster(kmeans_model, data = cluster_scaled,
             ellipse.type = "norm",
             geom = "point", 
             main = "K-Means Clusters of Employees")

aggregate(cbind(Satisfaction.Score, Engagement.Score, Current.Employee.Rating) ~ Cluster, data = HR_dataset, FUN = mean)
##   Cluster Satisfaction.Score Engagement.Score Current.Employee.Rating
## 1       1           1.411538         2.237179                2.979487
## 2       2           3.231123         4.430939                3.080110
## 3       3           4.091931         1.850868                2.853933

This helps answer:

Are there clusters with low satisfaction but high performance?

Is there a group that is disengaged across the board?

Can you link any cluster back to termination rates, departments, or training types?

Cluster | Satisfaction | Engagement | Rating | Interpretation
1 | 1.41 (very low) | 2.24 (low) | 2.98 | At-risk & disengaged; unhappy and under-engaged, likely to leave
2 | 3.23 (moderate) | 4.43 (high) | 3.08 | High potential; strong engagement, moderate satisfaction and rating
3 | 4.09 (high) | 1.85 (very low) | 2.85 | ⚠️ Complacent or disconnected; satisfied but unengaged, may coast or be isolated

Interpretation

Cluster 1 is a classic attrition risk group: very low satisfaction + low engagement.

Cluster 2 looks like your high-performing, committed core — engaged and solidly rated.

Cluster 3 may represent content but disconnected employees — they’re satisfied but not engaged. This group might lack challenge or connection.

table(HR_dataset$Cluster, HR_dataset$EmployeeStatus)
##    
##     Active Terminated
##   1    676        104
##   2    940        146
##   3    842        137
table(HR_dataset$Cluster, HR_dataset$Training.Outcome)
##    
##     Completed Failed Incomplete Passed
##   1       200    189        198    193
##   2       283    258        280    265
##   3       254    221        253    251
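
Converting these counts to shares makes the comparison direct (prop.table by row):

# Termination share within each cluster
round(prop.table(table(HR_dataset$Cluster, HR_dataset$EmployeeStatus), margin = 1), 3)

From the counts above, the termination shares are roughly 13.3%, 13.4%, and 14.0% for clusters 1-3, so despite their distinct survey profiles the clusters do not separate terminated from active employees, and training outcomes are likewise evenly spread.
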
library(ggplot2)
ggplot(HR_dataset, aes(x = Engagement.Score, y = Satisfaction.Score, color = Cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "Employee Clusters", x = "Engagement", y = "Satisfaction") +
  theme_minimal()

These clusters give you actionable insights beyond what your regression models could. They’re great for:

Targeting retention strategies

Tailoring engagement initiatives

Customizing training approaches

fviz_nbclust(cluster_scaled, kmeans, method = "wss") + 
  labs(subtitle = "Elbow Method for Choosing k")
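
Elbow plots are often ambiguous; the average silhouette width is a common cross-check using the same factoextra call:

# Silhouette method: the peak suggests the best-separated k
fviz_nbclust(cluster_scaled, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette Method for Choosing k")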

  6. PCA for Visualization
pca_data <- prcomp(cluster_data, scale. = TRUE)
summary(pca_data)  # Proportion of variance explained
## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.0176 0.9978 0.9844
## Proportion of Variance 0.3451 0.3319 0.3230
## Cumulative Proportion  0.3451 0.6770 1.0000
# Plot PCA
biplot(pca_data, scale = 0)

Interpretation: No single component dominates — all 3 contribute fairly equally.

2 PCs explain ~68% of total variance — that’s solid for plotting.

You can visualize in 2D using PC1 and PC2, though roughly a third of the variance is lost.

# Compute PCA
pca_res <- prcomp(cluster_scaled, scale. = TRUE)

# Add cluster labels
pca_df <- as.data.frame(pca_res$x)
pca_df$Cluster <- HR_dataset$Cluster

# Plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "PCA Plot of Employee Clusters",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()

This PCA plot will:

Visually confirm how well-separated your clusters are

Reveal which clusters are more compact vs scattered

Help you present intuitive groupings to stakeholders
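
To label the axes for stakeholders, inspect the component loadings of the pca_res object computed above:

# Loadings: which original variables drive PC1 and PC2
round(pca_res$rotation, 2)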

  7. Survival Analysis (Time Until Termination)
#install.packages("survival")
library(survival)

# Assume StartDate marks hire; for terminated employees use Training.Date as a proxy
# end date, otherwise censor at today's date. The two-digit years need %y, not %Y,
# and dplyr::if_else preserves the Date class (base ifelse drops it).
HR_dataset$StartDate <- as.Date(HR_dataset$StartDate, format = "%d-%b-%y")
HR_dataset$EndDate <- dplyr::if_else(HR_dataset$EmployeeStatus == "Terminated",
                                     as.Date(HR_dataset$Training.Date, format = "%d-%b-%y"),
                                     Sys.Date())

# Time to event
HR_dataset$Time <- as.numeric(difftime(HR_dataset$EndDate, HR_dataset$StartDate, units = "days"))
HR_dataset$Status <- ifelse(HR_dataset$EmployeeStatus == "Terminated", 1, 0)

# Cox model
cox_model <- coxph(Surv(Time, Status) ~ Satisfaction.Score + EmployeeType + Age, data = HR_dataset)
summary(cox_model)
## Call:
## coxph(formula = Surv(Time, Status) ~ Satisfaction.Score + EmployeeType + 
##     Age, data = HR_dataset)
## 
##   n= 2845, number of events= 387 
## 
##                            coef exp(coef)  se(coef)     z Pr(>|z|)
## Satisfaction.Score    0.0534818 1.0549378 0.0362471 1.475    0.140
## EmployeeTypeFull-Time 0.1689881 1.1841060 0.1225053 1.379    0.168
## EmployeeTypePart-Time 0.0394723 1.0402617 0.1293064 0.305    0.760
## Age                   0.0006256 1.0006257 0.0028652 0.218    0.827
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## Satisfaction.Score        1.055     0.9479    0.9826     1.133
## EmployeeTypeFull-Time     1.184     0.8445    0.9314     1.505
## EmployeeTypePart-Time     1.040     0.9613    0.8074     1.340
## Age                       1.001     0.9994    0.9950     1.006
## 
## Concordance= 0.537  (se = 0.015 )
## Likelihood ratio test= 4.41  on 4 df,   p=0.4
## Wald test            = 4.44  on 4 df,   p=0.4
## Score (logrank) test = 4.44  on 4 df,   p=0.3
Metric | Value
Sample size (n) | 2,845 employees
Number of events | 387 terminations
Concordance | 0.537 (se = 0.015)
Log-rank p-value | 0.3–0.4 (not significant)

Predictor | Coef | HR (exp(coef)) | p-value | Interpretation
Satisfaction.Score | 0.0535 | 1.055 | 0.140 | Weak, non-significant rise in termination hazard as satisfaction rises (unexpected)
EmployeeType: Full-Time | 0.169 | 1.18 | 0.168 | Small, non-significant rise in risk
EmployeeType: Part-Time | 0.0395 | 1.04 | 0.76 | Very small effect, not significant
Age | 0.0006 | 1.0006 | 0.826 | No effect

Interpretation:

None of the variables are statistically significant (all p > 0.1).

Concordance = 0.537 → the model predicts slightly better than random, but not reliably.

Hazard Ratios (HR) close to 1 → very little influence on time to termination.

cox_model2 <- coxph(Surv(Time, Status) ~ 
                    Satisfaction.Score + 
                    Current.Employee.Rating + 
                    Engagement.Score + 
                    Training.Outcome + 
                    Performance.Score + 
                    EmployeeClassificationType + 
                    Age,
                    data = HR_dataset)
summary(cox_model2)
## Call:
## coxph(formula = Surv(Time, Status) ~ Satisfaction.Score + Current.Employee.Rating + 
##     Engagement.Score + Training.Outcome + Performance.Score + 
##     EmployeeClassificationType + Age, data = HR_dataset)
## 
##   n= 2845, number of events= 387 
## 
##                                           coef  exp(coef)   se(coef)      z
## Satisfaction.Score                   0.0618255  1.0637767  0.0364200  1.698
## Current.Employee.Rating              0.1119695  1.1184788  0.0499818  2.240
## Engagement.Score                    -0.0292633  0.9711608  0.0354678 -0.825
## Training.OutcomeFailed               0.0983209  1.1033168  0.1522914  0.646
## Training.OutcomeIncomplete           0.1373032  1.1471760  0.1471421  0.933
## Training.OutcomePassed               0.3431868  1.4094320  0.1422915  2.412
## Performance.ScoreFully Meets         0.2242329  1.2513625  0.1682340  1.333
## Performance.ScoreNeeds Improvement   0.0790807  1.0822916  0.2743003  0.288
## Performance.ScorePIP                 0.7293462  2.0737244  0.2841865  2.566
## EmployeeClassificationTypePart-Time  0.0408242  1.0416689  0.1277977  0.319
## EmployeeClassificationTypeTemporary  0.1512907  1.1633348  0.1229377  1.231
## Age                                  0.0006774  1.0006776  0.0028774  0.235
##                                     Pr(>|z|)  
## Satisfaction.Score                    0.0896 .
## Current.Employee.Rating               0.0251 *
## Engagement.Score                      0.4093  
## Training.OutcomeFailed                0.5185  
## Training.OutcomeIncomplete            0.3508  
## Training.OutcomePassed                0.0159 *
## Performance.ScoreFully Meets          0.1826  
## Performance.ScoreNeeds Improvement    0.7731  
## Performance.ScorePIP                  0.0103 *
## EmployeeClassificationTypePart-Time   0.7494  
## EmployeeClassificationTypeTemporary   0.2185  
## Age                                   0.8139  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## Satisfaction.Score                     1.0638     0.9400    0.9905     1.142
## Current.Employee.Rating                1.1185     0.8941    1.0141     1.234
## Engagement.Score                       0.9712     1.0297    0.9059     1.041
## Training.OutcomeFailed                 1.1033     0.9064    0.8186     1.487
## Training.OutcomeIncomplete             1.1472     0.8717    0.8598     1.531
## Training.OutcomePassed                 1.4094     0.7095    1.0664     1.863
## Performance.ScoreFully Meets           1.2514     0.7991    0.8999     1.740
## Performance.ScoreNeeds Improvement     1.0823     0.9240    0.6322     1.853
## Performance.ScorePIP                   2.0737     0.4822    1.1881     3.620
## EmployeeClassificationTypePart-Time    1.0417     0.9600    0.8109     1.338
## EmployeeClassificationTypeTemporary    1.1633     0.8596    0.9142     1.480
## Age                                    1.0007     0.9993    0.9951     1.006
## 
## Concordance= 0.559  (se = 0.015 )
## Likelihood ratio test= 21.38  on 12 df,   p=0.05
## Wald test            = 21.89  on 12 df,   p=0.04
## Score (logrank) test = 22.07  on 12 df,   p=0.04

With the richer predictor set, the model reaches borderline overall significance (p ≈ 0.04-0.05). Higher Current.Employee.Rating (HR 1.12, p = 0.025), passing training (HR 1.41, p = 0.016), and being on a PIP (HR 2.07, p = 0.010) are each associated with a higher termination hazard, though concordance (0.559) is still only slightly better than chance.
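
Before leaning on these hazard ratios, the proportional-hazards assumption should be checked; survival's cox.zph gives the standard Schoenfeld-residual test (zph is an illustrative name):

# Test proportional hazards per covariate and globally
zph <- cox.zph(cox_model2)
print(zph)
plot(zph)  # flat smooths over time indicate the assumption holds
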
#install.packages("survival")
#install.packages("survminer")  # includes ggsurvplot()
library(survival)
library(survminer)
## Warning: package 'survminer' was built under R version 4.3.3
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 4.3.3
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(survminer)
fit <- survfit(Surv(Time, Status) ~ EmployeeClassificationType, data = HR_dataset)
ggsurvplot(fit, data = HR_dataset, conf.int = TRUE, pval = TRUE)

Interpretation of the Plot

Each line = a classification group (e.g., full-time, part-time, temporary)

Steeper drop = faster attrition

The p-value tests whether the survival curves differ significantly across classification types