HR Data Analysis

library(ggplot2)

Load data

train_data <- read.csv("HR_AcostaDegenerSchrand.csv")
test_data <- read.csv("HR_test.csv")

Summary Statistics

summary(train_data)

##  satisfaction        evaluation        number_project  average_montly_hours
##  Length:10000       Length:10000       Min.   :2.000   Min.   : 96.0       
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:156.0       
##  Mode  :character   Mode  :character   Median :4.000   Median :200.0       
##                                        Mean   :3.795   Mean   :200.9       
##                                        3rd Qu.:5.000   3rd Qu.:245.0       
##                                        Max.   :7.000   Max.   :310.0       
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.0000       
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000       
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.0000       
##  Mean   : 3.499     Mean   :0.1419   Mean   :0.2363   Mean   :0.0209       
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000       
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000       
##   department           salary         
##  Length:10000       Length:10000      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

Distribution of satisfaction

manual_order <- c("very dissatisfied", "slightly dissatisfied", "neutral", "slightly satisfied", "moderately dissatisfied", "moderately satisfied", "very satisfied")

ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate the count and percentage of each satisfaction level in the training dataset
satisfaction_table <- table(train_data$satisfaction)
percentage_table <- prop.table(satisfaction_table) * 100

# Create a data frame for better visualization
satisfaction_df <- data.frame(
  Satisfaction = names(satisfaction_table),
  Count = as.numeric(satisfaction_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(satisfaction_df)

##              Satisfaction Count Percentage
## 1 moderately dissatisfied  1473      14.73
## 2    moderately satisfied  1497      14.97
## 3                 neutral  1437      14.37
## 4   slightly dissatisfied  1452      14.52
## 5      slightly satisfied  1366      13.66
## 6       very dissatisfied  1442      14.42
## 7          very satisfied  1333      13.33

# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Distribution of Evaluation

# Calculate the count and percentage of each evaluation level in the training dataset
evaluation_order <- c("poor", "fair", "good") 
evaluation_table <- table(train_data$evaluation)
percentage_table <- prop.table(evaluation_table) * 100

# Create a data frame for better visualization
evaluation_df <- data.frame(
  Evaluation = names(evaluation_table),
  Count = as.numeric(evaluation_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(evaluation_df)

##   Evaluation Count Percentage
## 1       fair  3232      32.32
## 2       good  3287      32.87
## 3       poor  3481      34.81

# Create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(evaluation, levels = evaluation_order))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Evaluation") +
  ylab("Percentage") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

## Distribution of Number of Projects

evaluation_order <- c("poor", "fair", "good")

ggplot(train_data, aes(x = factor(evaluation, levels = evaluation_order))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Evaluation") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Histogram of Average Monthly Hours

# Create a histogram with specified bins
histogram_plot <- ggplot(train_data, aes(x = average_montly_hours)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of Average Monthly Hours") +
  xlab("Average Monthly Hours") +
  ylab("Frequency")
print(histogram_plot)

# Specify the number of bins
num_bins <- 30

# Create breaks for the histogram
breaks <- seq(min(train_data$average_montly_hours), max(train_data$average_montly_hours), length.out = num_bins + 1)

# Cut the data into bins
train_data$hour_bins <- cut(train_data$average_montly_hours, breaks = breaks, include.lowest = TRUE, right = TRUE)

# Calculate counts and percentages for each bin
hist_table_data <- as.data.frame(table(train_data$hour_bins))
hist_table_data$Percentage <- prop.table(hist_table_data$Freq) * 100

# Print the table
print(hist_table_data)

##         Var1 Freq Percentage
## 1   [96,103]   81       0.81
## 2  (103,110]   76       0.76
## 3  (110,117]   88       0.88
## 4  (117,125]   74       0.74
## 5  (125,132]  253       2.53
## 6  (132,139]  512       5.12
## 7  (139,146]  550       5.50
## 8  (146,153]  645       6.45
## 9  (153,160]  618       6.18
## 10 (160,167]  378       3.78
## 11 (167,174]  390       3.90
## 12 (174,182]  391       3.91
## 13 (182,189]  374       3.74
## 14 (189,196]  352       3.52
## 15 (196,203]  410       4.10
## 16 (203,210]  329       3.29
## 17 (210,217]  353       3.53
## 18 (217,224]  406       4.06
## 19 (224,232]  385       3.85
## 20 (232,239]  421       4.21
## 21 (239,246]  468       4.68
## 22 (246,253]  439       4.39
## 23 (253,260]  583       5.83
## 24 (260,267]  452       4.52
## 25 (267,274]  445       4.45
## 26 (274,281]  186       1.86
## 27 (281,289]  138       1.38
## 28 (289,296]   66       0.66
## 29 (296,303]   53       0.53
## 30 (303,310]   84       0.84

Distribution of Time Spent in Company

# Calculate the count and percentage of each time spent in the company level in the training dataset
time_spend_company_order <- c(2, 5, 4, 3, 7, 8, 6, 10)

time_spend_company_table <- table(train_data$time_spend_company)
percentage_time_spend_company_table <- prop.table(time_spend_company_table) * 100

# Create a data frame for better visualization
time_spend_company_df <- data.frame(
  Time_Spend_Company = names(time_spend_company_table),
  Count = as.numeric(time_spend_company_table),
  Percentage = as.numeric(percentage_time_spend_company_table)
)

# Print the table
print(time_spend_company_df)

##   Time_Spend_Company Count Percentage
## 1                  2  2191      21.91
## 2                  3  4270      42.70
## 3                  4  1683      16.83
## 4                  5   991       9.91
## 5                  6   488       4.88
## 6                  7   123       1.23
## 7                  8   107       1.07
## 8                 10   147       1.47

# Plot the bar chart with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(time_spend_company, levels = time_spend_company_order))) +
  geom_bar(fill = "lightpink") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Time Spent in Company") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Distribution of Work Accidents

ggplot(train_data, aes(x = factor(Work_accident, labels = c("no", "yes")))) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Work Accidents") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

# Create a table for 'Work Accidents'
work_accident_table <- table(train_data$Work_accident)
work_accident_percentage_table <- prop.table(work_accident_table) * 100

# Create a data frame for better visualization
work_accident_df <- data.frame(
  Work_Accident = names(work_accident_table),
  Count = as.numeric(work_accident_table),
  Percentage = as.numeric(work_accident_percentage_table)
)

# Print the table
print(work_accident_df)

##   Work_Accident Count Percentage
## 1             0  8581      85.81
## 2             1  1419      14.19

Distribution of Employee Departures

# Create a bar chart for 'left'
ggplot(train_data, aes(x = factor(left, labels = c("no", "yes")))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employee Departures") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()  # Remove x-axis title
  )

# Create a table for 'Employee Departures'
departure_table <- table(train_data$left)
departure_percentage_table <- prop.table(departure_table) * 100

# Create a data frame for better visualization
departure_df <- data.frame(
  Departure = names(departure_table),
  Count = as.numeric(departure_table),
  Percentage = as.numeric(departure_percentage_table)
)

# Print the table
print(departure_df)

##   Departure Count Percentage
## 1         0  7637      76.37
## 2         1  2363      23.63

Distribution of Promotions in the Last 5 Years

ggplot(train_data, aes(x = factor(promotion_last_5years, labels = c("no", "yes")))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Promotions in the Last 5 Years") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

# Create a table for 'Promotions in the Last 5 Years'
promotion_table <- table(train_data$promotion_last_5years)
promotion_percentage_table <- prop.table(promotion_table) * 100

# Create a data frame for better visualization
promotion_df <- data.frame(
  Promotion_Last_5_Years = names(promotion_table),
  Count = as.numeric(promotion_table),
  Percentage = as.numeric(promotion_percentage_table)
)

# Print the table
print(promotion_df)

##   Promotion_Last_5_Years Count Percentage
## 1                      0  9791      97.91
## 2                      1   209       2.09

Distribution of Employees by Department

# Calculate the count and percentage of each department in the training dataset
department_table <- table(train_data$department)
percentage_table <- prop.table(department_table) * 100

# Create a data frame for better visualization
department_df <- data.frame(
  Department = names(department_table),
  Count = as.numeric(department_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(department_df)

##     Department Count Percentage
## 1   accounting   499       4.99
## 2           hr   490       4.90
## 3           IT   819       8.19
## 4   management   429       4.29
## 5    marketing   593       5.93
## 6  product_mng   601       6.01
## 7        RandD   512       5.12
## 8        sales  2754      27.54
## 9      support  1446      14.46
## 10   technical  1857      18.57

# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = department)) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employees by Department") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution of Salary Levels

# Calculate the count and percentage of each salary level in the training dataset
salary_order <- c("low", "medium", "high")

salary_table <- table(train_data$salary)
percentage_table <- prop.table(salary_table) * 100

# Create a data frame for better visualization
salary_df <- data.frame(
  Salary_Level = names(salary_table),
  Count = as.numeric(salary_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(salary_df)

##   Salary_Level Count Percentage
## 1         high   841       8.41
## 2          low  4881      48.81
## 3       medium  4278      42.78

# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(salary, levels = salary_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Salary Levels") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Box Plot: Average Monthly Hours by Satisfaction Level

ggplot(train_data, aes(x = satisfaction, y = average_montly_hours, fill = satisfaction)) +
  geom_boxplot() +
  stat_summary(
    fun = "mean",
    geom = "point",
    position = position_dodge(0.75),
    shape = 4,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Box Plot: Average Monthly Hours by Satisfaction Level",
    x = "Satisfaction Level",
    y = "Average Monthly Hours"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a summary table using tapply and summary
summary_table <- tapply(
  train_data$average_montly_hours,
  train_data$satisfaction,
  function(x) c(
    Mean = mean(x),
    SD = sd(x),
    Median = median(x),
    IQR = IQR(x),
    Count = length(x)
  )
)

# Print the summary table
print(summary_table)

## $`moderately dissatisfied`
##      Mean        SD    Median       IQR     Count 
##  155.9925   34.5630  148.0000   24.0000 1473.0000 
## 
## $`moderately satisfied`
##       Mean         SD     Median        IQR      Count 
##  211.81296   43.65402  221.00000   72.00000 1497.00000 
## 
## $neutral
##       Mean         SD     Median        IQR      Count 
##  199.94502   44.94792  200.00000   77.00000 1437.00000 
## 
## $`slightly dissatisfied`
##       Mean         SD     Median        IQR      Count 
##  198.66529   45.44937  196.00000   75.25000 1452.00000 
## 
## $`slightly satisfied`
##       Mean         SD     Median        IQR      Count 
##  208.16764   43.82709  216.00000   73.00000 1366.00000 
## 
## $`very dissatisfied`
##       Mean         SD     Median        IQR      Count 
##  229.87587   56.99092  247.00000   92.75000 1442.00000 
## 
## $`very satisfied`
##       Mean         SD     Median        IQR      Count 
##  202.98650   44.42823  205.00000   76.00000 1333.00000

Box plot: Number of Projects by Department

ggplot(train_data, aes(x = department, y = number_project)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Number of Projects by Department",
       x = "Department",
       y = "Number of Projects") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Box Plot: Time Spent at Company by Department

box_plot_time <- ggplot(train_data, aes(x = department, y = time_spend_company)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Time Spent at Company by Department",
       x = "Department",
       y = "Time Spent at Company") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(box_plot_time)

## Data Preprocessing

test_data <- read.csv("HR_test.csv")
train_data <- read.csv("HR_AcostaDegenerSchrand.csv")
# Convert categorical variables to factors
for (var in names(train_data)) {
  if (var != "average_montly_hours") {
    train_data[[var]] <- factor(train_data[[var]])
    test_data[[var]] <- factor(test_data[[var]])
  }
}

# Convert satisfaction to numeric values
satisfaction_levels <- c("very dissatisfied", "moderately dissatisfied", "slightly dissatisfied",
                          "neutral", "moderately satisfied", "slightly satisfied", "very satisfied")

train_data$satisfaction <- factor(train_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
train_data$satisfaction_numeric <- as.numeric(train_data$satisfaction)
test_data$satisfaction <- factor(test_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
test_data$satisfaction_numeric <- as.numeric(test_data$satisfaction)

Predictive Models

Satisfaction Model

# Predictive model for the satisfaction level of an employee (linear regression)
satisfaction_model_lm <- lm(satisfaction_numeric ~ evaluation + number_project +
                              average_montly_hours + time_spend_company +
                              Work_accident + promotion_last_5years +
                              department + salary, data = train_data)

# Display model summary
summary(satisfaction_model_lm)

## 
## Call:
## lm(formula = satisfaction_numeric ~ evaluation + number_project + 
##     average_montly_hours + time_spend_company + Work_accident + 
##     promotion_last_5years + department + salary, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1549 -0.9684 -0.2544  1.2715  6.0388 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.4592263  0.1332584  18.455  < 2e-16 ***
## evaluationgood          0.2018552  0.0431954   4.673 3.01e-06 ***
## evaluationpoor         -0.3418342  0.0441473  -7.743 1.07e-14 ***
## number_project3         1.5466866  0.0587653  26.320  < 2e-16 ***
## number_project4         1.5945525  0.0585724  27.224  < 2e-16 ***
## number_project5         1.5518754  0.0652831  23.771  < 2e-16 ***
## number_project6        -0.9890729  0.0861712 -11.478  < 2e-16 ***
## number_project7        -1.8784318  0.1528396 -12.290  < 2e-16 ***
## average_montly_hours    0.0025191  0.0003886   6.482 9.45e-11 ***
## time_spend_company3    -0.1023707  0.0457588  -2.237  0.02530 *  
## time_spend_company4    -0.6317202  0.0590179 -10.704  < 2e-16 ***
## time_spend_company5    -0.3342516  0.0677626  -4.933 8.24e-07 ***
## time_spend_company6    -0.3540191  0.0857067  -4.131 3.65e-05 ***
## time_spend_company7    -0.3739658  0.1596484  -2.342  0.01918 *  
## time_spend_company8    -0.1536099  0.1680320  -0.914  0.36065    
## time_spend_company10   -0.4126934  0.1460447  -2.826  0.00473 ** 
## Work_accident1          0.0160644  0.0487702   0.329  0.74187    
## promotion_last_5years1  0.1001693  0.1211087   0.827  0.40820    
## departmenthr            0.0953797  0.1076701   0.886  0.37572    
## departmentIT            0.2146671  0.0961000   2.234  0.02552 *  
## departmentmanagement    0.2098379  0.1138359   1.843  0.06531 .  
## departmentmarketing     0.2580584  0.1028114   2.510  0.01209 *  
## departmentproduct_mng   0.2545571  0.1025364   2.483  0.01306 *  
## departmentRandD         0.1467039  0.1065717   1.377  0.16867    
## departmentsales         0.1729626  0.0824366   2.098  0.03592 *  
## departmentsupport       0.1517452  0.0878993   1.726  0.08432 .  
## departmenttechnical     0.1845927  0.0853314   2.163  0.03055 *  
## salarylow               0.0055320  0.0653048   0.085  0.93249    
## salarymedium            0.0256233  0.0653003   0.392  0.69478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.69 on 9971 degrees of freedom
## Multiple R-squared:  0.2744, Adjusted R-squared:  0.2723 
## F-statistic: 134.6 on 28 and 9971 DF,  p-value: < 2.2e-16

Leave Model

# Classification model for whether or not an employee will leave the company (logistic regression)
leave_model <- glm(left ~ satisfaction + evaluation + number_project +
                     average_montly_hours + time_spend_company +
                     Work_accident + promotion_last_5years +
                     department + salary, family = binomial(link = 'logit'),
                   data = train_data)

# Display model summary
summary(leave_model)

## 
## Call:
## glm(formula = left ~ satisfaction + evaluation + number_project + 
##     average_montly_hours + time_spend_company + Work_accident + 
##     promotion_last_5years + department + salary, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9456  -0.2174  -0.0640   0.0000   4.0548  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -8.000502   0.420741 -19.015  < 2e-16 ***
## satisfaction.L          -1.441340   0.133640 -10.785  < 2e-16 ***
## satisfaction.Q           0.874102   0.165231   5.290 1.22e-07 ***
## satisfaction.C           0.093725   0.139426   0.672  0.50144    
## satisfaction^4          -3.378539   0.162819 -20.750  < 2e-16 ***
## satisfaction^5           2.240247   0.140777  15.913  < 2e-16 ***
## satisfaction^6           0.281954   0.204820   1.377  0.16864    
## evaluationgood           1.740372   0.116782  14.903  < 2e-16 ***
## evaluationpoor           0.617081   0.133324   4.628 3.68e-06 ***
## number_project3         -4.392791   0.199222 -22.050  < 2e-16 ***
## number_project4         -2.890854   0.148508 -19.466  < 2e-16 ***
## number_project5         -2.175096   0.151608 -14.347  < 2e-16 ***
## number_project6         -1.090850   0.178546  -6.110 9.99e-10 ***
## number_project7         19.317948 442.120797   0.044  0.96515    
## average_montly_hours     0.015749   0.001087  14.495  < 2e-16 ***
## time_spend_company3      2.039147   0.215923   9.444  < 2e-16 ***
## time_spend_company4      3.155921   0.230256  13.706  < 2e-16 ***
## time_spend_company5      4.341799   0.228738  18.982  < 2e-16 ***
## time_spend_company6      3.299981   0.246939  13.364  < 2e-16 ***
## time_spend_company7    -13.873905 462.518254  -0.030  0.97607    
## time_spend_company8    -13.946272 506.090051  -0.028  0.97802    
## time_spend_company10   -13.637897 426.414227  -0.032  0.97449    
## Work_accident1          -1.309912   0.144426  -9.070  < 2e-16 ***
## promotion_last_5years1  -1.222569   0.407790  -2.998  0.00272 ** 
## departmenthr             0.252068   0.244986   1.029  0.30352    
## departmentIT            -0.230102   0.222162  -1.036  0.30032    
## departmentmanagement    -0.305417   0.282169  -1.082  0.27908    
## departmentmarketing      0.001724   0.237869   0.007  0.99422    
## departmentproduct_mng   -0.508625   0.234332  -2.171  0.02997 *  
## departmentRandD         -0.577379   0.261751  -2.206  0.02740 *  
## departmentsales          0.038742   0.186065   0.208  0.83506    
## departmentsupport       -0.008942   0.199414  -0.045  0.96423    
## departmenttechnical      0.072280   0.192045   0.376  0.70664    
## salarylow                1.730165   0.202534   8.543  < 2e-16 ***
## salarymedium             1.342978   0.204590   6.564 5.23e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10936  on 9999  degrees of freedom
## Residual deviance:  4060  on 9965  degrees of freedom
## AIC: 4130
## 
## Number of Fisher Scoring iterations: 17

Error rates for Predictive Models

# Satisfaction Model RMSE
satisfaction_pred <- predict(satisfaction_model_lm, newdata = test_data)
satisfaction_error <- sqrt(mean((satisfaction_pred - test_data$satisfaction_numeric)^2))
cat("Satisfaction Model RMSE:", satisfaction_error, "\n")

## Satisfaction Model RMSE: 1.691425

# Leave Model Error Rate
leave_pred <- predict(leave_model, newdata = test_data, type = 'response')
leave_pred_binary <- ifelse(leave_pred > 0.5, 1, 0)
leave_error <- mean(leave_pred_binary != test_data$left)
cat("Leave Model Error Rate:", leave_error, "\n")

## Leave Model Error Rate: 0.06868956

HR Data Analysis

Kelsey Pettrone

2023-12-15

Load data

Summary Statistics

Distribution of satisfaction

Distribution of Evaluation

Histogram of Average Monthly Hours

Distribution of Time Spent in Company

Distribution of Work Accidents

Distribution of Employee Departures

Distribution of Promotions in the Last 5 Years

Distribution of Employees by Department

Distribution of Salary Levels

Box Plot: Average Monthly Hours by Satisfaction Level

Box plot: Number of Projects by Department

Box Plot: Time Spent at Company by Department

Predictive Models

Satisfaction Model

Leave Model

Error rates for Predictive Models