library(ggplot2)

Load data

train_data <- read.csv("HR_AcostaDegenerSchrand.csv")
test_data <- read.csv("HR_test.csv")

Summary Statistics

summary(train_data)
##  satisfaction        evaluation        number_project  average_montly_hours
##  Length:10000       Length:10000       Min.   :2.000   Min.   : 96.0       
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:156.0       
##  Mode  :character   Mode  :character   Median :4.000   Median :200.0       
##                                        Mean   :3.795   Mean   :200.9       
##                                        3rd Qu.:5.000   3rd Qu.:245.0       
##                                        Max.   :7.000   Max.   :310.0       
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.0000       
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000       
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.0000       
##  Mean   : 3.499     Mean   :0.1419   Mean   :0.2363   Mean   :0.0209       
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000       
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000       
##   department           salary         
##  Length:10000       Length:10000      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Distribution of satisfaction

manual_order <- c("very dissatisfied", "slightly dissatisfied", "neutral", "slightly satisfied", "moderately dissatisfied", "moderately satisfied", "very satisfied")

ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate the count and percentage of each satisfaction level in the training dataset
satisfaction_table <- table(train_data$satisfaction)
percentage_table <- prop.table(satisfaction_table) * 100

# Create a data frame for better visualization
satisfaction_df <- data.frame(
  Satisfaction = names(satisfaction_table),
  Count = as.numeric(satisfaction_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(satisfaction_df)
##              Satisfaction Count Percentage
## 1 moderately dissatisfied  1473      14.73
## 2    moderately satisfied  1497      14.97
## 3                 neutral  1437      14.37
## 4   slightly dissatisfied  1452      14.52
## 5      slightly satisfied  1366      13.66
## 6       very dissatisfied  1442      14.42
## 7          very satisfied  1333      13.33
# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Distribution of Evaluation

# Calculate the count and percentage of each evaluation level in the training dataset
evaluation_order <- c("poor", "fair", "good") 
evaluation_table <- table(train_data$evaluation)
percentage_table <- prop.table(evaluation_table) * 100

# Create a data frame for better visualization
evaluation_df <- data.frame(
  Evaluation = names(evaluation_table),
  Count = as.numeric(evaluation_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(evaluation_df)
##   Evaluation Count Percentage
## 1       fair  3232      32.32
## 2       good  3287      32.87
## 3       poor  3481      34.81
# Create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(evaluation, levels = evaluation_order))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Evaluation") +
  ylab("Percentage") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

## Distribution of Number of Projects

evaluation_order <- c("poor", "fair", "good")

ggplot(train_data, aes(x = factor(evaluation, levels = evaluation_order))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Evaluation") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Histogram of Average Monthly Hours

# Create a histogram with specified bins
histogram_plot <- ggplot(train_data, aes(x = average_montly_hours)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of Average Monthly Hours") +
  xlab("Average Monthly Hours") +
  ylab("Frequency")
print(histogram_plot)

# Specify the number of bins
num_bins <- 30

# Create breaks for the histogram
breaks <- seq(min(train_data$average_montly_hours), max(train_data$average_montly_hours), length.out = num_bins + 1)

# Cut the data into bins
train_data$hour_bins <- cut(train_data$average_montly_hours, breaks = breaks, include.lowest = TRUE, right = TRUE)

# Calculate counts and percentages for each bin
hist_table_data <- as.data.frame(table(train_data$hour_bins))
hist_table_data$Percentage <- prop.table(hist_table_data$Freq) * 100

# Print the table
print(hist_table_data)
##         Var1 Freq Percentage
## 1   [96,103]   81       0.81
## 2  (103,110]   76       0.76
## 3  (110,117]   88       0.88
## 4  (117,125]   74       0.74
## 5  (125,132]  253       2.53
## 6  (132,139]  512       5.12
## 7  (139,146]  550       5.50
## 8  (146,153]  645       6.45
## 9  (153,160]  618       6.18
## 10 (160,167]  378       3.78
## 11 (167,174]  390       3.90
## 12 (174,182]  391       3.91
## 13 (182,189]  374       3.74
## 14 (189,196]  352       3.52
## 15 (196,203]  410       4.10
## 16 (203,210]  329       3.29
## 17 (210,217]  353       3.53
## 18 (217,224]  406       4.06
## 19 (224,232]  385       3.85
## 20 (232,239]  421       4.21
## 21 (239,246]  468       4.68
## 22 (246,253]  439       4.39
## 23 (253,260]  583       5.83
## 24 (260,267]  452       4.52
## 25 (267,274]  445       4.45
## 26 (274,281]  186       1.86
## 27 (281,289]  138       1.38
## 28 (289,296]   66       0.66
## 29 (296,303]   53       0.53
## 30 (303,310]   84       0.84

Distribution of Time Spent in Company

# Calculate the count and percentage of each time spent in the company level in the training dataset
time_spend_company_order <- c(2, 5, 4, 3, 7, 8, 6, 10)

time_spend_company_table <- table(train_data$time_spend_company)
percentage_time_spend_company_table <- prop.table(time_spend_company_table) * 100

# Create a data frame for better visualization
time_spend_company_df <- data.frame(
  Time_Spend_Company = names(time_spend_company_table),
  Count = as.numeric(time_spend_company_table),
  Percentage = as.numeric(percentage_time_spend_company_table)
)

# Print the table
print(time_spend_company_df)
##   Time_Spend_Company Count Percentage
## 1                  2  2191      21.91
## 2                  3  4270      42.70
## 3                  4  1683      16.83
## 4                  5   991       9.91
## 5                  6   488       4.88
## 6                  7   123       1.23
## 7                  8   107       1.07
## 8                 10   147       1.47
# Plot the bar chart with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(time_spend_company, levels = time_spend_company_order))) +
  geom_bar(fill = "lightpink") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Time Spent in Company") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Distribution of Work Accidents

ggplot(train_data, aes(x = factor(Work_accident, labels = c("no", "yes")))) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Work Accidents") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

# Create a table for 'Work Accidents'
work_accident_table <- table(train_data$Work_accident)
work_accident_percentage_table <- prop.table(work_accident_table) * 100

# Create a data frame for better visualization
work_accident_df <- data.frame(
  Work_Accident = names(work_accident_table),
  Count = as.numeric(work_accident_table),
  Percentage = as.numeric(work_accident_percentage_table)
)

# Print the table
print(work_accident_df)
##   Work_Accident Count Percentage
## 1             0  8581      85.81
## 2             1  1419      14.19

Distribution of Employee Departures

# Create a bar chart for 'left'
ggplot(train_data, aes(x = factor(left, labels = c("no", "yes")))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employee Departures") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()  # Remove x-axis title
  )

# Create a table for 'Employee Departures'
departure_table <- table(train_data$left)
departure_percentage_table <- prop.table(departure_table) * 100

# Create a data frame for better visualization
departure_df <- data.frame(
  Departure = names(departure_table),
  Count = as.numeric(departure_table),
  Percentage = as.numeric(departure_percentage_table)
)

# Print the table
print(departure_df)
##   Departure Count Percentage
## 1         0  7637      76.37
## 2         1  2363      23.63

Distribution of Promotions in the Last 5 Years

ggplot(train_data, aes(x = factor(promotion_last_5years, labels = c("no", "yes")))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Promotions in the Last 5 Years") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

# Create a table for 'Promotions in the Last 5 Years'
promotion_table <- table(train_data$promotion_last_5years)
promotion_percentage_table <- prop.table(promotion_table) * 100

# Create a data frame for better visualization
promotion_df <- data.frame(
  Promotion_Last_5_Years = names(promotion_table),
  Count = as.numeric(promotion_table),
  Percentage = as.numeric(promotion_percentage_table)
)

# Print the table
print(promotion_df)
##   Promotion_Last_5_Years Count Percentage
## 1                      0  9791      97.91
## 2                      1   209       2.09

Distribution of Employees by Department

# Calculate the count and percentage of each department in the training dataset
department_table <- table(train_data$department)
percentage_table <- prop.table(department_table) * 100

# Create a data frame for better visualization
department_df <- data.frame(
  Department = names(department_table),
  Count = as.numeric(department_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(department_df)
##     Department Count Percentage
## 1   accounting   499       4.99
## 2           hr   490       4.90
## 3           IT   819       8.19
## 4   management   429       4.29
## 5    marketing   593       5.93
## 6  product_mng   601       6.01
## 7        RandD   512       5.12
## 8        sales  2754      27.54
## 9      support  1446      14.46
## 10   technical  1857      18.57
# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = department)) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employees by Department") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution of Salary Levels

# Calculate the count and percentage of each salary level in the training dataset
salary_order <- c("low", "medium", "high")

salary_table <- table(train_data$salary)
percentage_table <- prop.table(salary_table) * 100

# Create a data frame for better visualization
salary_df <- data.frame(
  Salary_Level = names(salary_table),
  Count = as.numeric(salary_table),
  Percentage = as.numeric(percentage_table)
)

# Print the table
print(salary_df)
##   Salary_Level Count Percentage
## 1         high   841       8.41
## 2          low  4881      48.81
## 3       medium  4278      42.78
# Alternatively, you can use ggplot to create a bar plot with counts and percentages
library(ggplot2)

ggplot(train_data, aes(x = factor(salary, levels = salary_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    aes(label = paste0(..count.., " (", sprintf("%.1f%%", ..count../sum(..count..) * 100), ")")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Salary Levels") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Box Plot: Average Monthly Hours by Satisfaction Level

ggplot(train_data, aes(x = satisfaction, y = average_montly_hours, fill = satisfaction)) +
  geom_boxplot() +
  stat_summary(
    fun = "mean",
    geom = "point",
    position = position_dodge(0.75),
    shape = 4,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Box Plot: Average Monthly Hours by Satisfaction Level",
    x = "Satisfaction Level",
    y = "Average Monthly Hours"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a summary table using tapply and summary
summary_table <- tapply(
  train_data$average_montly_hours,
  train_data$satisfaction,
  function(x) c(
    Mean = mean(x),
    SD = sd(x),
    Median = median(x),
    IQR = IQR(x),
    Count = length(x)
  )
)

# Print the summary table
print(summary_table)
## $`moderately dissatisfied`
##      Mean        SD    Median       IQR     Count 
##  155.9925   34.5630  148.0000   24.0000 1473.0000 
## 
## $`moderately satisfied`
##       Mean         SD     Median        IQR      Count 
##  211.81296   43.65402  221.00000   72.00000 1497.00000 
## 
## $neutral
##       Mean         SD     Median        IQR      Count 
##  199.94502   44.94792  200.00000   77.00000 1437.00000 
## 
## $`slightly dissatisfied`
##       Mean         SD     Median        IQR      Count 
##  198.66529   45.44937  196.00000   75.25000 1452.00000 
## 
## $`slightly satisfied`
##       Mean         SD     Median        IQR      Count 
##  208.16764   43.82709  216.00000   73.00000 1366.00000 
## 
## $`very dissatisfied`
##       Mean         SD     Median        IQR      Count 
##  229.87587   56.99092  247.00000   92.75000 1442.00000 
## 
## $`very satisfied`
##       Mean         SD     Median        IQR      Count 
##  202.98650   44.42823  205.00000   76.00000 1333.00000

Box plot: Number of Projects by Department

ggplot(train_data, aes(x = department, y = number_project)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Number of Projects by Department",
       x = "Department",
       y = "Number of Projects") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Box Plot: Time Spent at Company by Department

box_plot_time <- ggplot(train_data, aes(x = department, y = time_spend_company)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Time Spent at Company by Department",
       x = "Department",
       y = "Time Spent at Company") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(box_plot_time)

## Data Preprocessing

test_data <- read.csv("HR_test.csv")
train_data <- read.csv("HR_AcostaDegenerSchrand.csv")
# Convert categorical variables to factors
for (var in names(train_data)) {
  if (var != "average_montly_hours") {
    train_data[[var]] <- factor(train_data[[var]])
    test_data[[var]] <- factor(test_data[[var]])
  }
}

# Convert satisfaction to numeric values
satisfaction_levels <- c("very dissatisfied", "moderately dissatisfied", "slightly dissatisfied",
                          "neutral", "moderately satisfied", "slightly satisfied", "very satisfied")

train_data$satisfaction <- factor(train_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
train_data$satisfaction_numeric <- as.numeric(train_data$satisfaction)
test_data$satisfaction <- factor(test_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
test_data$satisfaction_numeric <- as.numeric(test_data$satisfaction)

Predictive Models

Satisfaction Model

# Predictive model for the satisfaction level of an employee (linear regression)
satisfaction_model_lm <- lm(satisfaction_numeric ~ evaluation + number_project +
                              average_montly_hours + time_spend_company +
                              Work_accident + promotion_last_5years +
                              department + salary, data = train_data)

# Display model summary
summary(satisfaction_model_lm)
## 
## Call:
## lm(formula = satisfaction_numeric ~ evaluation + number_project + 
##     average_montly_hours + time_spend_company + Work_accident + 
##     promotion_last_5years + department + salary, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1549 -0.9684 -0.2544  1.2715  6.0388 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.4592263  0.1332584  18.455  < 2e-16 ***
## evaluationgood          0.2018552  0.0431954   4.673 3.01e-06 ***
## evaluationpoor         -0.3418342  0.0441473  -7.743 1.07e-14 ***
## number_project3         1.5466866  0.0587653  26.320  < 2e-16 ***
## number_project4         1.5945525  0.0585724  27.224  < 2e-16 ***
## number_project5         1.5518754  0.0652831  23.771  < 2e-16 ***
## number_project6        -0.9890729  0.0861712 -11.478  < 2e-16 ***
## number_project7        -1.8784318  0.1528396 -12.290  < 2e-16 ***
## average_montly_hours    0.0025191  0.0003886   6.482 9.45e-11 ***
## time_spend_company3    -0.1023707  0.0457588  -2.237  0.02530 *  
## time_spend_company4    -0.6317202  0.0590179 -10.704  < 2e-16 ***
## time_spend_company5    -0.3342516  0.0677626  -4.933 8.24e-07 ***
## time_spend_company6    -0.3540191  0.0857067  -4.131 3.65e-05 ***
## time_spend_company7    -0.3739658  0.1596484  -2.342  0.01918 *  
## time_spend_company8    -0.1536099  0.1680320  -0.914  0.36065    
## time_spend_company10   -0.4126934  0.1460447  -2.826  0.00473 ** 
## Work_accident1          0.0160644  0.0487702   0.329  0.74187    
## promotion_last_5years1  0.1001693  0.1211087   0.827  0.40820    
## departmenthr            0.0953797  0.1076701   0.886  0.37572    
## departmentIT            0.2146671  0.0961000   2.234  0.02552 *  
## departmentmanagement    0.2098379  0.1138359   1.843  0.06531 .  
## departmentmarketing     0.2580584  0.1028114   2.510  0.01209 *  
## departmentproduct_mng   0.2545571  0.1025364   2.483  0.01306 *  
## departmentRandD         0.1467039  0.1065717   1.377  0.16867    
## departmentsales         0.1729626  0.0824366   2.098  0.03592 *  
## departmentsupport       0.1517452  0.0878993   1.726  0.08432 .  
## departmenttechnical     0.1845927  0.0853314   2.163  0.03055 *  
## salarylow               0.0055320  0.0653048   0.085  0.93249    
## salarymedium            0.0256233  0.0653003   0.392  0.69478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.69 on 9971 degrees of freedom
## Multiple R-squared:  0.2744, Adjusted R-squared:  0.2723 
## F-statistic: 134.6 on 28 and 9971 DF,  p-value: < 2.2e-16

Leave Model

# Classification model for whether or not an employee will leave the company (logistic regression)
leave_model <- glm(left ~ satisfaction + evaluation + number_project +
                     average_montly_hours + time_spend_company +
                     Work_accident + promotion_last_5years +
                     department + salary, family = binomial(link = 'logit'),
                   data = train_data)

# Display model summary
summary(leave_model)
## 
## Call:
## glm(formula = left ~ satisfaction + evaluation + number_project + 
##     average_montly_hours + time_spend_company + Work_accident + 
##     promotion_last_5years + department + salary, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9456  -0.2174  -0.0640   0.0000   4.0548  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -8.000502   0.420741 -19.015  < 2e-16 ***
## satisfaction.L          -1.441340   0.133640 -10.785  < 2e-16 ***
## satisfaction.Q           0.874102   0.165231   5.290 1.22e-07 ***
## satisfaction.C           0.093725   0.139426   0.672  0.50144    
## satisfaction^4          -3.378539   0.162819 -20.750  < 2e-16 ***
## satisfaction^5           2.240247   0.140777  15.913  < 2e-16 ***
## satisfaction^6           0.281954   0.204820   1.377  0.16864    
## evaluationgood           1.740372   0.116782  14.903  < 2e-16 ***
## evaluationpoor           0.617081   0.133324   4.628 3.68e-06 ***
## number_project3         -4.392791   0.199222 -22.050  < 2e-16 ***
## number_project4         -2.890854   0.148508 -19.466  < 2e-16 ***
## number_project5         -2.175096   0.151608 -14.347  < 2e-16 ***
## number_project6         -1.090850   0.178546  -6.110 9.99e-10 ***
## number_project7         19.317948 442.120797   0.044  0.96515    
## average_montly_hours     0.015749   0.001087  14.495  < 2e-16 ***
## time_spend_company3      2.039147   0.215923   9.444  < 2e-16 ***
## time_spend_company4      3.155921   0.230256  13.706  < 2e-16 ***
## time_spend_company5      4.341799   0.228738  18.982  < 2e-16 ***
## time_spend_company6      3.299981   0.246939  13.364  < 2e-16 ***
## time_spend_company7    -13.873905 462.518254  -0.030  0.97607    
## time_spend_company8    -13.946272 506.090051  -0.028  0.97802    
## time_spend_company10   -13.637897 426.414227  -0.032  0.97449    
## Work_accident1          -1.309912   0.144426  -9.070  < 2e-16 ***
## promotion_last_5years1  -1.222569   0.407790  -2.998  0.00272 ** 
## departmenthr             0.252068   0.244986   1.029  0.30352    
## departmentIT            -0.230102   0.222162  -1.036  0.30032    
## departmentmanagement    -0.305417   0.282169  -1.082  0.27908    
## departmentmarketing      0.001724   0.237869   0.007  0.99422    
## departmentproduct_mng   -0.508625   0.234332  -2.171  0.02997 *  
## departmentRandD         -0.577379   0.261751  -2.206  0.02740 *  
## departmentsales          0.038742   0.186065   0.208  0.83506    
## departmentsupport       -0.008942   0.199414  -0.045  0.96423    
## departmenttechnical      0.072280   0.192045   0.376  0.70664    
## salarylow                1.730165   0.202534   8.543  < 2e-16 ***
## salarymedium             1.342978   0.204590   6.564 5.23e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10936  on 9999  degrees of freedom
## Residual deviance:  4060  on 9965  degrees of freedom
## AIC: 4130
## 
## Number of Fisher Scoring iterations: 17

Error rates for Predictive Models

# Satisfaction Model RMSE
satisfaction_pred <- predict(satisfaction_model_lm, newdata = test_data)
satisfaction_error <- sqrt(mean((satisfaction_pred - test_data$satisfaction_numeric)^2))
cat("Satisfaction Model RMSE:", satisfaction_error, "\n")
## Satisfaction Model RMSE: 1.691425
# Leave Model Error Rate
leave_pred <- predict(leave_model, newdata = test_data, type = 'response')
leave_pred_binary <- ifelse(leave_pred > 0.5, 1, 0)
leave_error <- mean(leave_pred_binary != test_data$left)
cat("Leave Model Error Rate:", leave_error, "\n")
## Leave Model Error Rate: 0.06868956