library(ggplot2)
# Read the two HR CSV files from the working directory.
train_data <- utils::read.csv(file = "HR_AcostaDegenerSchrand.csv")
test_data <- utils::read.csv(file = "HR_test.csv")
# Column-by-column overview of the training data.
summary(object = train_data)
## satisfaction evaluation number_project average_montly_hours
## Length:10000 Length:10000 Min. :2.000 Min. : 96.0
## Class :character Class :character 1st Qu.:3.000 1st Qu.:156.0
## Mode :character Mode :character Median :4.000 Median :200.0
## Mean :3.795 Mean :200.9
## 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :7.000 Max. :310.0
## time_spend_company Work_accident left promotion_last_5years
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean : 3.499 Mean :0.1419 Mean :0.2363 Mean :0.0209
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## department salary
## Length:10000 Length:10000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Ordinal ordering of the satisfaction labels, from most negative to most
# positive, used to order the x axis of the satisfaction bar charts.
manual_order <- c(
  "very dissatisfied", "moderately dissatisfied", "slightly dissatisfied",
  "neutral",
  "slightly satisfied", "moderately satisfied", "very satisfied"
)
# Bar chart of the number of employees per satisfaction level, with the count
# printed in the middle of each bar.
ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Tabulate how often each satisfaction label occurs in the training data and
# express the same counts as percentages of all rows.
satisfaction_table <- table(train_data[["satisfaction"]])
percentage_table <- 100 * prop.table(satisfaction_table)
# One row per satisfaction label: label, count, percentage.
satisfaction_df <- data.frame(
  Satisfaction = names(satisfaction_table),
  Count = as.numeric(satisfaction_table),
  Percentage = as.numeric(percentage_table)
)
# Show the result.
print(satisfaction_df)
## Satisfaction Count Percentage
## 1 moderately dissatisfied 1473 14.73
## 2 moderately satisfied 1497 14.97
## 3 neutral 1437 14.37
## 4 slightly dissatisfied 1452 14.52
## 5 slightly satisfied 1366 13.66
## 6 very dissatisfied 1442 14.42
## 7 very satisfied 1333 13.33
# Same satisfaction bar chart, but each bar is labelled with its count and its
# share of all rows, e.g. "1473 (14.7%)". ggplot2 is already attached at the
# top of the script, so it is not loaded again here.
ggplot(train_data, aes(x = factor(satisfaction, levels = manual_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation; the whole
    # label is built from the computed per-bar counts.
    aes(label = after_stat(
      paste0(count, " (", sprintf("%.1f%%", count / sum(count) * 100), ")")
    )),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Satisfaction") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Display order for the evaluation labels (used by the evaluation bar chart).
evaluation_order <- c("poor", "fair", "good")
# Frequency of each evaluation label in the training data, plus the same
# figures as percentages of all rows.
evaluation_table <- table(train_data[["evaluation"]])
percentage_table <- 100 * prop.table(evaluation_table)
# One row per evaluation label: label, count, percentage.
evaluation_df <- data.frame(
  Evaluation = names(evaluation_table),
  Count = as.numeric(evaluation_table),
  Percentage = as.numeric(percentage_table)
)
# Show the result.
print(evaluation_df)
## Evaluation Count Percentage
## 1 fair 3232 32.32
## 2 good 3287 32.87
## 3 poor 3481 34.81
# Bar chart of employees per evaluation level; each bar is labelled with its
# count and its share of all rows. ggplot2 is already attached at the top of
# the script, so it is not loaded again here.
ggplot(train_data, aes(x = factor(evaluation, levels = evaluation_order))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(
      paste0(count, " (", sprintf("%.1f%%", count / sum(count) * 100), ")")
    )),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Evaluation") +
  # Bar heights are row counts (geom_bar default), so the axis is labelled
  # "Count"; percentages appear only in the text labels.
  ylab("Count") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
## Distribution of Number of Projects
# Bar chart of how many employees have each number of projects, with the count
# printed in the middle of each bar. number_project is wrapped in factor() so
# every distinct project count gets its own discrete bar.
ggplot(train_data, aes(x = factor(number_project))) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Number of Projects") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Histogram of average monthly hours together with a per-bin frequency table.
# The bin edges are computed once and shared by the plot and the table so that
# the printed counts describe exactly the bars that are drawn.

# Number of equal-width bins
num_bins <- 30
# Bin edges spanning the observed range of average_montly_hours
breaks <- seq(
  min(train_data$average_montly_hours),
  max(train_data$average_montly_hours),
  length.out = num_bins + 1
)
# Draw the histogram on those edges
histogram_plot <- ggplot(train_data, aes(x = average_montly_hours)) +
  geom_histogram(fill = "lightblue", color = "black", breaks = breaks) +
  labs(title = "Histogram of Average Monthly Hours") +
  xlab("Average Monthly Hours") +
  ylab("Frequency")
print(histogram_plot)
# Assign every row to its bin; intervals are right-closed and the lowest edge
# is included so the minimum value is not dropped.
train_data$hour_bins <- cut(
  train_data$average_montly_hours,
  breaks = breaks,
  include.lowest = TRUE,
  right = TRUE
)
# Count and percentage of rows per bin
hist_table_data <- as.data.frame(table(train_data$hour_bins))
hist_table_data$Percentage <- prop.table(hist_table_data$Freq) * 100
# Print the table
print(hist_table_data)
## Var1 Freq Percentage
## 1 [96,103] 81 0.81
## 2 (103,110] 76 0.76
## 3 (110,117] 88 0.88
## 4 (117,125] 74 0.74
## 5 (125,132] 253 2.53
## 6 (132,139] 512 5.12
## 7 (139,146] 550 5.50
## 8 (146,153] 645 6.45
## 9 (153,160] 618 6.18
## 10 (160,167] 378 3.78
## 11 (167,174] 390 3.90
## 12 (174,182] 391 3.91
## 13 (182,189] 374 3.74
## 14 (189,196] 352 3.52
## 15 (196,203] 410 4.10
## 16 (203,210] 329 3.29
## 17 (210,217] 353 3.53
## 18 (217,224] 406 4.06
## 19 (224,232] 385 3.85
## 20 (232,239] 421 4.21
## 21 (239,246] 468 4.68
## 22 (246,253] 439 4.39
## 23 (253,260] 583 5.83
## 24 (260,267] 452 4.52
## 25 (267,274] 445 4.45
## 26 (274,281] 186 1.86
## 27 (281,289] 138 1.38
## 28 (289,296] 66 0.66
## 29 (296,303] 53 0.53
## 30 (303,310] 84 0.84
# Count and percentage of each time_spend_company value in the training data.

# Axis order for the time_spend_company bar chart: the distinct observed
# values in ascending numeric order.
time_spend_company_order <- sort(unique(train_data$time_spend_company))
time_spend_company_table <- table(train_data$time_spend_company)
percentage_time_spend_company_table <- prop.table(time_spend_company_table) * 100
# One row per distinct value: value, count, percentage
time_spend_company_df <- data.frame(
  Time_Spend_Company = names(time_spend_company_table),
  Count = as.numeric(time_spend_company_table),
  Percentage = as.numeric(percentage_time_spend_company_table)
)
# Print the table
print(time_spend_company_df)
## Time_Spend_Company Count Percentage
## 1 2 2191 21.91
## 2 3 4270 42.70
## 3 4 1683 16.83
## 4 5 991 9.91
## 5 6 488 4.88
## 6 7 123 1.23
## 7 8 107 1.07
## 8 10 147 1.47
# Bar chart of employees per time_spend_company value; each bar is labelled
# with its count and its share of all rows. ggplot2 is already attached at the
# top of the script, so it is not loaded again here.
ggplot(train_data, aes(x = factor(time_spend_company, levels = time_spend_company_order))) +
  geom_bar(fill = "lightpink") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(
      paste0(count, " (", sprintf("%.1f%%", count / sum(count) * 100), ")")
    )),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Time Spent in Company") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Bar chart of the Work_accident indicator with the count printed in each bar.
# The 0/1 codes are mapped explicitly to "no"/"yes" so the labels cannot be
# misassigned (or the call fail) if one of the two codes is absent.
ggplot(
  train_data,
  aes(x = factor(Work_accident, levels = c(0, 1), labels = c("no", "yes")))
) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Work Accidents") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Frequency of each Work_accident code, plus the same figures as percentages
# of all rows.
work_accident_table <- table(train_data[["Work_accident"]])
work_accident_percentage_table <- 100 * prop.table(work_accident_table)
# One row per code: code, count, percentage.
work_accident_df <- data.frame(
  Work_Accident = names(work_accident_table),
  Count = as.numeric(work_accident_table),
  Percentage = as.numeric(work_accident_percentage_table)
)
# Show the result.
print(work_accident_df)
## Work_Accident Count Percentage
## 1 0 8581 85.81
## 2 1 1419 14.19
# Bar chart of the `left` indicator with the count printed in each bar.
# The 0/1 codes are mapped explicitly to "no"/"yes" so the labels cannot be
# misassigned (or the call fail) if one of the two codes is absent.
ggplot(
  train_data,
  aes(x = factor(left, levels = c(0, 1), labels = c("no", "yes")))
) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employee Departures") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank() # Remove x-axis title
  )
# Frequency of each `left` code, plus the same figures as percentages of all
# rows.
departure_table <- table(train_data[["left"]])
departure_percentage_table <- 100 * prop.table(departure_table)
# One row per code: code, count, percentage.
departure_df <- data.frame(
  Departure = names(departure_table),
  Count = as.numeric(departure_table),
  Percentage = as.numeric(departure_percentage_table)
)
# Show the result.
print(departure_df)
## Departure Count Percentage
## 1 0 7637 76.37
## 2 1 2363 23.63
# Bar chart of the promotion_last_5years indicator with the count printed in
# each bar. The 0/1 codes are mapped explicitly to "no"/"yes" so the labels
# cannot be misassigned (or the call fail) if one of the two codes is absent.
ggplot(
  train_data,
  aes(x = factor(promotion_last_5years, levels = c(0, 1), labels = c("no", "yes")))
) +
  geom_bar(fill = "lightgreen") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Promotions in the Last 5 Years") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Frequency of each promotion_last_5years code, plus the same figures as
# percentages of all rows.
promotion_table <- table(train_data[["promotion_last_5years"]])
promotion_percentage_table <- 100 * prop.table(promotion_table)
# One row per code: code, count, percentage.
promotion_df <- data.frame(
  Promotion_Last_5_Years = names(promotion_table),
  Count = as.numeric(promotion_table),
  Percentage = as.numeric(promotion_percentage_table)
)
# Show the result.
print(promotion_df)
## Promotion_Last_5_Years Count Percentage
## 1 0 9791 97.91
## 2 1 209 2.09
# Frequency of each department in the training data, plus the same figures as
# percentages of all rows.
department_table <- table(train_data[["department"]])
percentage_table <- 100 * prop.table(department_table)
# One row per department: name, count, percentage.
department_df <- data.frame(
  Department = names(department_table),
  Count = as.numeric(department_table),
  Percentage = as.numeric(percentage_table)
)
# Show the result.
print(department_df)
## Department Count Percentage
## 1 accounting 499 4.99
## 2 hr 490 4.90
## 3 IT 819 8.19
## 4 management 429 4.29
## 5 marketing 593 5.93
## 6 product_mng 601 6.01
## 7 RandD 512 5.12
## 8 sales 2754 27.54
## 9 support 1446 14.46
## 10 technical 1857 18.57
# Bar chart of employees per department; each bar is labelled with its count
# and its share of all rows. ggplot2 is already attached at the top of the
# script, so it is not loaded again here.
ggplot(train_data, aes(x = department)) +
  geom_bar(fill = "lightcoral") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(
      paste0(count, " (", sprintf("%.1f%%", count / sum(count) * 100), ")")
    )),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Employees by Department") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Display order for the salary labels (used by the salary bar chart).
salary_order <- c("low", "medium", "high")
# Frequency of each salary label in the training data, plus the same figures
# as percentages of all rows.
salary_table <- table(train_data[["salary"]])
percentage_table <- 100 * prop.table(salary_table)
# One row per salary label: label, count, percentage.
salary_df <- data.frame(
  Salary_Level = names(salary_table),
  Count = as.numeric(salary_table),
  Percentage = as.numeric(percentage_table)
)
# Show the result.
print(salary_df)
## Salary_Level Count Percentage
## 1 high 841 8.41
## 2 low 4881 48.81
## 3 medium 4278 42.78
# Bar chart of employees per salary level; each bar is labelled with its count
# and its share of all rows. ggplot2 is already attached at the top of the
# script, so it is not loaded again here.
ggplot(train_data, aes(x = factor(salary, levels = salary_order))) +
  geom_bar(fill = "lightblue") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated `..count..` notation.
    aes(label = after_stat(
      paste0(count, " (", sprintf("%.1f%%", count / sum(count) * 100), ")")
    )),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 3
  ) +
  labs(title = "Distribution of Salary Levels") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Box plots of average monthly hours for each satisfaction level, with the
# group mean overlaid as a black cross.
# Satisfaction is an ordinal scale, so the boxes are laid out from most
# negative to most positive instead of alphabetically.
satisfaction_axis_order <- c(
  "very dissatisfied", "moderately dissatisfied", "slightly dissatisfied",
  "neutral",
  "slightly satisfied", "moderately satisfied", "very satisfied"
)
ggplot(
  # Re-level satisfaction on a copy of the data; train_data itself is unchanged.
  transform(
    train_data,
    satisfaction = factor(satisfaction, levels = satisfaction_axis_order)
  ),
  aes(x = satisfaction, y = average_montly_hours, fill = satisfaction)
) +
  geom_boxplot() +
  stat_summary(
    fun = "mean",
    geom = "point",
    position = position_dodge(0.75),
    shape = 4,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Box Plot: Average Monthly Hours by Satisfaction Level",
    x = "Satisfaction Level",
    y = "Average Monthly Hours"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Descriptive statistics of average monthly hours within each satisfaction
# label: one named vector (Mean, SD, Median, IQR, Count) per label.
summary_table <- with(
  train_data,
  tapply(
    X = average_montly_hours,
    INDEX = satisfaction,
    FUN = function(hours) {
      c(
        Mean = mean(hours),
        SD = sd(hours),
        Median = median(hours),
        IQR = IQR(hours),
        Count = length(hours)
      )
    }
  )
)
# Show the per-label statistics.
print(summary_table)
## $`moderately dissatisfied`
## Mean SD Median IQR Count
## 155.9925 34.5630 148.0000 24.0000 1473.0000
##
## $`moderately satisfied`
## Mean SD Median IQR Count
## 211.81296 43.65402 221.00000 72.00000 1497.00000
##
## $neutral
## Mean SD Median IQR Count
## 199.94502 44.94792 200.00000 77.00000 1437.00000
##
## $`slightly dissatisfied`
## Mean SD Median IQR Count
## 198.66529 45.44937 196.00000 75.25000 1452.00000
##
## $`slightly satisfied`
## Mean SD Median IQR Count
## 208.16764 43.82709 216.00000 73.00000 1366.00000
##
## $`very dissatisfied`
## Mean SD Median IQR Count
## 229.87587 56.99092 247.00000 92.75000 1442.00000
##
## $`very satisfied`
## Mean SD Median IQR Count
## 202.98650 44.42823 205.00000 76.00000 1333.00000
# Box plot of the number of projects per employee, one box per department.
ggplot(data = train_data, mapping = aes(x = department, y = number_project)) +
  geom_boxplot(colour = "darkblue", fill = "skyblue") +
  labs(
    title = "Number of Projects by Department",
    x = "Department",
    y = "Number of Projects"
  ) +
  theme_minimal() +
  # Slant the department names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Box plot of time_spend_company, one box per department; the plot object is
# stored first and then rendered explicitly.
box_plot_time <- ggplot(
  data = train_data,
  mapping = aes(x = department, y = time_spend_company)
) +
  geom_boxplot(colour = "darkblue", fill = "skyblue") +
  labs(
    title = "Time Spent at Company by Department",
    x = "Department",
    y = "Time Spent at Company"
  ) +
  theme_minimal() +
  # Slant the department names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(box_plot_time)
## Data Preprocessing
# Re-read both datasets; this discards the hour_bins column that was added to
# train_data during the exploratory section.
test_data <- read.csv("HR_test.csv")
train_data <- read.csv("HR_AcostaDegenerSchrand.csv")
# Convert every column except average_montly_hours to a factor, in both
# datasets. Note that this also turns number_project and time_spend_company
# into categorical predictors.
for (var in names(train_data)) {
  if (var != "average_montly_hours") {
    train_data[[var]] <- factor(train_data[[var]])
    test_data[[var]] <- factor(test_data[[var]])
  }
}
# Encode satisfaction as an ordered factor and as its 1-7 rank.
# The levels run from most negative to most positive; "slightly satisfied"
# ranks below "moderately satisfied".
# NOTE(review): the model summaries and error figures recorded further down in
# this file were presumably produced with the earlier level order (which had
# those two levels swapped) and should be regenerated.
satisfaction_levels <- c(
  "very dissatisfied", "moderately dissatisfied", "slightly dissatisfied",
  "neutral",
  "slightly satisfied", "moderately satisfied", "very satisfied"
)
train_data$satisfaction <- factor(train_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
train_data$satisfaction_numeric <- as.numeric(train_data$satisfaction)
test_data$satisfaction <- factor(test_data$satisfaction, levels = satisfaction_levels, ordered = TRUE)
test_data$satisfaction_numeric <- as.numeric(test_data$satisfaction)
# Linear regression of the numeric satisfaction rank on the remaining employee
# attributes, fitted on the training data.
satisfaction_model_lm <- lm(
  formula = satisfaction_numeric ~ evaluation + number_project +
    average_montly_hours + time_spend_company + Work_accident +
    promotion_last_5years + department + salary,
  data = train_data
)
# Coefficients, residual summary and fit statistics.
summary(satisfaction_model_lm)
##
## Call:
## lm(formula = satisfaction_numeric ~ evaluation + number_project +
## average_montly_hours + time_spend_company + Work_accident +
## promotion_last_5years + department + salary, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1549 -0.9684 -0.2544 1.2715 6.0388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4592263 0.1332584 18.455 < 2e-16 ***
## evaluationgood 0.2018552 0.0431954 4.673 3.01e-06 ***
## evaluationpoor -0.3418342 0.0441473 -7.743 1.07e-14 ***
## number_project3 1.5466866 0.0587653 26.320 < 2e-16 ***
## number_project4 1.5945525 0.0585724 27.224 < 2e-16 ***
## number_project5 1.5518754 0.0652831 23.771 < 2e-16 ***
## number_project6 -0.9890729 0.0861712 -11.478 < 2e-16 ***
## number_project7 -1.8784318 0.1528396 -12.290 < 2e-16 ***
## average_montly_hours 0.0025191 0.0003886 6.482 9.45e-11 ***
## time_spend_company3 -0.1023707 0.0457588 -2.237 0.02530 *
## time_spend_company4 -0.6317202 0.0590179 -10.704 < 2e-16 ***
## time_spend_company5 -0.3342516 0.0677626 -4.933 8.24e-07 ***
## time_spend_company6 -0.3540191 0.0857067 -4.131 3.65e-05 ***
## time_spend_company7 -0.3739658 0.1596484 -2.342 0.01918 *
## time_spend_company8 -0.1536099 0.1680320 -0.914 0.36065
## time_spend_company10 -0.4126934 0.1460447 -2.826 0.00473 **
## Work_accident1 0.0160644 0.0487702 0.329 0.74187
## promotion_last_5years1 0.1001693 0.1211087 0.827 0.40820
## departmenthr 0.0953797 0.1076701 0.886 0.37572
## departmentIT 0.2146671 0.0961000 2.234 0.02552 *
## departmentmanagement 0.2098379 0.1138359 1.843 0.06531 .
## departmentmarketing 0.2580584 0.1028114 2.510 0.01209 *
## departmentproduct_mng 0.2545571 0.1025364 2.483 0.01306 *
## departmentRandD 0.1467039 0.1065717 1.377 0.16867
## departmentsales 0.1729626 0.0824366 2.098 0.03592 *
## departmentsupport 0.1517452 0.0878993 1.726 0.08432 .
## departmenttechnical 0.1845927 0.0853314 2.163 0.03055 *
## salarylow 0.0055320 0.0653048 0.085 0.93249
## salarymedium 0.0256233 0.0653003 0.392 0.69478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.69 on 9971 degrees of freedom
## Multiple R-squared: 0.2744, Adjusted R-squared: 0.2723
## F-statistic: 134.6 on 28 and 9971 DF, p-value: < 2.2e-16
# Logistic regression (binomial family, logit link) for whether an employee
# left the company, fitted on the training data.
leave_model <- glm(
  formula = left ~ satisfaction + evaluation + number_project +
    average_montly_hours + time_spend_company + Work_accident +
    promotion_last_5years + department + salary,
  family = binomial(link = "logit"),
  data = train_data
)
# Coefficients, deviance and fit statistics.
summary(leave_model)
##
## Call:
## glm(formula = left ~ satisfaction + evaluation + number_project +
## average_montly_hours + time_spend_company + Work_accident +
## promotion_last_5years + department + salary, family = binomial(link = "logit"),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9456 -0.2174 -0.0640 0.0000 4.0548
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.000502 0.420741 -19.015 < 2e-16 ***
## satisfaction.L -1.441340 0.133640 -10.785 < 2e-16 ***
## satisfaction.Q 0.874102 0.165231 5.290 1.22e-07 ***
## satisfaction.C 0.093725 0.139426 0.672 0.50144
## satisfaction^4 -3.378539 0.162819 -20.750 < 2e-16 ***
## satisfaction^5 2.240247 0.140777 15.913 < 2e-16 ***
## satisfaction^6 0.281954 0.204820 1.377 0.16864
## evaluationgood 1.740372 0.116782 14.903 < 2e-16 ***
## evaluationpoor 0.617081 0.133324 4.628 3.68e-06 ***
## number_project3 -4.392791 0.199222 -22.050 < 2e-16 ***
## number_project4 -2.890854 0.148508 -19.466 < 2e-16 ***
## number_project5 -2.175096 0.151608 -14.347 < 2e-16 ***
## number_project6 -1.090850 0.178546 -6.110 9.99e-10 ***
## number_project7 19.317948 442.120797 0.044 0.96515
## average_montly_hours 0.015749 0.001087 14.495 < 2e-16 ***
## time_spend_company3 2.039147 0.215923 9.444 < 2e-16 ***
## time_spend_company4 3.155921 0.230256 13.706 < 2e-16 ***
## time_spend_company5 4.341799 0.228738 18.982 < 2e-16 ***
## time_spend_company6 3.299981 0.246939 13.364 < 2e-16 ***
## time_spend_company7 -13.873905 462.518254 -0.030 0.97607
## time_spend_company8 -13.946272 506.090051 -0.028 0.97802
## time_spend_company10 -13.637897 426.414227 -0.032 0.97449
## Work_accident1 -1.309912 0.144426 -9.070 < 2e-16 ***
## promotion_last_5years1 -1.222569 0.407790 -2.998 0.00272 **
## departmenthr 0.252068 0.244986 1.029 0.30352
## departmentIT -0.230102 0.222162 -1.036 0.30032
## departmentmanagement -0.305417 0.282169 -1.082 0.27908
## departmentmarketing 0.001724 0.237869 0.007 0.99422
## departmentproduct_mng -0.508625 0.234332 -2.171 0.02997 *
## departmentRandD -0.577379 0.261751 -2.206 0.02740 *
## departmentsales 0.038742 0.186065 0.208 0.83506
## departmentsupport -0.008942 0.199414 -0.045 0.96423
## departmenttechnical 0.072280 0.192045 0.376 0.70664
## salarylow 1.730165 0.202534 8.543 < 2e-16 ***
## salarymedium 1.342978 0.204590 6.564 5.23e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10936 on 9999 degrees of freedom
## Residual deviance: 4060 on 9965 degrees of freedom
## AIC: 4130
##
## Number of Fisher Scoring iterations: 17
# Root-mean-squared error of the linear satisfaction model on the test data:
# predict the numeric satisfaction rank, then compare with the observed rank.
satisfaction_pred <- predict(object = satisfaction_model_lm, newdata = test_data)
satisfaction_error <- sqrt(
  mean((test_data$satisfaction_numeric - satisfaction_pred)^2)
)
cat("Satisfaction Model RMSE:", satisfaction_error, "\n")
## Satisfaction Model RMSE: 1.691425
# Misclassification rate of the logistic leave model on the test data.
# Predicted probabilities above 0.5 are classified as 1, all others as 0.
leave_pred <- predict(object = leave_model, newdata = test_data, type = "response")
leave_pred_binary <- ifelse(leave_pred > 0.5, 1, 0)
# Share of test rows whose predicted class differs from the recorded `left`.
leave_error <- mean(test_data$left != leave_pred_binary)
cat("Leave Model Error Rate:", leave_error, "\n")
## Leave Model Error Rate: 0.06868956