library(tidyverse)
library(openintro)
data <- read_csv("~/Desktop/School Portfolio/UTSA/Undergrad/Data Mining/Project/HR_Analytics.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# data clean up
# Check the structure of the dataset
str(data)
## spc_tbl_ [1,470 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1470] "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num [1:1470] 1102 279 1373 1392 591 ...
## $ Department : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1470] "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1470] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1470] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1470] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr [1:1470] "Y" "Y" "Y" "Y" ...
## $ OverTime : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. Department = col_character(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. EducationField = col_character(),
## .. EmployeeCount = col_double(),
## .. EmployeeNumber = col_double(),
## .. EnvironmentSatisfaction = col_double(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. JobLevel = col_double(),
## .. JobRole = col_character(),
## .. JobSatisfaction = col_double(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. Over18 = col_character(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## # A tibble: 6 × 35
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 41 Yes Travel_Rarely 1102 Sales 1 2
## 2 49 No Travel_Freque… 279 Research … 8 1
## 3 37 Yes Travel_Rarely 1373 Research … 2 2
## 4 33 No Travel_Freque… 1392 Research … 3 4
## 5 27 No Travel_Rarely 591 Research … 2 1
## 6 32 No Travel_Freque… 1005 Research … 2 2
## # ℹ 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## # PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## # RelationshipSatisfaction <dbl>, StandardHours <dbl>, …
# Check for nulls and redundant constant fields, constant variables mess up the selection process since it’s based on variability
# Check for null values
null_columns <- sapply(data, function(x) any(is.na(x)))
if (any(null_columns)) {
print("Null values detected.")
print("Columns with null values:")
print(names(data)[null_columns])
} else {
print("No null values detected.")
}
## [1] "No null values detected."
# Check for constant fields
constant_columns <- sapply(data, function(x) length(unique(x)) == 1)
if (any(constant_columns)) {
print("Constant fields detected.")
print("Columns with constant values:")
print(names(data)[constant_columns])
} else {
print("No constant fields detected.")
}
## [1] "Constant fields detected."
## [1] "Columns with constant values:"
## [1] "EmployeeCount" "Over18" "StandardHours"
# Drop constant fields and "EmployeeNumber" variable, I’m dropping EmployeeNumber since this alone can’t be a good indicator of whether an employee will term as it’s unique for every employee and only differentiates employees from one another
data <- data %>%
select(-where(~length(unique(.)) == 1), -EmployeeNumber)
#This step may not be necessary but it shouldn’t affect model performance
# Convert character variables to factors
data <- data %>%
mutate_if(is.character, as.factor)
# Check the structure of the dataset
str(data)
## tibble [1,470 × 31] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : num [1:1470] 1102 279 1373 1392 591 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : num [1:1470] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1470] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
## $ StockOptionLevel : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
set.seed(1)
test_sample <- sample(1:nrow(data), nrow(data)/4)
train <- data[-test_sample, ]
test <- data[test_sample, ]
library(leaps)
fwd <- regsubsets(Attrition ~ ., data=train, nvmax=17, method='forward')
fwd_sum <- summary(fwd)
par(mfrow=c(2,2))
plot(fwd_sum$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fwd_sum$cp), fwd_sum$cp[which.min(fwd_sum$cp)], col="red",
cex=2, pch=20)
plot(fwd_sum$bic ,xlab="Number of Variables ",
ylab="BIC",type="b")
points(which.min(fwd_sum$bic), fwd_sum$bic[which.min(fwd_sum$bic)], col="red",
cex=2, pch=20)
plot(fwd_sum$adjr2 ,xlab="Number of Variables ",
ylab="Adjusted R^2^",type="b")
points(which.max(fwd_sum$adjr2), fwd_sum$adjr2[which.max(fwd_sum$adjr2)],
col="red", cex=2, pch=20)
which.min(fwd_sum$cp)
## [1] 17
## [1] 10
## [1] 17
test_matrix <- model.matrix(Attrition~., data=test)
val.errors <- rep(NA,17)
for(i in 1:17){
coefi <- coef(fwd,id=i)
pred <- test_matrix[,names(coefi)]%*%coefi
pred_binary <- ifelse(pred >= 1.5, 2, 1)
val.errors[i] <- mean((as.numeric(test$Attrition) - pred_binary)^2)
}
which.min(val.errors)
## [1] 16
plot(val.errors, type='b')
points(which.min(val.errors), val.errors[10], col='red', pch=20, cex=2)

fwd_full <- regsubsets(Attrition ~ ., data=data, nvmax=17, method='forward')
coef(fwd_full, 10)
## (Intercept) BusinessTravelTravel_Frequently
## 1.447840578 0.093717866
## EnvironmentSatisfaction JobInvolvement
## -0.042274767 -0.063619678
## JobRoleLaboratory Technician JobRoleSales Representative
## 0.084311789 0.204129425
## JobSatisfaction MaritalStatusSingle
## -0.038640531 0.119075546
## NumCompaniesWorked OverTimeYes
## 0.014978062 0.214539213
## TotalWorkingYears
## -0.006703014
# Get the indices of the selected variables
selected_indices <- which.min(fwd_sum$bic)
# Extract the names of the selected variables
selected_variables <- names(which(fwd_sum$which[selected_indices, ] != 0, arr.ind = TRUE))
sel_var <-selected_variables[-1]
# Print the selected variables
print(sel_var)
## [1] "BusinessTravelTravel_Frequently" "EnvironmentSatisfaction"
## [3] "JobInvolvement" "JobRoleSales Representative"
## [5] "JobSatisfaction" "MaritalStatusSingle"
## [7] "NumCompaniesWorked" "OverTimeYes"
## [9] "TotalWorkingYears" "WorkLifeBalance"
# Identify the columns that need one-hot encoding
columns_to_encode <- c("MaritalStatus", "BusinessTravel","JobRole","OverTime")
# One-hot encode selected columns
encoded_train <- model.matrix(~ . - 1, data = train[columns_to_encode])
encoded_test<-model.matrix(~ . - 1, data = test[columns_to_encode])
# Combine encoded columns with original dataset
encoded_train <- cbind(train, encoded_train)
encoded_test <- cbind(test, encoded_test)
# Filter out columns from the final dataset
train_fnl <- encoded_train[, c("Attrition", sel_var)]
test_fnl <- encoded_test[, c("Attrition", sel_var)]
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following objects are masked from 'package:openintro':
##
## ethanol, lsegments
##
## Attaching package: 'caret'
## The following object is masked from 'package:openintro':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Log model
# Fit logistic regression model using train_fnl
cv <- trainControl(method='cv', number=10, savePredictions = T)
logistic_model <- train(Attrition ~ ., data = train_fnl, method='glm', family = binomial, trControl=cv)
logistic_model
## Generalized Linear Model
##
## 1103 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 993, 993, 993, 993, 993, 993, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8694758 0.3854704
# Calculate precision, recall, and F1-score using the confusion matrix
predicted <- logistic_model$pred$pred
observed <- logistic_model$pred$obs
conf_matrix <- confusionMatrix(predicted, observed)
accuracy <- sum(conf_matrix$table[1,1], conf_matrix$table[2,2]) / sum(conf_matrix$table)
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
# Calculate AUC
roc_curve <- roc(as.numeric(observed), as.numeric(predicted))
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
auc_value <- auc(roc_curve)
# Display evaluation metrics
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.8694"
print(paste("Precision:", round(precision, 4)))
## [1] "Precision: 0.8824"
print(paste("Recall:", round(recall, 4)))
## [1] "Recall: 0.974"
print(paste("F1-score:", round(f1_score, 4)))
## [1] "F1-score: 0.9259"
print(paste("AUC:", round(auc_value, 4)))
## [1] "AUC: 0.6518"
# Plot ROC curve
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(roc_curve), 2)), adj = c(1, 0), pos = 4)

##LDA
# Fit LDA model using train_fnl
cv <- trainControl(method = 'cv', number = 10, savePredictions = TRUE)
lda_model <- train(Attrition ~ ., data = train_fnl, method = 'lda', trControl = cv)
lda_model
## Linear Discriminant Analysis
##
## 1103 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 994, 993, 993, 992, 993, 993, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8603583 0.3643567
# Calculate precision, recall, and F1-score using the confusion matrix
lda_predicted <- lda_model$pred$pred
lda_observed <- lda_model$pred$obs
lda_conf_matrix <- confusionMatrix(lda_predicted, lda_observed)
lda_accuracy <- sum(lda_conf_matrix$table[1,1], lda_conf_matrix$table[2,2]) / sum(lda_conf_matrix$table)
lda_precision <- lda_conf_matrix$byClass["Pos Pred Value"]
lda_recall <- lda_conf_matrix$byClass["Sensitivity"]
lda_f1_score <- 2 * (lda_precision * lda_recall) / (lda_precision + lda_recall)
# Calculate AUC
lda_roc_curve <- roc(as.numeric(lda_observed), as.numeric(lda_predicted))
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
lda_auc_value <- auc(lda_roc_curve)
# Display evaluation metrics
print(paste("Accuracy:", round(lda_accuracy, 4)))
## [1] "Accuracy: 0.8604"
print(paste("Precision:", round(lda_precision, 4)))
## [1] "Precision: 0.8827"
print(paste("Recall:", round(lda_recall, 4)))
## [1] "Recall: 0.961"
print(paste("F1-score:", round(lda_f1_score, 4)))
## [1] "F1-score: 0.9202"
print(paste("AUC:", round(lda_auc_value, 4)))
## [1] "AUC: 0.6509"
# Plot ROC curve
plot(lda_roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(lda_roc_curve), 2)), adj = c(1, 0), pos = 4)

#QDA
# Fit QDA model using train_fnl
cv <- trainControl(method = 'cv', number = 10, savePredictions = TRUE)
qda_model <- train(Attrition ~ ., data = train_fnl, method = 'qda', trControl = cv)
qda_model
## Quadratic Discriminant Analysis
##
## 1103 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 993, 992, 993, 993, 993, 993, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8377068 0.3049307
# Calculate precision, recall, and F1-score using the confusion matrix
qda_predicted <- qda_model$pred$pred
qda_observed <- qda_model$pred$obs
qda_conf_matrix <- confusionMatrix(qda_predicted, qda_observed)
qda_accuracy <- sum(qda_conf_matrix$table[1,1], qda_conf_matrix$table[2,2]) / sum(qda_conf_matrix$table)
qda_precision <- qda_conf_matrix$byClass["Pos Pred Value"]
qda_recall <- qda_conf_matrix$byClass["Sensitivity"]
qda_f1_score <- 2 * (qda_precision * qda_recall) / (qda_precision + qda_recall)
# Calculate AUC
qda_roc_curve <- roc(as.numeric(qda_observed), as.numeric(qda_predicted))
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
qda_auc_value <- auc(qda_roc_curve)
# Display evaluation metrics
print(paste("Accuracy:", round(qda_accuracy, 4)))
## [1] "Accuracy: 0.8377"
print(paste("Precision:", round(qda_precision, 4)))
## [1] "Precision: 0.8789"
print(paste("Recall:", round(qda_recall, 4)))
## [1] "Recall: 0.9351"
print(paste("F1-score:", round(qda_f1_score, 4)))
## [1] "F1-score: 0.9061"
print(paste("AUC:", round(qda_auc_value, 4)))
## [1] "AUC: 0.6351"
# Plot ROC curve
plot(qda_roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(qda_roc_curve), 2)), adj = c(1, 0), pos = 4)

# Parse out P values
log_p<-conf_matrix$overall["AccuracyPValue"]
lda_p<-lda_conf_matrix$overall["AccuracyPValue"]
qda_p<-qda_conf_matrix$overall["AccuracyPValue"]
# Create vectors for accuracy, recall, and AUC values for different models
model_accuracy <- c(accuracy, lda_accuracy, qda_accuracy)
model_recall <- c(recall, lda_recall, qda_recall)
model_auc_value <- c(auc_value, lda_auc_value, qda_auc_value)
model_pvalue <- c(log_p, lda_p, qda_p)
# Create a dataframe
comparison_table <- data.frame(Model = c("LOG", "LDA", "QDA"),
Accuracy = model_accuracy,
Recall = model_recall,
AUC_Value = model_auc_value,
P_Value = model_pvalue)
# Gives us a general look at some metrics we can compare, maybe something like this can be included in the slide deck
print(comparison_table)
## Model Accuracy Recall AUC_Value P_Value
## 1 LOG 0.8694470 0.9740260 0.6518175 0.001939485
## 2 LDA 0.8603808 0.9610390 0.6509105 0.021153476
## 3 QDA 0.8377153 0.9350649 0.6351302 0.519942645
---
title: "Project"
author: "Cheyenne Airington, Rani Misra"
date: "`r Sys.Date()`"
output: openintro::lab_report
---

```{r load-packages, message=FALSE}
library(tidyverse)
library(openintro)
```

```{r}
data <- read_csv("~/Desktop/School Portfolio/UTSA/Undergrad/Data Mining/Project/HR_Analytics.csv")

# data clean up 
# Check the structure of the dataset
str(data)
head(data)

# Check for nulls and redundant constant fields, constant variables mess up the selection process since it’s based on variability

# Check for null values
null_columns <- sapply(data, function(x) any(is.na(x)))
if (any(null_columns)) {
  print("Null values detected.")
  print("Columns with null values:")
  print(names(data)[null_columns])
} else {
  print("No null values detected.")
}

# Check for constant fields
constant_columns <- sapply(data, function(x) length(unique(x)) == 1)
if (any(constant_columns)) {
  print("Constant fields detected.")
  print("Columns with constant values:")
  print(names(data)[constant_columns])
} else {
  print("No constant fields detected.")
}

# Drop constant fields and "EmployeeNumber" variable, I’m dropping EmployeeNumber since this alone can’t be a good indicator of whether an employee will term as it’s unique for every employee and only differentiates employees from one another

data <- data %>%
  select(-where(~length(unique(.)) == 1), -EmployeeNumber)

#This step may not be necessary but it shouldn’t affect model performance
# Convert character variables to factors
data <- data %>%
  mutate_if(is.character, as.factor)

# Check the structure of the dataset
str(data)
```

```{r}
set.seed(1)
test_sample <- sample(1:nrow(data), nrow(data)/4)
train <- data[-test_sample, ]
test <- data[test_sample, ]

library(leaps)

fwd <- regsubsets(Attrition ~ ., data=train, nvmax=17, method='forward')
fwd_sum <- summary(fwd)

par(mfrow=c(2,2))
plot(fwd_sum$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fwd_sum$cp), fwd_sum$cp[which.min(fwd_sum$cp)], col="red", 
       cex=2, pch=20)

plot(fwd_sum$bic ,xlab="Number of Variables ", 
ylab="BIC",type="b")
points(which.min(fwd_sum$bic), fwd_sum$bic[which.min(fwd_sum$bic)], col="red", 
       cex=2, pch=20)

plot(fwd_sum$adjr2 ,xlab="Number of Variables ", 
ylab="Adjusted R^2^",type="b")
points(which.max(fwd_sum$adjr2), fwd_sum$adjr2[which.max(fwd_sum$adjr2)], 
       col="red", cex=2, pch=20)

which.min(fwd_sum$cp)
which.min(fwd_sum$bic)
which.max(fwd_sum$adjr2)

test_matrix <- model.matrix(Attrition~., data=test)

val.errors <- rep(NA,17)
for(i in 1:17){
  coefi <- coef(fwd,id=i)
  pred <- test_matrix[,names(coefi)]%*%coefi
  pred_binary <- ifelse(pred >= 1.5, 2, 1)
  val.errors[i] <- mean((as.numeric(test$Attrition) - pred_binary)^2) 
}

which.min(val.errors)

plot(val.errors, type='b')
points(which.min(val.errors), val.errors[10], col='red', pch=20, cex=2)


```

```{r}
fwd_full <- regsubsets(Attrition ~ ., data=data, nvmax=17, method='forward')
coef(fwd_full, 10)
```

```{r}
# Get the indices of the selected variables
selected_indices <- which.min(fwd_sum$bic)

# Extract the names of the selected variables
selected_variables <- names(which(fwd_sum$which[selected_indices, ] != 0, arr.ind = TRUE))
sel_var <-selected_variables[-1]
# Print the selected variables
print(sel_var)

# Identify the columns that need one-hot encoding
columns_to_encode <- c("MaritalStatus", "BusinessTravel","JobRole","OverTime")

# One-hot encode selected columns
encoded_train <- model.matrix(~ . - 1, data = train[columns_to_encode])
encoded_test<-model.matrix(~ . - 1, data = test[columns_to_encode])

# Combine encoded columns with original dataset
encoded_train <- cbind(train, encoded_train)
encoded_test <- cbind(test, encoded_test)

# Filter out columns from the final dataset
train_fnl <- encoded_train[, c("Attrition", sel_var)]
test_fnl <- encoded_test[, c("Attrition", sel_var)]
```

```{r}
library(caret)
library(pROC)

#Log model

# Fit logistic regression model using train_fnl
cv <- trainControl(method='cv', number=10, savePredictions = T)
logistic_model <- train(Attrition ~ ., data = train_fnl, method='glm', family = binomial, trControl=cv)
logistic_model

# Calculate precision, recall, and F1-score using the confusion matrix
predicted <- logistic_model$pred$pred
observed <- logistic_model$pred$obs
conf_matrix <- confusionMatrix(predicted, observed)

accuracy <- sum(conf_matrix$table[1,1], conf_matrix$table[2,2]) / sum(conf_matrix$table)
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)

# Calculate AUC
roc_curve <- roc(as.numeric(observed), as.numeric(predicted))
auc_value <- auc(roc_curve)

# Display evaluation metrics
print(paste("Accuracy:", round(accuracy, 4)))
print(paste("Precision:", round(precision, 4)))
print(paste("Recall:", round(recall, 4)))
print(paste("F1-score:", round(f1_score, 4)))
print(paste("AUC:", round(auc_value, 4)))

# Plot ROC curve
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(roc_curve), 2)), adj = c(1, 0), pos = 4)
```

```{r}
##LDA

# Fit LDA model using train_fnl
cv <- trainControl(method = 'cv', number = 10, savePredictions = TRUE)
lda_model <- train(Attrition ~ ., data = train_fnl, method = 'lda', trControl = cv)
lda_model

# Calculate precision, recall, and F1-score using the confusion matrix
lda_predicted <- lda_model$pred$pred
lda_observed <- lda_model$pred$obs
lda_conf_matrix <- confusionMatrix(lda_predicted, lda_observed)

lda_accuracy <- sum(lda_conf_matrix$table[1,1], lda_conf_matrix$table[2,2]) / sum(lda_conf_matrix$table)
lda_precision <- lda_conf_matrix$byClass["Pos Pred Value"]
lda_recall <- lda_conf_matrix$byClass["Sensitivity"]
lda_f1_score <- 2 * (lda_precision * lda_recall) / (lda_precision + lda_recall)

# Calculate AUC
lda_roc_curve <- roc(as.numeric(lda_observed), as.numeric(lda_predicted))
lda_auc_value <- auc(lda_roc_curve)

# Display evaluation metrics
print(paste("Accuracy:", round(lda_accuracy, 4)))
print(paste("Precision:", round(lda_precision, 4)))
print(paste("Recall:", round(lda_recall, 4)))
print(paste("F1-score:", round(lda_f1_score, 4)))
print(paste("AUC:", round(lda_auc_value, 4)))

# Plot ROC curve
plot(lda_roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(lda_roc_curve), 2)), adj = c(1, 0), pos = 4)
```

```{r}
#QDA
# Fit QDA model using train_fnl
cv <- trainControl(method = 'cv', number = 10, savePredictions = TRUE)
qda_model <- train(Attrition ~ ., data = train_fnl, method = 'qda', trControl = cv)
qda_model

# Calculate precision, recall, and F1-score using the confusion matrix
qda_predicted <- qda_model$pred$pred
qda_observed <- qda_model$pred$obs
qda_conf_matrix <- confusionMatrix(qda_predicted, qda_observed)

qda_accuracy <- sum(qda_conf_matrix$table[1,1], qda_conf_matrix$table[2,2]) / sum(qda_conf_matrix$table)
qda_precision <- qda_conf_matrix$byClass["Pos Pred Value"]
qda_recall <- qda_conf_matrix$byClass["Sensitivity"]
qda_f1_score <- 2 * (qda_precision * qda_recall) / (qda_precision + qda_recall)

# Calculate AUC
qda_roc_curve <- roc(as.numeric(qda_observed), as.numeric(qda_predicted))
qda_auc_value <- auc(qda_roc_curve)

# Display evaluation metrics
print(paste("Accuracy:", round(qda_accuracy, 4)))
print(paste("Precision:", round(qda_precision, 4)))
print(paste("Recall:", round(qda_recall, 4)))
print(paste("F1-score:", round(qda_f1_score, 4)))
print(paste("AUC:", round(qda_auc_value, 4)))

# Plot ROC curve
plot(qda_roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
text(0.95, 0, paste("AUC =", round(auc(qda_roc_curve), 2)), adj = c(1, 0), pos = 4)
```

```{r}
# Parse out P values
log_p<-conf_matrix$overall["AccuracyPValue"]
lda_p<-lda_conf_matrix$overall["AccuracyPValue"]
qda_p<-qda_conf_matrix$overall["AccuracyPValue"]

# Create vectors for accuracy, recall, and AUC values for different models
model_accuracy <- c(accuracy, lda_accuracy, qda_accuracy)
model_recall <- c(recall, lda_recall, qda_recall)
model_auc_value <- c(auc_value, lda_auc_value, qda_auc_value)
model_pvalue <- c(log_p, lda_p, qda_p)

# Create a dataframe
comparison_table <- data.frame(Model = c("LOG", "LDA", "QDA"),
                               Accuracy = model_accuracy,
                               Recall = model_recall,
                               AUC_Value = model_auc_value,
                               P_Value = model_pvalue)

# Gives us a general look at some metrics we can compare, maybe something like this can be included in the slide deck
print(comparison_table)
```

