library(readr)
library(tidyverse)
library(plotly)

df <- read_csv("~/Library/Mobile Documents/com~apple~CloudDocs/Fairfield University/Fall 2024/DATA 6520/Assignments/Group/HR_comma_sep.csv") %>% 
  mutate(employee_status = ifelse(left == 1 , 'Left' , 'Stayed') ,
         work_accident = ifelse(Work_accident == 1 , 'Yes' , 'No') ,
         promotion_last_5years = ifelse(promotion_last_5years == 1 , 'Yes' , 'No')) %>% 
  select(-left , -Work_accident) %>% 
  rename(average_monthly_hours = average_montly_hours)


# Load required libraries
library(tidyverse)
library(caret)
library(car)
library(ROCR)
library(pROC)

# Prepare data for logistic regression
df_model <- df %>%
  mutate(target = ifelse(employee_status == "Left", 1, 0),
         department = as.factor(department),
         salary = as.factor(salary),
         work_accident = as.factor(work_accident),
         promotion_last_5years = as.factor(promotion_last_5years))

# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(df_model$target, p = 0.7, list = FALSE)
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]

# Create logistic regression model
model <- glm(target ~ satisfaction_level + last_evaluation + number_project + 
             average_monthly_hours + time_spend_company + work_accident + 
             promotion_last_5years + department + salary,
             data = train_data, family = "binomial")

# Display model summary
summary(model)
## 
## Call:
## glm(formula = target ~ satisfaction_level + last_evaluation + 
##     number_project + average_monthly_hours + time_spend_company + 
##     work_accident + promotion_last_5years + department + salary, 
##     family = "binomial", data = train_data)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.200751   0.360745  -6.101 1.06e-09 ***
## satisfaction_level       -4.639888   0.192578 -24.094  < 2e-16 ***
## last_evaluation           0.651139   0.300690   2.165   0.0304 *  
## number_project           -0.455438   0.043370 -10.501  < 2e-16 ***
## average_monthly_hours     0.002181   0.001057   2.063   0.0391 *  
## time_spend_company        1.010193   0.048974  20.627  < 2e-16 ***
## work_accidentYes         -1.477157   0.161526  -9.145  < 2e-16 ***
## promotion_last_5yearsYes -1.022645   0.482370  -2.120   0.0340 *  
## departmenthr              0.179725   0.255711   0.703   0.4822    
## departmentIT              0.169669   0.233481   0.727   0.4674    
## departmentmanagement      0.116421   0.320859   0.363   0.7167    
## departmentmarketing       0.122366   0.255587   0.479   0.6321    
## departmentproduct_mng     0.167479   0.252334   0.664   0.5069    
## departmentRandD          -0.412785   0.253574  -1.628   0.1036    
## departmentsales           0.192218   0.198308   0.969   0.3324    
## departmentsupport         0.235785   0.210242   1.121   0.2621    
## departmenttechnical       0.131901   0.204717   0.644   0.5194    
## salarylow                 1.718768   0.245719   6.995 2.66e-12 ***
## salarymedium              1.348420   0.247863   5.440 5.32e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5153.6  on 3946  degrees of freedom
## Residual deviance: 3548.4  on 3928  degrees of freedom
## AIC: 3586.4
## 
## Number of Fisher Scoring iterations: 5
# Calculate VIF to check multicollinearity
vif(model)
##                           GVIF Df GVIF^(1/(2*Df))
## satisfaction_level    1.258732  1        1.121932
## last_evaluation       1.786043  1        1.336429
## number_project        2.283550  1        1.511142
## average_monthly_hours 1.866632  1        1.366247
## time_spend_company    1.425222  1        1.193827
## work_accident         1.010641  1        1.005306
## promotion_last_5years 1.008243  1        1.004113
## department            1.020682  9        1.001138
## salary                1.014655  2        1.003644
# Make predictions on test set
predictions_prob <- predict(model, test_data, type = "response")
predictions_class <- ifelse(predictions_prob > 0.5, 1, 0)

# Calculate model performance metrics
conf_matrix <- confusionMatrix(factor(predictions_class), 
                             factor(test_data$target), 
                             positive = "1")

# Print model performance metrics
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 933 156
##          1 175 427
##                                           
##                Accuracy : 0.8043          
##                  95% CI : (0.7845, 0.8229)
##     No Information Rate : 0.6552          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5701          
##                                           
##  Mcnemar's Test P-Value : 0.3225          
##                                           
##             Sensitivity : 0.7324          
##             Specificity : 0.8421          
##          Pos Pred Value : 0.7093          
##          Neg Pred Value : 0.8567          
##              Prevalence : 0.3448          
##          Detection Rate : 0.2525          
##    Detection Prevalence : 0.3560          
##       Balanced Accuracy : 0.7872          
##                                           
##        'Positive' Class : 1               
## 
# Create ROC curve
roc_obj <- roc(test_data$target, predictions_prob)
auc_value <- auc(roc_obj)

# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Employee Attrition Model")
abline(a = 0, b = 1, lty = 2, col = "gray")
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)))

# Calculate odds ratios and confidence intervals
odds_ratios <- exp(coef(model))
conf_int <- exp(confint(model))

# Combine odds ratios and confidence intervals
odds_table <- data.frame(
  OR = odds_ratios,
  CI_lower = conf_int[,1],
  CI_upper = conf_int[,2]
) %>%
  round(3)

# Print odds ratios table
print(odds_table)
##                             OR CI_lower CI_upper
## (Intercept)              0.111    0.054    0.223
## satisfaction_level       0.010    0.007    0.014
## last_evaluation          1.918    1.065    3.463
## number_project           0.634    0.582    0.690
## average_monthly_hours    1.002    1.000    1.004
## time_spend_company       2.746    2.498    3.027
## work_accidentYes         0.228    0.165    0.311
## promotion_last_5yearsYes 0.360    0.129    0.872
## departmenthr             1.197    0.725    1.978
## departmentIT             1.185    0.750    1.875
## departmentmanagement     1.123    0.597    2.101
## departmentmarketing      1.130    0.685    1.866
## departmentproduct_mng    1.182    0.721    1.941
## departmentRandD          0.662    0.402    1.087
## departmentsales          1.212    0.823    1.793
## departmentsupport        1.266    0.840    1.916
## departmenttechnical      1.141    0.765    1.708
## salarylow                5.578    3.497    9.182
## salarymedium             3.851    2.404    6.364
# Feature importance based on absolute z-values
importance <- data.frame(
  Variable = names(coef(model)[-1]),  # Exclude intercept
  Importance = abs(summary(model)$coefficients[-1, "z value"])
) %>%
  arrange(desc(Importance))

# Plot feature importance
ggplot(importance, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance in Predicting Employee Attrition",
       x = "Variables",
       y = "Absolute Z-value")

# Investigate interaction effects
# Example interactions between satisfaction and other key variables
model_interactions <- glm(target ~ satisfaction_level * last_evaluation + 
                         satisfaction_level * number_project +
                         satisfaction_level * average_monthly_hours +
                         time_spend_company + work_accident + 
                         promotion_last_5years + department + salary,
                         data = train_data, family = "binomial")

# Compare models using AIC
aic_comparison <- AIC(model, model_interactions)
print(aic_comparison)
##                    df      AIC
## model              19 3586.362
## model_interactions 22 3393.768
# Plot predicted probabilities vs satisfaction level for different number of projects
ggplot(train_data, aes(x = satisfaction_level, y = target)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE) +
  facet_wrap(~number_project) +
  theme_minimal() +
  labs(title = "Predicted Attrition Probability vs Satisfaction by Number of Projects",
       x = "Satisfaction Level",
       y = "Predicted Probability of Leaving")