#libraries to use
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(corrplot)

## corrplot 0.92 loaded

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(dummy)

## dummy 0.1.3
## dummyNews()

attrition <- read.csv("Employee_Attrition.csv")

str(attrition)

## 'data.frame':    1470 obs. of  35 variables:
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr  "Yes" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr  "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr  "Female" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr  "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ MonthlyRate             : int  19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : int  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : int  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : int  5 7 0 0 2 6 0 0 8 7 ...

names(attrition)

##  [1] "Age"                      "Attrition"               
##  [3] "BusinessTravel"           "DailyRate"               
##  [5] "Department"               "DistanceFromHome"        
##  [7] "Education"                "EducationField"          
##  [9] "EmployeeCount"            "EmployeeNumber"          
## [11] "EnvironmentSatisfaction"  "Gender"                  
## [13] "HourlyRate"               "JobInvolvement"          
## [15] "JobLevel"                 "JobRole"                 
## [17] "JobSatisfaction"          "MaritalStatus"           
## [19] "MonthlyIncome"            "MonthlyRate"             
## [21] "NumCompaniesWorked"       "Over18"                  
## [23] "OverTime"                 "PercentSalaryHike"       
## [25] "PerformanceRating"        "RelationshipSatisfaction"
## [27] "StandardHours"            "StockOptionLevel"        
## [29] "TotalWorkingYears"        "TrainingTimesLastYear"   
## [31] "WorkLifeBalance"          "YearsAtCompany"          
## [33] "YearsInCurrentRole"       "YearsSinceLastPromotion" 
## [35] "YearsWithCurrManager"

colSums(is.na(attrition))

##                      Age                Attrition           BusinessTravel 
##                        0                        0                        0 
##                DailyRate               Department         DistanceFromHome 
##                        0                        0                        0 
##                Education           EducationField            EmployeeCount 
##                        0                        0                        0 
##           EmployeeNumber  EnvironmentSatisfaction                   Gender 
##                        0                        0                        0 
##               HourlyRate           JobInvolvement                 JobLevel 
##                        0                        0                        0 
##                  JobRole          JobSatisfaction            MaritalStatus 
##                        0                        0                        0 
##            MonthlyIncome              MonthlyRate       NumCompaniesWorked 
##                        0                        0                        0 
##                   Over18                 OverTime        PercentSalaryHike 
##                        0                        0                        0 
##        PerformanceRating RelationshipSatisfaction            StandardHours 
##                        0                        0                        0 
##         StockOptionLevel        TotalWorkingYears    TrainingTimesLastYear 
##                        0                        0                        0 
##          WorkLifeBalance           YearsAtCompany       YearsInCurrentRole 
##                        0                        0                        0 
##  YearsSinceLastPromotion     YearsWithCurrManager 
##                        0                        0

No columns with missing values

# Find columns with only one distinct value
anomaly_columns <- sapply(attrition, function(x) length(unique(x))) == 1
anomaly_columns

##                      Age                Attrition           BusinessTravel 
##                    FALSE                    FALSE                    FALSE 
##                DailyRate               Department         DistanceFromHome 
##                    FALSE                    FALSE                    FALSE 
##                Education           EducationField            EmployeeCount 
##                    FALSE                    FALSE                     TRUE 
##           EmployeeNumber  EnvironmentSatisfaction                   Gender 
##                    FALSE                    FALSE                    FALSE 
##               HourlyRate           JobInvolvement                 JobLevel 
##                    FALSE                    FALSE                    FALSE 
##                  JobRole          JobSatisfaction            MaritalStatus 
##                    FALSE                    FALSE                    FALSE 
##            MonthlyIncome              MonthlyRate       NumCompaniesWorked 
##                    FALSE                    FALSE                    FALSE 
##                   Over18                 OverTime        PercentSalaryHike 
##                     TRUE                    FALSE                    FALSE 
##        PerformanceRating RelationshipSatisfaction            StandardHours 
##                    FALSE                    FALSE                     TRUE 
##         StockOptionLevel        TotalWorkingYears    TrainingTimesLastYear 
##                    FALSE                    FALSE                    FALSE 
##          WorkLifeBalance           YearsAtCompany       YearsInCurrentRole 
##                    FALSE                    FALSE                    FALSE 
##  YearsSinceLastPromotion     YearsWithCurrManager 
##                    FALSE                    FALSE

Over18 column

All the values has Y response thus we can drop the column

# Remove the "Over18" column
attrition <- subset(attrition, select = -Over18)

# Convert numerical features to factors/categorical variables
attrition$Education <- factor(attrition$Education)
attrition$EnvironmentSatisfaction <- factor(attrition$EnvironmentSatisfaction)
attrition$JobInvolvement <- factor(attrition$JobInvolvement)
attrition$JobLevel <- factor(attrition$JobLevel)
attrition$JobSatisfaction <- factor(attrition$JobSatisfaction)
attrition$RelationshipSatisfaction <- factor(attrition$RelationshipSatisfaction)
attrition$StockOptionLevel <- factor(attrition$StockOptionLevel)
attrition$WorkLifeBalance <- factor(attrition$WorkLifeBalance)

Analyze and visualize numerical features

attrition %>%
  select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate,
         PercentSalaryHike, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole,
         YearsSinceLastPromotion, YearsWithCurrManager) %>%
  gather(key = "Variable", value = "Value") %>%
  ggplot(aes(x = Value)) +
  geom_histogram(fill = "skyblue", color = "black", bins = 20) +
  facet_wrap(~ Variable, scales = "free") +
  theme_gray()

Analyze and visualize categorical features

# BusinessTravel - Bar Graph
attrition %>%
  count(BusinessTravel) %>%
  ggplot(aes(x = BusinessTravel, y = n, fill = BusinessTravel)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Business Travel", y = "Count", fill = "Business Travel") +
  theme_minimal()

# Department - Bar Graph
attrition %>%
  count(Department) %>%
  ggplot(aes(x = Department, y = n, fill = Department)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Department", y = "Count", fill = "Department") +
  theme_minimal()

# EducationField - Bar Graph
attrition %>%
  count(EducationField) %>%
  ggplot(aes(x = EducationField, y = n, fill = EducationField)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Education Field", y = "Count", fill = "Education Field") +
  theme_minimal()

# Gender - Pie Chart
attrition %>%
  count(Gender) %>%
  ggplot(aes(x = "", y = n, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(x = "", y = "Count", fill = "Gender") +
  theme_minimal()

# JobRole - Bar Graph
attrition %>%
  count(JobRole) %>%
  ggplot(aes(x = JobRole, y = n, fill = JobRole)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Job Role", y = "Count", fill = "Job Role") +
  theme_minimal()

# MaritalStatus - Bar Graph
attrition %>%
  count(MaritalStatus) %>%
  ggplot(aes(x = MaritalStatus, y = n, fill = MaritalStatus)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Marital Status", y = "Count", fill = "Marital Status") +
  theme_minimal()

# OverTime - Pie Chart
attrition %>%
  count(OverTime) %>%
  ggplot(aes(x = "", y = n, fill = OverTime)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(x = "", y = "Count", fill = "OverTime") +
  theme_minimal()

attrition %>%
  count(BusinessTravel) %>%
  ggplot(aes(x = BusinessTravel, y = n, fill = BusinessTravel)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(x = "Business Travel", y = "Count", fill = "Business Travel") +
  theme_minimal()

# Select numerical features
numerical_features <- c("Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
                        "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
                        "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
                        "YearsSinceLastPromotion", "YearsWithCurrManager")

# Compute the correlation matrix
cor_matrix <- cor(attrition[numerical_features])

# Plot the correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black")

# Identify variables with the highest correlation
highest_correlation <- which(cor_matrix > 0.6 & cor_matrix < 1, arr.ind = TRUE)
highest_correlation_variables <- rownames(cor_matrix)[highest_correlation[, 1]]

# Select numerical features
numerical_features <- c("Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome",
                        "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
                        "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
                        "YearsSinceLastPromotion", "YearsWithCurrManager")

# Compute the correlation matrix
cor_matrix <- cor(attrition[numerical_features])

# Round correlation matrix to two decimal places
cor_matrix <- round(cor_matrix, 2)

# Print the correlation matrix

print(as.data.frame(cor_matrix))

##                          Age DailyRate DistanceFromHome HourlyRate
## Age                     1.00      0.01             0.00       0.02
## DailyRate               0.01      1.00             0.00       0.02
## DistanceFromHome        0.00      0.00             1.00       0.03
## HourlyRate              0.02      0.02             0.03       1.00
## MonthlyIncome           0.50      0.01            -0.02      -0.02
## MonthlyRate             0.03     -0.03             0.03      -0.02
## NumCompaniesWorked      0.30      0.04            -0.03       0.02
## PercentSalaryHike       0.00      0.02             0.04      -0.01
## TotalWorkingYears       0.68      0.01             0.00       0.00
## YearsAtCompany          0.31     -0.03             0.01      -0.02
## YearsInCurrentRole      0.21      0.01             0.02      -0.02
## YearsSinceLastPromotion 0.22     -0.03             0.01      -0.03
## YearsWithCurrManager    0.20     -0.03             0.01      -0.02
##                         MonthlyIncome MonthlyRate NumCompaniesWorked
## Age                              0.50        0.03               0.30
## DailyRate                        0.01       -0.03               0.04
## DistanceFromHome                -0.02        0.03              -0.03
## HourlyRate                      -0.02       -0.02               0.02
## MonthlyIncome                    1.00        0.03               0.15
## MonthlyRate                      0.03        1.00               0.02
## NumCompaniesWorked               0.15        0.02               1.00
## PercentSalaryHike               -0.03       -0.01              -0.01
## TotalWorkingYears                0.77        0.03               0.24
## YearsAtCompany                   0.51       -0.02              -0.12
## YearsInCurrentRole               0.36       -0.01              -0.09
## YearsSinceLastPromotion          0.34        0.00              -0.04
## YearsWithCurrManager             0.34       -0.04              -0.11
##                         PercentSalaryHike TotalWorkingYears YearsAtCompany
## Age                                  0.00              0.68           0.31
## DailyRate                            0.02              0.01          -0.03
## DistanceFromHome                     0.04              0.00           0.01
## HourlyRate                          -0.01              0.00          -0.02
## MonthlyIncome                       -0.03              0.77           0.51
## MonthlyRate                         -0.01              0.03          -0.02
## NumCompaniesWorked                  -0.01              0.24          -0.12
## PercentSalaryHike                    1.00             -0.02          -0.04
## TotalWorkingYears                   -0.02              1.00           0.63
## YearsAtCompany                      -0.04              0.63           1.00
## YearsInCurrentRole                   0.00              0.46           0.76
## YearsSinceLastPromotion             -0.02              0.40           0.62
## YearsWithCurrManager                -0.01              0.46           0.77
##                         YearsInCurrentRole YearsSinceLastPromotion
## Age                                   0.21                    0.22
## DailyRate                             0.01                   -0.03
## DistanceFromHome                      0.02                    0.01
## HourlyRate                           -0.02                   -0.03
## MonthlyIncome                         0.36                    0.34
## MonthlyRate                          -0.01                    0.00
## NumCompaniesWorked                   -0.09                   -0.04
## PercentSalaryHike                     0.00                   -0.02
## TotalWorkingYears                     0.46                    0.40
## YearsAtCompany                        0.76                    0.62
## YearsInCurrentRole                    1.00                    0.55
## YearsSinceLastPromotion               0.55                    1.00
## YearsWithCurrManager                  0.71                    0.51
##                         YearsWithCurrManager
## Age                                     0.20
## DailyRate                              -0.03
## DistanceFromHome                        0.01
## HourlyRate                             -0.02
## MonthlyIncome                           0.34
## MonthlyRate                            -0.04
## NumCompaniesWorked                     -0.11
## PercentSalaryHike                      -0.01
## TotalWorkingYears                       0.46
## YearsAtCompany                          0.77
## YearsInCurrentRole                      0.71
## YearsSinceLastPromotion                 0.51
## YearsWithCurrManager                    1.00

attrition %>%
  ggplot(aes(x = Attrition)) +
  geom_bar(aes(fill = Attrition)) +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  labs(x = "Attrition", y = "Count") +
  ggtitle("Distribution of Attrition")

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(tidyverse)

attrition %>%
  group_by(JobRole) %>%
  summarise(AttritionRate = mean(Attrition == "Yes")) %>%
  arrange(desc(AttritionRate)) %>%
  head(5)

## # A tibble: 5 × 2
##   JobRole               AttritionRate
##   <chr>                         <dbl>
## 1 Sales Representative          0.398
## 2 Laboratory Technician         0.239
## 3 Human Resources               0.231
## 4 Sales Executive               0.175
## 5 Research Scientist            0.161

attrition %>%
  group_by(Gender) %>%
  summarise(MeanMonthlyIncome = mean(MonthlyIncome, na.rm = TRUE))

## # A tibble: 2 × 2
##   Gender MeanMonthlyIncome
##   <chr>              <dbl>
## 1 Female             6687.
## 2 Male               6381.

attrition %>%
  ggplot(aes(x = Gender, y = MonthlyIncome)) +
  geom_boxplot() +
  labs(x = "Gender", y = "Monthly Income") +
  ggtitle("Gender Disparities in Monthly Income")

attrition_d<- attrition %>%
  mutate(
    Attrition = recode(Attrition, "Yes" = 1, "No" = 0),
    BusinessTravel = recode(BusinessTravel, "Travel_Rarely" = 1, "Travel_Frequently" = 2, "Non-Travel" = 3),
    Department = recode(Department, "Sales" = 1, "Research & Development" = 2, "Human Resources" = 3),
    EducationField = recode(EducationField, "Life Sciences" = 1, "Medical" = 2, "Marketing" = 3, "Technical Degree" = 4,"Human Resources" =5, "Other" = 6),
    Gender = recode(Gender, "Male" = 1, "Female" = 2),
    JobRole = recode(JobRole, "Sales Executive" = 1, "Research Scientist" = 2, "Laboratory Technician" = 3, "Manufacturing Director" = 4, "Healthcare Representative" = 5, "Manager" = 6, "Sales Representative" = 7, "Research Director" = 8, "Human Resources" = 9),
    MaritalStatus = recode(MaritalStatus, "Single" = 1, "Married" = 2, "Divorced" = 3),
    OverTime = recode(OverTime, "Yes" = 1, "No" = 2)
  )

model <- glm(Attrition ~ ., data = attrition_d, family = binomial)

Model Performance

Confusion Matrix: A confusion matrix shows the counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN) of the model’s predictions. It provides an overview of the model’s performance in terms of correctly classified and misclassified instances.

# Predicting the response variable using the logistic regression model
attrition$predicted <- predict(model, type = "response")

# Creating a binary classification based on a threshold (e.g., 0.5)
attrition$predicted_class <- ifelse(attrition$predicted >= 0.5, 1, 0)

# Creating a confusion matrix
confusion_matrix <- table(attrition$Attrition, attrition$predicted_class)
confusion_matrix

##      
##          0    1
##   No  1200   33
##   Yes  126  111

Accuracy measures the proportion of correctly classified instances out of the total number of instances.

# Calculating accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy

## [1] 0.8918367

Precision and Recall: Precision measures the proportion of true positives out of the total predicted positives, while recall (also known as sensitivity or true positive rate) measures the proportion of true positives out of the actual positives.

# Calculating precision and recall
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
precision

## [1] 0.7708333

recall

## [1] 0.4683544

F1 Score: The F1 score is the harmonic mean of precision and recall and provides a single metric that balances both metrics.

# Calculating F1 score
f1_score <- 2 * (precision * recall) / (precision + recall)
f1_score

## [1] 0.5826772

Economic Policy capstone project

Wambui Martin

2023-07-17

Analyze and visualize numerical features

Analyze and visualize categorical features