# PDF Presentation Link
browseURL("https://drive.google.com/file/d/1Tnj-58S58GQHO5TJ7wKfBoArqnQEkKkX/view?usp=sharing")
# Presentation Link
browseURL("https://youtu.be/fbEB-BOKmEQ")
# Download Data
frito_data = read.csv("CaseStudy1-data.csv", header = TRUE)
# Make Attrition a factor: "Yes" = "Left", "No" = "Stayed"
frito_data$Attrition = factor(frito_data$Attrition, labels = c( "Stayed", "Left"))
# Make "Left" group the positive reference group
frito_data$Attrition = relevel(frito_data$Attrition, ref = "Left")
# Check the counts of the employees who stayed vs left
table(frito_data$Attrition)
##
## Left Stayed
## 140 730
frito_data %>%
group_by(Attrition) %>%
dplyr::summarize(count=n()) %>%
mutate(percent = (count / sum(count))*100) %>%
ggplot(aes(x=Attrition, y = percent, fill=Attrition)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(round(percent,2),"%")), nudge_y = 4, size = 5, family = "Oswald") +
theme_gdocs(base_family = "Oswald", base_size = 14) +
theme(legend.position = "none") +
ggtitle("Percentage of Employees Who Stayed vs. Left") +
xlab("Attrition Status") +
ylab("Percent") +
scale_fill_manual(values = c("Left" = "red", "Stayed" = "yellow"))
## Department
frito_data %>%
group_by(Department, Attrition) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(Department) %>%
mutate(percent = count / sum(count) * 100) %>%
ggplot(aes(x = Department, y = percent / 100, fill = Attrition)) +
geom_bar(position = "fill", stat = "identity") +
geom_text(aes(label = ifelse(Attrition == "Left", paste0(round(percent, 1), "%"), "")),
position = position_fill(vjust = 0.5), color = "black", family = "Oswald") +
theme_gdocs(base_family = "Oswald", base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none") +
xlab("Department") +
ggtitle("Proportion of Employees Who Stayed vs. Left by Department") +
ylab("Proportion") +
scale_fill_manual(values = c("Left" = "red", "Stayed" = "yellow"))
## Gender
frito_data %>%
group_by(Gender, Attrition) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(Gender) %>%
mutate(percent = count / sum(count) * 100) %>%
ggplot(aes(x = Gender, y = percent / 100, fill = Attrition)) +
geom_bar(position = "fill", stat = "identity") +
geom_text(aes(label = ifelse(Attrition == "Left", paste0(round(percent, 1), "%"), "")),
position = position_fill(vjust = 0.5), color = "black", family = "Oswald", size = 5) +
theme_gdocs(base_family = "Oswald", base_size = 14) +
theme(legend.position = "none") +
xlab("Gender") +
ggtitle("Proportion of Employees Who Stayed vs. Left by Gender") +
ylab("Proportion") +
scale_fill_manual(values = c("Left" = "red", "Stayed" = "yellow"))
# Department and Gender see little variation between the number of employees leaving across their categories.
## Job Role
frito_data %>%
group_by(JobRole, Attrition) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(JobRole) %>%
mutate(percent = count / sum(count) * 100) %>%
ggplot(aes(x = JobRole, y = percent / 100, fill = Attrition)) +
geom_bar(position = "fill", stat = "identity") +
geom_text(aes(label = ifelse(Attrition == "Left", paste0(round(percent, 1), "%"), "")),
position = position_fill(vjust = 0.5), color = "black", family = "Oswald", size = 5) +
theme_gdocs(base_family = "Oswald", base_size = 14) +
ggtitle("Proportion of Employees Who Stayed vs. Left by Job Role") +
theme(plot.title = element_text(size = 15), legend.position = "none") +
xlab("Job Role") +
ylab("Proportion") +
scale_fill_manual(values = c("Left" = "red", "Stayed" = "yellow")) +
coord_flip()
## Over Time
frito_data %>%
group_by(OverTime, Attrition) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(OverTime) %>%
mutate(percent = count / sum(count) * 100) %>%
ggplot(aes(x = OverTime, y = count, fill = Attrition)) +
geom_bar(position = "fill", stat = "identity") +
geom_text(aes(label = ifelse(Attrition == "Left", paste0(round(percent, 1), "%"), "")),
position = position_fill(vjust = 0.5), color = "black", family = "Oswald", size = 5) +
theme_gdocs(base_family = "Oswald", base_size = 14) +
theme(plot.title = element_text(size = 15), legend.position = "none") +
ggtitle("Proportion of Employees Who Stayed vs. Left by Over Time") +
ylab("Proportion") +
xlab("Over Time Status") +
scale_fill_manual(values = c("Left" = "red", "Stayed" = "yellow"))
# Job Role, Over Time, and Job Level see high variation between the number of employees leaving across their categories.
# Make sure relevant columns are numeric for testing
frito_data$DailyRate = as.numeric(frito_data$DailyRate)
frito_data$MonthlyRate = as.numeric(frito_data$MonthlyRate)
frito_data$HourlyRate = as.numeric(frito_data$HourlyRate)
frito_data$NumCompaniesWorked = as.numeric(frito_data$NumCompaniesWorked)
frito_data$Education = as.numeric(frito_data$Education)
frito_data$RelationshipSatisfaction = as.numeric(frito_data$RelationshipSatisfaction)
frito_data$PerformanceRating = as.numeric(frito_data$PerformanceRating)
frito_data$PercentSalaryHike = as.numeric(frito_data$PercentSalaryHike)
frito_data$TrainingTimesLastYear = as.numeric(frito_data$TrainingTimesLastYear)
frito_data$YearsSinceLastPromotion = as.numeric(frito_data$YearsSinceLastPromotion)
frito_data$Age = as.numeric(frito_data$Age)
frito_data$MonthlyIncome = as.numeric(frito_data$MonthlyIncome)
frito_data$DistanceFromHome = as.numeric(frito_data$DistanceFromHome)
frito_data$EnvironmentSatisfaction = as.numeric(frito_data$EnvironmentSatisfaction)
frito_data$JobInvolvement = as.numeric(frito_data$JobInvolvement)
frito_data$JobLevel = as.numeric(frito_data$JobLevel)
frito_data$JobSatisfaction = as.numeric(frito_data$JobSatisfaction)
frito_data$TotalWorkingYears = as.numeric(frito_data$TotalWorkingYears)
frito_data$WorkLifeBalance = as.numeric(frito_data$WorkLifeBalance)
frito_data$YearsAtCompany = as.numeric(frito_data$YearsAtCompany)
frito_data$YearsInCurrentRole = as.numeric(frito_data$YearsInCurrentRole)
frito_data$YearsWithCurrManager = as.numeric(frito_data$YearsWithCurrManager)
frito_data$StockOptionLevel = as.numeric(frito_data$StockOptionLevel)
# We perform two-sample t-tests for each numeric variable grouped by Attrition ("Left" vs. "Stayed")
# Small p-values (<0.05) suggest statistical significance and we can reject the null hypothesis that the means are equal.
# Example: Daily Rate (Not significant)
t.test(DailyRate ~ Attrition, data = frito_data)
##
## Welch Two Sample t-test
##
## data: DailyRate by Attrition
## t = -0.99931, df = 196.61, p-value = 0.3189
## alternative hypothesis: true difference in means between group Left and group Stayed is not equal to 0
## 95 percent confidence interval:
## -109.62427 35.88944
## sample estimates:
## mean in group Left mean in group Stayed
## 784.2929 821.1603
# Example: MonthlyIncome (Significant)
t.test(MonthlyIncome ~ Attrition, data = frito_data)
##
## Welch Two Sample t-test
##
## data: MonthlyIncome by Attrition
## t = -5.3249, df = 228.45, p-value = 2.412e-07
## alternative hypothesis: true difference in means between group Left and group Stayed is not equal to 0
## 95 percent confidence interval:
## -2654.047 -1220.382
## sample estimates:
## mean in group Left mean in group Stayed
## 4764.786 6702.000
# Critical Value for Monthly Income test
qt(.975,228.45)
## [1] 1.970402
# Example: Age (Significant)
t.test(Age ~ Attrition, data = frito_data)
##
## Welch Two Sample t-test
##
## data: Age by Attrition
## t = -4.1509, df = 184.91, p-value = 5.05e-05
## alternative hypothesis: true difference in means between group Left and group Stayed is not equal to 0
## 95 percent confidence interval:
## -5.350324 -1.902905
## sample estimates:
## mean in group Left mean in group Stayed
## 33.78571 37.41233
# Critical Value for Age test
qt(.975,184.91)
## [1] 1.972876
# Below are additional tests for analysis that were run.
# Not significant
# t.test(MonthlyRate ~ Attrition, data = frito_data)
# t.test(HourlyRate ~ Attrition, data = frito_data)
# t.test(NumCompaniesWorked ~ Attrition, data = frito_data)
# t.test(Education ~ Attrition, data = frito_data)
# t.test(RelationshipSatisfaction ~ Attrition, data = frito_data)
# t.test(PerformanceRating~ Attrition, data = frito_data)
# t.test(PercentSalaryHike ~ Attrition, data = frito_data)
# t.test(TrainingTimesLastYear~ Attrition, data = frito_data)
# t.test(YearsSinceLastPromotion~ Attrition, data = frito_data)
# Significant difference of means
# t.test(DistanceFromHome ~ Attrition, data = frito_data)
# t.test(EnvironmentSatisfaction ~ Attrition, data = frito_data)
# t.test(JobSatisfaction~ Attrition, data = frito_data)
# t.test(WorkLifeBalance ~ Attrition, data = frito_data)
# t.test(StockOptionLevel ~ Attrition, data = frito_data)
# t.test(JobInvolvement ~ Attrition, data = frito_data)
# t.test(YearsWithCurrManager ~ Attrition, data = frito_data)
# t.test(YearsAtCompany~ Attrition, data = frito_data)
# Strong significance for difference of means
# t.test(JobLevel~ Attrition, data = frito_data)
# t.test(TotalWorkingYears ~ Attrition, data = frito_data)
# t.test(YearsInCurrentRole ~ Attrition, data = frito_data)
# Age vs. Monthly Income by Over Time
ggplot(frito_data, aes(x = Age, y = MonthlyIncome, color = Attrition)) +
geom_point(alpha = 0.5) +
facet_wrap(~ OverTime) +
scale_color_manual(values = c("Stayed" = "yellow", "Left" = "red")) +
theme_gdocs(base_family = "Oswald", base_size = 12) +
labs(title = "Age vs Monthly Income by Over Time",
x = "Age",
y = "Monthly Income")
# Age vs. Monthly Income for Sales Representatives
frito_data %>%
filter(JobRole == "Sales Representative") %>%
ggplot(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c("Stayed" = "yellow", "Left" = "red")) +
theme_gdocs(base_family = "Oswald", base_size = 12) +
labs(title = "Age vs. Monthly Income for Sales Representatives", x= "Age",y = "Monthly Income")
# Age vs. Monthly Income by Job Role
ggplot(frito_data, aes(x = Age, y = MonthlyIncome, color = Attrition)) +
geom_point(alpha = 0.5) +
facet_wrap(~ JobRole) +
scale_color_manual(values = c("Stayed" = "yellow", "Left" = "red")) +
theme_gdocs(base_family = "Oswald", base_size = 12) +
labs(title = "Age vs. Monthly Income by Job Role", x = "Age",y = "Monthly Income")
# Years In Current Role vs. Monthly Income for Job Level 1
frito_data %>%
filter(JobLevel == "1") %>%
ggplot(aes(x = YearsInCurrentRole, y = MonthlyIncome, color = Attrition)) +
geom_jitter(alpha = 0.7, width = 0.3, height = 0) +
scale_color_manual(values = c("Stayed" = "yellow", "Left" = "red")) +
theme_gdocs(base_family = "Oswald", base_size = 12) +
theme(legend.position = "none") +
labs(title = "Years In Current Role vs. Monthly Income for Job Level 1", x= "Years In Current Role",y = "Monthly Income")
# Years In Current Role vs. Monthly Income for Job Levels
ggplot(frito_data, aes(x = YearsInCurrentRole, y = MonthlyIncome, color = Attrition)) +
geom_jitter(alpha = 0.6, width = 0.2, height = 0) +
facet_wrap(~ JobLevel) +
scale_color_manual(values = c("Stayed" = "yellow", "Left" = "red")) +
theme_gdocs(base_family = "Oswald", base_size = 10) +
theme(legend.position = "none") +
labs(title = "Years In Current Role vs Monthly Income by Job Level",x = "Years In Current Role", y = "Monthly Income")
# Reload the data
fritonb = read.csv("CaseStudy1-data.csv", header = TRUE)
# Make Attrition a factor: "Yes" = "Left", "No" = "Stayed"
fritonb$Attrition = factor(fritonb$Attrition, labels = c( "Stayed", "Left"))
fritonb$Attrition = relevel(fritonb$Attrition, ref = "Left")
# Convert more significant columns from previous tests to numeric variables for the model
fritonb$MonthlyIncome = as.numeric(fritonb$MonthlyIncome)
fritonb$Age = as.numeric(fritonb$Age)
fritonb$TotalWorkingYears = as.numeric(fritonb$TotalWorkingYears)
fritonb$YearsInCurrentRole = as.numeric(fritonb$YearsInCurrentRole)
fritonb$YearsWithCurrManager = as.numeric(fritonb$YearsWithCurrManager)
# Convert the more visually significant columns from previous tests to factors for the model
fritonb$JobRole = as.factor(fritonb$JobRole)
fritonb$OverTime = as.factor(fritonb$OverTime)
fritonb$JobLevel = as.factor(fritonb$JobLevel)
fritonb$JobInvolvement = as.factor(fritonb$JobInvolvement)
# Naive Bayes Testing Model
# Here we pick a subset of three features (i.e JobLevel, OverTime, YearsInCurrentRole) to see how well they predict Attrition. We chose threshold = 0.20 after initial experimentation to balance sensitivity and specificity as sensitivity is important here as we try to correctly predict who will leave the company.
frito_nb = fritonb %>% select(JobLevel, OverTime, YearsInCurrentRole, Attrition)
threshold = .20
# Fit Naive Bayes model using selected predictors
nb_model = naiveBayes(frito_nb[,1:3],frito_nb[,4])
probsNB = predict(nb_model,frito_nb[,1:3], type = "raw")
# We use a threshold to predict "Left" for probabilities greater than 20%
NewClass = ifelse(probsNB[,1] > threshold, "Left", "Stayed")
# Confusion matrix to find accuracy, sensitivity, and specificity
CM_nb = confusionMatrix(table(NewClass, frito_nb[,4]), mode = "everything")
# Different Models Tested for Comparison
# frito_nb = fritonb %>% select(JobRole, OverTime, Age, Attrition)
## Sensitivity: 0.60000, Specificity: 0.82329, Accuracy: 0.7874
# frito_nb = fritonb %>% select(JobLevel, YearsInCurrentRole, OverTime, Attrition)
## Sensitivity: 0.6786, Specificity: 0.6918, Accuracy: 0.6897
# We run the Naive Bayes model 100 times with different seeds, each time shuffling the data, to get average performance metrics. We then calculate potential "costs" of attrition or misclassifications using a range of 50% to 400% of an average yearly salary for replacements.
# Incentive Cost for people predicted to leave
CostPerIncentive = 200
# Average annual salary
x = mean(fritonb$MonthlyIncome) * 12
# Lower bound (50% of salary)
CostPerAttrition1 = 0.5 * x
# Upper bound (400% of salary)
CostPerAttrition2 = 4.0 * x
# Vectors to store results from loop
FN_Holder = numeric(100) # False Negatives (predicted stayed, actually left)
FP_Holder = numeric(100) # False Positives (predicted left, actually stayed)
TP_Holder = numeric(100) # True Positives (predicted left, actually left)
AccHolder = numeric(100) # Accuracy
SensHolder = numeric(100) # Sensitivity
SpecHolder = numeric(100) # Specificity
# Naive Bayes Model Loop with threshold of 0.12 based on previous testing
for (seed in 101:200) {
set.seed(seed)
frito_nb = fritonb[sample(nrow(fritonb)), ]
threshold = 0.12
nb_model = naiveBayes(frito_nb[, c("JobRole", "OverTime", "Age")], frito_nb$Attrition)
probsNB = predict(nb_model, frito_nb[, c("JobRole", "OverTime", "Age")], type = "raw")
NewClass = ifelse(probsNB[,1] > threshold, "Left", "Stayed")
# Confusion Matrix
CM_nb = confusionMatrix(table(NewClass, frito_nb$Attrition), mode = "everything")
AccHolder[seed-100] = CM_nb$overall["Accuracy"]
SensHolder[seed-100] = CM_nb$byClass["Sensitivity"]
SpecHolder[seed-100] = CM_nb$byClass["Specificity"]
FN_Holder[seed-100] = CM_nb$table[2]
FP_Holder[seed-100] = CM_nb$table[3]
TP_Holder[seed-100] = CM_nb$table[1]
}
# Overall average performance of model
mean_accuracy = mean(AccHolder)
mean_sensitivity = mean(SensHolder)
mean_specificity = mean(SpecHolder)
# Average values from the confusion matrix tables
Avg_FN = mean(FN_Holder)
Avg_FP = mean(FP_Holder)
Avg_TP = mean(TP_Holder)
# Estimated costs calculation
# Incentives go to predicted positives (False Positives + True Positives)
# Employees predicted to stay, but actually leave (False Negatives) will cost 40% to 500% of an average salary to find a replacement.
Cost_Base1 = CostPerAttrition1 * Avg_FN + CostPerIncentive * (Avg_FP + Avg_TP)
Cost_Base2 = CostPerAttrition2 * Avg_FN + CostPerIncentive * (Avg_FP + Avg_TP)
# Overall results across the 100 seeds
print(list(
"Mean Accuracy" = mean_accuracy,
"Mean Sensitivity" = mean_sensitivity,
"Mean Specificity" = mean_specificity,
"Estimated Cost (50% Attrition Cost)" = Cost_Base1,
"Estimated Cost (400% Attrition Cost)" = Cost_Base2))
## $`Mean Accuracy`
## [1] 0.645977
##
## $`Mean Sensitivity`
## [1] 0.7785714
##
## $`Mean Specificity`
## [1] 0.6205479
##
## $`Estimated Cost (50% Attrition Cost)`
## [1] 1265789
##
## $`Estimated Cost (400% Attrition Cost)`
## [1] 9585913
fritok = read.csv("CaseStudy1-data.csv", header = TRUE)
# Make Attrition a factor and make the "Left" group the reference group
fritok$Attrition = factor(fritok$Attrition, labels = c( "Stayed", "Left"))
fritok$Attrition = relevel(fritok$Attrition, ref = "Left")
# Convert more significant columns from previous tests to numeric variables for the model
fritok$OverTime = as.numeric(as.factor(fritok$OverTime))
fritok$Age = as.numeric(fritok$Age)
fritok$JobRole = as.numeric(as.factor(fritok$JobRole))
fritok$JobLevel = as.numeric(as.factor(fritok$JobLevel))
fritok$YearsInCurrentRole = as.numeric(fritok$YearsInCurrentRole)
# Standardize predictor variables for the model
fritok$Z_OverTime = scale(fritok$OverTime)
fritok$Z_Age = scale(fritok$Age)
fritok$Z_JobRole = scale(fritok$JobRole)
fritok$Z_YearsInCurrentRole = scale(fritok$YearsInCurrentRole)
fritok$Z_JobLevel = scale(fritok$JobLevel)
# kNN Testing Model
# Here we pick a subset of three standardized features (i.e Z_JobRole, Z_OverTime, Z_Age) to see how well they predict Attrition. We chose threshold = 0.15 after initial experimentation to balance sensitivity and specificity as sensitivity is important here as we try to correctly predict who will leave the company.
threshold = .15
frito_k = fritok %>% select(Z_JobRole, Z_OverTime, Z_Age, Attrition)
classifications = knn(frito_k[,1:3], frito_k[,1:3], cl = frito_k[,4], prob = TRUE, k = 11)
probs = ifelse(classifications == "Left", attributes(classifications)$prob, 1 - attributes(classifications)$prob)
NewClass = ifelse(probs > threshold, "Left", "Stayed")
CM_k = confusionMatrix(table(NewClass, frito_k[,4]), mode = "everything")
# Confusion matrix to find accuracy, sensitivity, and specificity
CM_k
## Confusion Matrix and Statistics
##
##
## NewClass Left Stayed
## Left 117 220
## Stayed 23 510
##
## Accuracy : 0.7207
## 95% CI : (0.6896, 0.7503)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3406
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8357
## Specificity : 0.6986
## Pos Pred Value : 0.3472
## Neg Pred Value : 0.9568
## Precision : 0.3472
## Recall : 0.8357
## F1 : 0.4906
## Prevalence : 0.1609
## Detection Rate : 0.1345
## Detection Prevalence : 0.3874
## Balanced Accuracy : 0.7672
##
## 'Positive' Class : Left
##
# frito_k = fritok %>% select(Z_JobLevel, Z_OverTime, Z_YearsInCurrentRole, Attrition)
## Sensitivity : 0.7857
## Specificity : 0.6288
## Accuracy : 0.654
## Threshold = .15
# frito_k = fritok %>% select(Z_JobRole, Z_OverTime, Z_Age, Attrition)
## Sensitivity : 0.8143
## Specificity : 0.6521
## Accuracy : 0.6782
## Threshold = .15
#Graph to find best k-value for the model
frito_k = fritok %>% select(Z_JobRole, Z_OverTime, Z_Age, Attrition)
# We will run 50 iterations over 90 values of k (1-90) and plot the average performance of them to see where our accuracy and sensitivity are highest to determine the best k-value range.
set.seed(1)
iterations = 50
numks = 90
# Vectors to store results from loop
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSens = matrix(nrow = iterations, ncol = numks)
masterSpec = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
for(i in 1:numks)
{
threshold = .15
classifications = knn(frito_k[,1:3], frito_k[,1:3], cl = frito_k[,4], prob = TRUE, k = i)
probs = ifelse(classifications == "Left", attributes(classifications)$prob, 1 - attributes(classifications)$prob)
NewClass = ifelse(probs > threshold, "Left", "Stayed")
CMk = confusionMatrix(table(NewClass, frito_k[,4]), mode = "everything")
masterAcc[j,i] = CMk$overall[1]
masterSens[j,i] = CMk$byClass["Sensitivity"]
masterSpec[j,i] = CMk$byClass["Specificity"]
}
}
# Plot of k vs. accuracy
MeanAcc = colMeans(masterAcc)
plot(seq(1,numks,1),MeanAcc, type = "l", xlab = "k-value", ylab = "Mean Accuracy")
# Plot of k vs. sensitivity
MeanSens = colMeans(masterSens)
plot(seq(1,numks,1),MeanSens, type = "l", xlab = "k-value", ylab = "Mean Sensitivity")
frito_k = fritok %>% select(Z_JobRole, Z_OverTime, Z_Age, Attrition)
# Incentive Cost for people predicted to leave
CostPerIncentive = 200
# Average annual salary
x = mean(fritonb$MonthlyIncome) * 12
# Lower bound (50% of salary)
CostPerAttrition1 = 0.5 * x
# Upper bound (400% of salary)
CostPerAttrition2 = 4.0 * x
set.seed(1)
iterations = 100
# Vectors to store results from loop
FNk_Holder = numeric(100) # False Negatives (predicted stayed, actually left)
FPk_Holder = numeric(100) # False Positives (predicted left, actually stayed)
TPk_Holder = numeric(100) # True Positives (predicted left, actually left)
AccHolderk = numeric(100) # Accuracy
SensHolderk = numeric(100) # Sensitivity
SpecHolderk = numeric(100) # Specificity
# We will use k=3 and a threshold of 0.15
for (j in 1:iterations) {
threshold = .15
classifications = knn(frito_k[,1:3], frito_k[,1:3], cl = frito_k[,4], prob = TRUE, k = 3)
probs = ifelse(classifications == "Left", attributes(classifications)$prob, 1 - attributes(classifications)$prob)
NewClass = ifelse(probs > threshold, "Left", "Stayed")
CM_k = confusionMatrix(table(NewClass, frito_k[,4]), mode = "everything")
AccHolderk[j] = CM_k$overall[1]
SensHolderk[j] = CM_k$byClass["Sensitivity"]
SpecHolderk[j] = CM_k$byClass["Specificity"]
FNk_Holder[j] = CM_k$table[2]
FPk_Holder[j] = CM_k$table[3]
TPk_Holder[j] = CM_k$table[1]
}
# Average Performance Metrics
mean_accuracyk = mean(AccHolderk)
mean_sensitivityk = mean(SensHolderk)
mean_specificityk = mean(SpecHolderk)
# Average Values for Confusion Matrix Tables
Avg_FNk = mean(FNk_Holder)
Avg_FPk = mean(FPk_Holder)
Avg_TPk = mean(TPk_Holder)
# Cost Calculations
Cost_Basek1 = CostPerAttrition1 * Avg_FNk + CostPerIncentive * (Avg_FPk + Avg_TPk)
Cost_Basek2 = CostPerAttrition2 * Avg_FNk + CostPerIncentive * (Avg_FPk + Avg_TPk)
# Overall Results
print(list(
"Mean Accuracy (k=3)" = mean_accuracyk,
"Mean Sensitivity (k=3)" = mean_sensitivityk,
"Mean Specificity (k=3)" = mean_specificityk,
"Estimated Cost (50% Attrition Cost)" = Cost_Basek1,
"Estimated Cost (400% Attrition Cost)" = Cost_Basek2))
## $`Mean Accuracy (k=3)`
## [1] 0.7275862
##
## $`Mean Sensitivity (k=3)`
## [1] 0.9428571
##
## $`Mean Specificity (k=3)`
## [1] 0.6863014
##
## $`Estimated Cost (50% Attrition Cost)`
## [1] 378932.7
##
## $`Estimated Cost (400% Attrition Cost)`
## [1] 2526062
frito = read.csv("CaseStudy1-data.csv", header = TRUE)
# Make Attrition a factor and make the "Left" group the reference group
frito$Attrition = factor(frito$Attrition, labels = c( "No", "Yes"))
frito$Attrition = relevel(frito$Attrition, ref = "Yes")
# Make variables numeric
frito$OverTime = as.numeric(as.factor(frito$OverTime))
frito$Age = as.numeric(frito$Age)
frito$JobRole = as.numeric(as.factor(frito$JobRole))
# Standardize predictor variables
frito$Z_OverTime = scale(frito$OverTime)
frito$Z_Age = scale(frito$Age)
frito$Z_JobRole = scale(frito$JobRole)
# Download Competition Set of 300 rows
CompSet = read.csv("CaseStudy1CompSet No Attrition.csv", header = TRUE)
# Make variables numeric
CompSet$OverTime = as.numeric(as.factor(CompSet$OverTime))
CompSet$Age = as.numeric(CompSet$Age)
CompSet$JobRole = as.numeric(as.factor(CompSet$JobRole))
# Standardize predictor variables
CompSet$Z_OverTime = scale(CompSet$OverTime)
CompSet$Z_Age = scale(CompSet$Age)
CompSet$Z_JobRole = scale(CompSet$JobRole)
frito_train = frito %>% select(Z_OverTime,Z_JobRole,Z_Age,Attrition)
Comp_test = CompSet %>% select(Z_OverTime,Z_JobRole,Z_Age,ID)
classifications = knn(frito_train[, 1:3], Comp_test[, 1:3], cl = frito_train$Attrition, prob = TRUE, k = 3)
probs = ifelse(classifications == "Yes", attributes(classifications)$prob, 1 - attributes(classifications)$prob)
NewClass = ifelse(probs > threshold, "Yes", "No")
new_att = data.frame(ID = CompSet$ID, Attrition = NewClass)
write.csv(new_att, "Case1PredictionsChloe Attrition.csv", row.names = FALSE)
head(new_att)
## ID Attrition
## 1 1171 No
## 2 1172 Yes
## 3 1173 No
## 4 1174 Yes
## 5 1175 Yes
## 6 1176 Yes