# 1. Prepare the data ----------------------------------------------------
admin_df <- read.csv("SystemAdministrators_v2.csv", header = TRUE)
head(admin_df)
##   X Experience Training Team Completed.task
## 1 1       10.9        4    B              1
## 2 2        9.9        4    B              1
## 3 3       10.4        6    B              1
## 4 4       13.7        6    B              1
## 5 5        9.4        8    A              1
## 6 6       12.4        4    A              1
nrow(admin_df)
## [1] 575
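# A quick structural check before plotting (a minimal sanity-check sketch):
# confirm the column types match the head() output above and look for
# missing values.
str(admin_df)
colSums(is.na(admin_df))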
# 2. Scatter plot ----------------------------------------------------------
library(ggplot2)
ggplot(admin_df, aes(x = Training, y = Experience,
                     color = as.factor(Completed.task))) +
  geom_point() +
  scale_color_manual(values = c("red", "green")) +
  labs(title = "Scatter Plot of Training vs. Experience",
       x = "Training",
       y = "Experience",
       color = "Completed Task") +
  theme_dark()

# Which predictor(s) appear(s) potentially useful for classifying task completion?
# Ans: Experience appears to be the useful predictor: completers cluster at
# higher Experience values, while Training shows little separation between
# the two classes. A quick numeric check is below.
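# Comparing the mean of each predictor across the two outcome classes
# (a sketch to back up the visual impression):
aggregate(cbind(Experience, Training) ~ Completed.task, data = admin_df,
          FUN = mean)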
# 3. Logistic regression model --------------------------------------------
# a. Settings: 70-30 split, seed = 666
set.seed(666)
train_index <- sample(1:nrow(admin_df), floor(0.7 * nrow(admin_df)))
valid_index <- setdiff(1:nrow(admin_df), train_index)
train_df <- admin_df[train_index, ]
valid_df <- admin_df[valid_index, ]
# b. Change the outcome variable data type to factor
train_df$Completed.task <- factor(train_df$Completed.task, levels = c(0, 1))
nrow(train_df)
## [1] 402
nrow(valid_df)
## [1] 173
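# Sanity check on the split (a sketch): the random split should leave the
# outcome classes roughly balanced in both sets.
prop.table(table(train_df$Completed.task))
prop.table(table(valid_df$Completed.task))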
library(caret)
## Loading required package: lattice
# c. Get the predictions and probabilities for the training set
# Train the model
logistic_reg <- train(Completed.task ~ Experience + Training + Team,
                      data = train_df, method = "glm", family = "binomial")
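# Note: method = "glm" wraps stats::glm(), which is why the summary below
# reports "Call: NULL". An equivalent direct fit, as a sketch:
glm_direct <- glm(Completed.task ~ Experience + Training + Team,
                  data = train_df, family = binomial)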
summary(logistic_reg)
##
## Call:
## NULL
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.5921     1.1667  -9.079   <2e-16 ***
## Experience    1.2301     0.1194  10.301   <2e-16 ***
## Training      0.1708     0.1358   1.258    0.208
## TeamB        -0.1116     0.3385  -0.330    0.742
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 556.80 on 401 degrees of freedom
## Residual deviance: 232.14 on 398 degrees of freedom
## AIC: 240.14
##
## Number of Fisher Scoring iterations: 6
# Predictions and probabilities
logistic_reg_pred_train <- predict(logistic_reg,
                                   newdata = train_df, type = "raw")
logistic_reg_pred_train_prob <- predict(logistic_reg,
                                        newdata = train_df, type = "prob")
# Confusion matrix
confusionMatrix(logistic_reg_pred_train, train_df$Completed.task, positive = "1")
## Confusion Matrix and Statistics
##
##           Reference
## Prediction   0   1
##          0 192  23
##          1  16 171
##
## Accuracy : 0.903
## 95% CI : (0.8698, 0.9301)
## No Information Rate : 0.5174
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8055
##
## Mcnemar's Test P-Value : 0.3367
##
## Sensitivity : 0.8814
## Specificity : 0.9231
## Pos Pred Value : 0.9144
## Neg Pred Value : 0.8930
## Prevalence : 0.4826
## Detection Rate : 0.4254
## Detection Prevalence : 0.4652
## Balanced Accuracy : 0.9023
##
## 'Positive' Class : 1
##
#CONFUSION MATRIX:
#caret prints predictions in rows and the reference (truth) in columns;
#with positive = "1", the cells read as follows:
#    0  1
# 0 TN FN
# 1 FP TP
#True Positives: 171
#True Negatives: 192
#False Positives: 16
#False Negatives: 23
#Accuracy = (TP+TN)/Total = (171+192)/402 = 90.3% of cases correctly classified
#Sensitivity = TP/(TP+FN) = 171/(171+23) = 88.1% of completions correctly identified
#Specificity = TN/(TN+FP) = 192/(192+16) = 92.3% of non-completions correctly identified
#Kappa (agreement between prediction and truth beyond chance): 0.8055 (> 0.8 indicates near-perfect agreement)
#The coefficients indicate that Experience is the dominant predictor: its estimate (1.23) is roughly seven times the Training estimate (0.17) on the log-odds scale, and it is the only statistically significant predictor. Team B's estimate is small and not significant (p = 0.742).
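# Exponentiating the coefficients moves them to the odds scale (a sketch;
# logistic_reg$finalModel is the underlying glm object that caret stores):
exp(coef(logistic_reg$finalModel))
# e.g. exp(1.2301) ~ 3.42: each additional year of experience multiplies the
# odds of completing the task by about 3.4, holding the other predictors fixed.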
# e. Plot the ROC curve. Explain
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_curve <- roc(train_df$Completed.task,
                 logistic_reg_pred_train_prob[, "1"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, col = "red", main = "ROC Curve, Logistic Regression")

auc(roc_curve)
## Area under the curve: 0.9528
# This ROC curve rises almost straight to the top-left corner, far above the 45-degree chance line, indicating that the model achieves high sensitivity while maintaining high specificity.
#Note that pROC plots specificity on the x-axis, decreasing from 1 to 0, rather than the conventional 1 - specificity; the axes look "flipped" because of this plotting convention, not because "1" was chosen as the positive class.
#The area under the curve is 0.9528: the model ranks a randomly chosen completer above a randomly chosen non-completer about 95% of the time, which indicates excellent discrimination.
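# To draw the same curve with the conventional 1 - specificity x-axis, and to
# attach a confidence interval to the AUC (a sketch; legacy.axes and ci.auc
# are standard pROC features):
plot(roc_curve, col = "red", legacy.axes = TRUE,
     main = "ROC Curve, Logistic Regression")
ci.auc(roc_curve)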
# 4. Prediction -----------------------------------------------------------
# Predict outcome for the new system administrators
new_admin_df <- read.csv("new_sys_admin.csv", header = TRUE)
new_admin_df
##   Experience Training Team
## 1        9.5        3    A
## 2       10.3        5    B
## 3        6.8        6    B
new_admin_pred <- predict(logistic_reg, newdata = new_admin_df, type = "raw")
new_admin_pred_prob <- predict(logistic_reg, newdata = new_admin_df, type = "prob")
new_admin_df$PredictedClass <- new_admin_pred
new_admin_df$PredictedProb <- new_admin_pred_prob[, "1"]
ggplot(new_admin_df, aes(x = Training, y = Experience,
                         color = as.factor(PredictedClass),
                         size = PredictedProb)) +
  geom_point() +
  scale_color_manual(values = c("red", "green")) +
  labs(title = "Predicted Task Completion for New Administrators",
       x = "Training",
       y = "Experience",
       color = "Predicted Class",
       size = "P(Completed)") +
  theme_dark()

# The model classifies the two new administrators with the most experience (10.3 and 9.5 years) as completing the task, each with predicted probability above 80%, while the third (6.8 years) is classified as not completing it.
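# The 0.5 cutoff implied by type = "raw" is not fixed (a sketch): if false
# positives are costly, a stricter cutoff can be applied to the probabilities.
strict_class <- factor(ifelse(new_admin_pred_prob[, "1"] >= 0.8, 1, 0),
                       levels = c(0, 1))
strict_class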