# 1. Prepare the data  ----------------------------------------------------
admin_df <- read.csv("SystemAdministrators_v2.csv", header = TRUE)
head(admin_df)
##   X Experience Training Team Completed.task
## 1 1       10.9        4    B              1
## 2 2        9.9        4    B              1
## 3 3       10.4        6    B              1
## 4 4       13.7        6    B              1
## 5 5        9.4        8    A              1
## 6 6       12.4        4    A              1
nrow(admin_df)
## [1] 575
# 2. Scatter plot ----------------------------------------------------------
library(ggplot2)
ggplot(admin_df, aes(x = Training, y = Experience, 
                     color = as.factor(Completed.task))) +
  geom_point() +
  scale_color_manual(values = c("red", "green")) +
  labs(title = "Scatter Plot of Training vs. Experience",
       x = "Training",
       y = "Experience",
       color = "Completed Task") +
  theme_dark()

# Which predictor(s) appear(s) potentially useful for classifying task completion? 

# Ans: It appears that those with more experience have higher task completion.

# 3. Logistic regression model --------------------------------------------

# a.    Settings: 70-30 split, seed = 666
set.seed(666)

train_index <- sample(1:nrow(admin_df), 0.7 * nrow(admin_df))
valid_index <- setdiff(1:nrow(admin_df), train_index)

train_df <- admin_df[train_index, ]
valid_df <- admin_df[valid_index, ]
# b.    Change the outcome variable data type to factor
train_df$Completed.task <- factor(train_df$Completed.task, levels = c(0, 1))

nrow(train_df)
## [1] 402
nrow(valid_df)
## [1] 173
library(caret)
## Loading required package: lattice
# c.    Get the predictions and probabilities for the training and validation sets

# Convert the dependent variable to a 2-level factor
train_df$Completed.task <- factor(train_df$Completed.task, levels = c(0, 1))

# Train the model
logistic_reg <- train(Completed.task ~ Experience + Training + Team,
                      data = train_df, method = "glm", family = "binomial")

summary(logistic_reg)
## 
## Call:
## NULL
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.5921     1.1667  -9.079   <2e-16 ***
## Experience    1.2301     0.1194  10.301   <2e-16 ***
## Training      0.1708     0.1358   1.258    0.208    
## TeamB        -0.1116     0.3385  -0.330    0.742    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 556.80  on 401  degrees of freedom
## Residual deviance: 232.14  on 398  degrees of freedom
## AIC: 240.14
## 
## Number of Fisher Scoring iterations: 6
# Predictions and probabilities
logistic_reg_pred_train <- predict(logistic_reg, 
                                   newdata = train_df, type = "raw")

logistic_reg_pred_train_prob <- predict(logistic_reg, 
                                        newdata = train_df, type = "prob")

# Confusion matrix
confusionMatrix(logistic_reg_pred_train, train_df$Completed.task, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 192  23
##          1  16 171
##                                           
##                Accuracy : 0.903           
##                  95% CI : (0.8698, 0.9301)
##     No Information Rate : 0.5174          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8055          
##                                           
##  Mcnemar's Test P-Value : 0.3367          
##                                           
##             Sensitivity : 0.8814          
##             Specificity : 0.9231          
##          Pos Pred Value : 0.9144          
##          Neg Pred Value : 0.8930          
##              Prevalence : 0.4826          
##          Detection Rate : 0.4254          
##    Detection Prevalence : 0.4652          
##       Balanced Accuracy : 0.9023          
##                                           
##        'Positive' Class : 1               
## 
#CONFUSION MATRIX:

  #Because positive = '1', the confusion matrix is flipped and formatted as follows:
  #     0       1
  # 0   TN     FN
  # 1   FP     TP
  #True Positives: 171
  #True Negatives: 192
  #False Positives: 16
  #False Negatives: 23

  #Accuracy = (TP+TN)/Total = 192+171/403 = 90.3% cases correctly classified

  #Sensitivity = TP/(TP+FN) = 171 / (171+23) = 88.1% of completions classified

  #Specificity = TN/(TN+FP) = 192 / (192+16) = 92.3% of non-completions classified

  #Kappa (Agreement between prediction and truth beyond chance): .8055 (>.8 is great)

  #The coefficients of this analysis indicate that Experience dictates task success roughly 6 times as much as training does, and that Team B performs slightly worse at a statistically insignificant level. 

# e.    Plot the ROC curve. Explain

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc_curve <- roc(train_df$Completed.task,
                 logistic_reg_pred_train_prob[, "1"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, col = "red", main = "ROC Curve, Logistic Regression")

auc(roc_curve)
## Area under the curve: 0.9528
# This ROC curve makes a sharp turn away from (1,0) towards (1,1). The large amount of space underneath this curve from the 45 degree line indicates both high specificity and sensitivity.
#It is important to note that the axes on this ROC curve are flipped, due to us referencing "1" as the positive class in the training set instead of "0". 
#The area under the curve = .9528, which is a great indicator for the validity of this model.

# 4. Prediction -----------------------------------------------------------

# Predict outcome for the new system administrators

new_admin_df <- read.csv("new_sys_admin.csv", header = TRUE)
new_admin_df
##   Experience Training Team
## 1        9.5        3    A
## 2       10.3        5    B
## 3        6.8        6    B
new_admin_pred <- predict(logistic_reg, newdata = new_admin_df, type = "raw")
new_admin_pred_prob <- predict(logistic_reg, newdata = new_admin_df, type = "prob")

new_admin_df$PredictedClass <- new_admin_pred
new_admin_df$PredictedProb <- new_admin_pred_prob[, "1"]

library(ggplot2)
ggplot(new_admin_df, aes(x = Training, y = Experience, 
                     color = as.factor(PredictedClass), size=(PredictedProb))) +
  geom_point() +
  scale_color_manual(values = c("red", "green")) +
  labs(title = "Scatter Plot of Training vs. Experience",
       x = "Training",
       y = "Experience",
       color = "Completed Task") +
  theme_dark()

# The new predictions classify the two system admins with the most experience as completing the task with at least 80% confidence, while classifying the third admin as not completing the task.