# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
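# Note: the masking messages above are expected; dplyr and plotly
# intentionally override filter(), lag(), and layout(). If a quieter
# rendered report is preferred, one option (a sketch) is to wrap the calls:
# suppressPackageStartupMessages({ library(dplyr); library(ggplot2); library(plotly) })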
# Load the dataset
data <- read.csv("SystemAdministrators.csv")
# Inspect the data
head(data)
## Experience Training Completed.task
## 1 10.9 4 Yes
## 2 9.9 4 Yes
## 3 10.4 6 Yes
## 4 13.7 6 Yes
## 5 9.4 8 Yes
## 6 12.4 4 Yes
summary(data)
## Experience Training Completed.task
## Min. : 2.70 Min. :4.000 Length:75
## 1st Qu.: 5.20 1st Qu.:4.000 Class :character
## Median : 6.30 Median :4.000 Mode :character
## Mean : 6.80 Mean :4.613
## 3rd Qu.: 7.85 3rd Qu.:4.000
## Max. :13.70 Max. :8.000
str(data)
## 'data.frame': 75 obs. of 3 variables:
## $ Experience : num 10.9 9.9 10.4 13.7 9.4 12.4 7.9 8.9 10.2 11.4 ...
## $ Training : int 4 4 6 6 8 4 6 4 6 4 ...
## $ Completed.task: chr "Yes" "Yes" "Yes" "Yes" ...
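# Note: since R 4.0, read.csv() reads strings as character by default,
# which is why Completed.task arrives as chr and needs converting below.
# Before modeling it is also worth checking class balance (the confusion
# matrix later implies 60 "No" vs. 15 "Yes") and missingness; a sketch
# (output omitted):
table(data$Completed.task)
colSums(is.na(data))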
# Convert Completed.task to a factor so glm() treats it as a binary outcome
data$Completed.task <- as.factor(data$Completed.task)
head(data)
## Experience Training Completed.task
## 1 10.9 4 Yes
## 2 9.9 4 Yes
## 3 10.4 6 Yes
## 4 13.7 6 Yes
## 5 9.4 8 Yes
## 6 12.4 4 Yes
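# With family = binomial, glm() models the probability of the factor's
# second level. Levels default to alphabetical order ("No" < "Yes"), so
# the model below predicts P(Completed.task = "Yes"). A quick check
# (output omitted):
levels(data$Completed.task)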
# Build the logistic regression model
logit_model <- glm(Completed.task ~ Experience + Training, data = data, family = binomial)
# Summarize the model
summary(logit_model)
##
## Call:
## glm(formula = Completed.task ~ Experience + Training, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.9813 2.8919 -3.797 0.000146 ***
## Experience 1.1269 0.2909 3.874 0.000107 ***
## Training 0.1805 0.3386 0.533 0.593970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 75.060 on 74 degrees of freedom
## Residual deviance: 35.713 on 72 degrees of freedom
## AIC: 41.713
##
## Number of Fisher Scoring iterations: 6
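# Reading the fit: the linear predictor is the log-odds of completion,
# logit(p) = -10.9813 + 1.1269 * Experience + 0.1805 * Training.
# A sketch for a hypothetical administrator (8 months / 4 training credits
# are illustrative values, not from the data):
new_admin <- data.frame(Experience = 8, Training = 4)
# By hand: -10.9813 + 1.1269 * 8 + 0.1805 * 4 = -1.244; plogis(-1.244) is about 0.22
predict(logit_model, newdata = new_admin, type = "response")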
# Calculate odds ratios and their confidence intervals
exp(coef(logit_model)) # Odds ratios
## (Intercept) Experience Training
## 1.701686e-05 3.086170e+00 1.197827e+00
exp(confint(logit_model)) # Confidence intervals
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.689214e-08 0.002038548
## Experience 1.918896e+00 6.192848181
## Training 6.002476e-01 2.346891104
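# Interpretation: each additional month of experience multiplies the odds
# of completing the task by about 3.09 (95% CI 1.92 to 6.19); the Training
# interval spans 1, matching its non-significant p-value. A sketch that
# collects both into one table (output omitted):
round(cbind(OR = exp(coef(logit_model)), exp(confint(logit_model))), 3)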
# Predict in-sample probabilities (type = "response" returns them on the 0-1 scale)
data$Predicted_Prob <- predict(logit_model, type = "response")
# Create a confusion matrix
threshold <- 0.5
data$Predicted_Class <- ifelse(data$Predicted_Prob > threshold, 1, 0)  # 1 = "Yes", 0 = "No"
confusion_matrix <- table(Actual = data$Completed.task, Predicted = data$Predicted_Class)
print(confusion_matrix)
## Predicted
## Actual 0 1
## No 58 2
## Yes 5 10
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 90.67 %"
# Visualize predictions; note that geom_smooth() refits a binomial GLM to
# the plotted values, which triggers the warnings shown below
g <- ggplot(data, aes(x = Experience, y = Predicted_Prob, color = Completed.task)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Predicted Probability of Task Completion",
       x = "Months of Experience",
       y = "Predicted Probability") +
  theme_minimal()
g
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

ggplotly(g)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
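# The "non-integer #successes" warnings arise because geom_smooth() refits
# a binomial GLM to the plotted predicted probabilities, which are not 0/1
# outcomes; the curve is still drawn correctly. A warning-free sketch fits
# the smooth to the observed outcome instead:
g2 <- ggplot(data, aes(x = Experience, y = as.numeric(Completed.task == "Yes"))) +
  geom_point(aes(color = Completed.task)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Observed Outcomes with Fitted Logistic Curve",
       x = "Months of Experience",
       y = "P(Completed task)") +
  theme_minimal()
g2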