# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
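# Note: the masking messages above are expected; dplyr and plotly
# intentionally override filter(), lag(), and layout(). If a quieter
# rendered report is preferred, one option (a sketch) is to wrap the calls:
# suppressPackageStartupMessages({ library(dplyr); library(ggplot2); library(plotly) })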
# Load the dataset
data <- read.csv("SystemAdministrators.csv")
# Inspect the data
head(data)
## Experience Training Completed.task
## 1 10.9 4 Yes
## 2 9.9 4 Yes
## 3 10.4 6 Yes
## 4 13.7 6 Yes
## 5 9.4 8 Yes
## 6 12.4 4 Yes
summary(data)
## Experience Training Completed.task
## Min. : 2.70 Min. :4.000 Length:75
## 1st Qu.: 5.20 1st Qu.:4.000 Class :character
## Median : 6.30 Median :4.000 Mode :character
## Mean : 6.80 Mean :4.613
## 3rd Qu.: 7.85 3rd Qu.:4.000
## Max. :13.70 Max. :8.000
str(data)
## 'data.frame': 75 obs. of 3 variables:
## $ Experience : num 10.9 9.9 10.4 13.7 9.4 12.4 7.9 8.9 10.2 11.4 ...
## $ Training : int 4 4 6 6 8 4 6 4 6 4 ...
## $ Completed.task: chr "Yes" "Yes" "Yes" "Yes" ...
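# Note: since R 4.0, read.csv() reads strings as character by default,
# which is why Completed.task arrives as chr and needs converting below.
# Before modeling it is also worth checking class balance (the confusion
# matrix later implies 60 "No" vs. 15 "Yes") and missingness; a sketch
# (output omitted):
table(data$Completed.task)
colSums(is.na(data))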
# Convert Completed.task to a factor so glm() treats it as a binary outcome
data$Completed.task <- as.factor(data$Completed.task)
head(data)
## Experience Training Completed.task
## 1 10.9 4 Yes
## 2 9.9 4 Yes
## 3 10.4 6 Yes
## 4 13.7 6 Yes
## 5 9.4 8 Yes
## 6 12.4 4 Yes
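# With family = binomial, glm() models the probability of the factor's
# second level. Levels default to alphabetical order ("No" < "Yes"), so
# the model below predicts P(Completed.task = "Yes"). A quick check
# (output omitted):
levels(data$Completed.task)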
# Build the logistic regression model
logit_model <- glm(Completed.task ~ Experience + Training, data = data, family = binomial)
# Summarize the model
summary(logit_model)
##
## Call:
## glm(formula = Completed.task ~ Experience + Training, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.9813 2.8919 -3.797 0.000146 ***
## Experience 1.1269 0.2909 3.874 0.000107 ***
## Training 0.1805 0.3386 0.533 0.593970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 75.060 on 74 degrees of freedom
## Residual deviance: 35.713 on 72 degrees of freedom
## AIC: 41.713
##
## Number of Fisher Scoring iterations: 6
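# Reading the fit: the linear predictor is the log-odds of completion,
# logit(p) = -10.9813 + 1.1269 * Experience + 0.1805 * Training.
# A sketch for a hypothetical administrator (8 months / 4 training credits
# are illustrative values, not from the data):
new_admin <- data.frame(Experience = 8, Training = 4)
# By hand: -10.9813 + 1.1269 * 8 + 0.1805 * 4 = -1.244; plogis(-1.244) is about 0.22
predict(logit_model, newdata = new_admin, type = "response")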
# Calculate odds ratios and their confidence intervals
exp(coef(logit_model)) # Odds ratios
## (Intercept) Experience Training
## 1.701686e-05 3.086170e+00 1.197827e+00
exp(confint(logit_model)) # Confidence intervals
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.689214e-08 0.002038548
## Experience 1.918896e+00 6.192848181
## Training 6.002476e-01 2.346891104
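# Interpretation: each additional month of experience multiplies the odds
# of completing the task by about 3.09 (95% CI 1.92 to 6.19); the Training
# interval spans 1, matching its non-significant p-value. A sketch that
# collects both into one table (output omitted):
round(cbind(OR = exp(coef(logit_model)), exp(confint(logit_model))), 3)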
# Predict in-sample probabilities (type = "response" returns them on the 0-1 scale)
data$Predicted_Prob <- predict(logit_model, type = "response")
# Create a confusion matrix
threshold <- 0.5
data$Predicted_Class <- ifelse(data$Predicted_Prob > threshold, 1, 0)  # 1 = "Yes", 0 = "No"
confusion_matrix <- table(Actual = data$Completed.task, Predicted = data$Predicted_Class)
print(confusion_matrix)
## Predicted
## Actual 0 1
## No 58 2
## Yes 5 10
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 90.67 %"
# Visualize predictions; note that geom_smooth() refits a binomial GLM to
# the plotted values, which triggers the warnings shown below
g <- ggplot(data, aes(x = Experience, y = Predicted_Prob, color = Completed.task)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Predicted Probability of Task Completion",
       x = "Months of Experience",
       y = "Predicted Probability") +
  theme_minimal()
g
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

ggplotly(g)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
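# The "non-integer #successes" warnings arise because geom_smooth() refits
# a binomial GLM to the plotted predicted probabilities, which are not 0/1
# outcomes; the curve is still drawn correctly. A warning-free sketch fits
# the smooth to the observed outcome instead:
g2 <- ggplot(data, aes(x = Experience, y = as.numeric(Completed.task == "Yes"))) +
  geom_point(aes(color = Completed.task)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Observed Outcomes with Fitted Logistic Curve",
       x = "Months of Experience",
       y = "P(Completed task)") +
  theme_minimal()
g2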