This short example demonstrates how to perform a logistic
regression in R using the glm() function with a
binary outcome.
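We’ll predict whether a student is admitted (admit = 1) or
not (admit = 0) based on their GRE score
and GPA.
Note: the ten-row toy dataset used below is perfectly separable (every
admitted student has a higher GRE score and GPA than every rejected
student), so glm() warns that fitted probabilities are numerically 0 or 1,
and the coefficient estimates and standard errors are not meaningful.
The example is intended to illustrate the workflow only.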
# Toy dataset: ten students with admission outcome, GRE score and GPA.
# The columns are assembled inside local() so no helper vectors are left
# behind in the workspace.
# Note: every admitted student has gre >= 700 and every rejected student
# has gre <= 640, i.e. the two classes are perfectly separated.
data <- local({
  admit_flag <- c(1, 0, 1, 0, 1, 0, 0, 1, 0, 1)
  gre_score <- c(800, 640, 700, 580, 720, 600, 500, 750, 620, 710)
  gpa_value <- c(4.0, 3.5, 3.8, 3.2, 3.9, 3.3, 2.9, 3.7, 3.4, 3.9)
  data.frame(admit = admit_flag, gre = gre_score, gpa = gpa_value)
})
# Preview the first six rows (head()'s default)
head(data, n = 6L)
## admit gre gpa
## 1 1 800 4.0
## 2 0 640 3.5
## 3 1 700 3.8
## 4 0 580 3.2
## 5 1 720 3.9
## 6 0 600 3.3
# Fit a logistic regression of admission on GRE score and GPA.
# glm() fits generalized linear models; the binomial family (logit link by
# default) makes this a logistic regression.
# On this dataset the fit emits the "fitted probabilities numerically 0 or 1"
# warning shown below.
model <- glm(
  formula = admit ~ gre + gpa,
  family = binomial,
  data = data
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print coefficient table, deviances, AIC and iteration count for the fit
summary(object = model)
##
## Call:
## glm(formula = admit ~ gre + gpa, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.593e+02 1.430e+06 0 1
## gre 2.486e-01 3.030e+03 0 1
## gpa 1.077e+02 7.607e+05 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.3863e+01 on 9 degrees of freedom
## Residual deviance: 2.4490e-10 on 7 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 25
# Store the fitted admission probability for every row of the training data.
# type = "response" returns values on the probability scale (not log-odds).
data[["predicted_prob"]] <- predict(object = model, type = "response")
# Show the data frame with the new predicted_prob column
data
## admit gre gpa predicted_prob
## 1 1 800 4.0 1.000000e+00
## 2 0 640 3.5 6.605993e-11
## 3 1 700 3.8 1.000000e+00
## 4 0 580 3.2 2.220446e-16
## 5 1 720 3.9 1.000000e+00
## 6 0 600 3.3 2.220446e-16
## 7 0 500 2.9 2.220446e-16
## 8 1 750 3.7 1.000000e+00
## 9 0 620 3.4 2.220446e-16
## 10 1 710 3.9 1.000000e+00
# Turn probabilities into hard 0/1 labels: a probability of at least 0.5
# is classified as admitted (1), anything below as rejected (0).
data[["predicted_class"]] <- with(data, ifelse(predicted_prob >= 0.5, 1, 0))
# Show the data frame with both prediction columns
data
## admit gre gpa predicted_prob predicted_class
## 1 1 800 4.0 1.000000e+00 1
## 2 0 640 3.5 6.605993e-11 0
## 3 1 700 3.8 1.000000e+00 1
## 4 0 580 3.2 2.220446e-16 0
## 5 1 720 3.9 1.000000e+00 1
## 6 0 600 3.3 2.220446e-16 0
## 7 0 500 2.9 2.220446e-16 0
## 8 1 750 3.7 1.000000e+00 1
## 9 0 620 3.4 2.220446e-16 0
## 10 1 710 3.9 1.000000e+00 1
# Confusion matrix: rows are predicted classes, columns are actual classes.
# Both vectors are converted to factors with explicit levels 0 and 1 so the
# result is always a 2 x 2 table. Plain table() only tabulates values that
# actually occur, so a model that predicts a single class would otherwise
# yield a table with a missing row.
table(
  Predicted = factor(data$predicted_class, levels = c(0, 1)),
  Actual = factor(data$admit, levels = c(0, 1))
)
## Actual
## Predicted 0 1
## 0 5 0
## 1 0 5
# Accuracy = share of rows whose predicted class equals the observed outcome
accuracy <- with(data, mean(predicted_class == admit))
# Report the accuracy rounded to two decimal places
print(paste("Model Accuracy:", round(accuracy, digits = 2)))
## [1] "Model Accuracy: 1"
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Plot the model's predicted admission probability against GRE score and
# overlay a one-predictor logistic curve.
# Points: predicted probabilities from the fitted model (predicted_prob).
# Curve: geom_smooth() fits its own binomial GLM, so it is given the observed
# 0/1 outcome (admit) as the response. Fitting it to predicted_prob would
# regress already-fitted probabilities, which are not valid binomial data.
ggplot(data, aes(x = gre, y = predicted_prob)) +
  geom_point(color = "darkred", size = 3) +
  geom_smooth(
    aes(y = admit),
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  ) +
  labs(
    title = "Logistic Regression Curve",
    x = "GRE Score",
    y = "Predicted Probability of Admission"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred