---
title: "GLM_MF"
format:
  html:
    theme:
      light: cosmo
      dark: darkly
    toc: true
    toc-depth: 3
    number-sections: true
    code-fold: true
    code-tools: true
    smooth-scroll: true
    anchor-sections: true
    fontsize: 1.05em
    css: styles.css
execute:
  echo: true
  warning: false
  message: false
  cache: false
editor: visual
---
```{r}
# ================================
# Binary Logistic Regression (Male vs Female)
# ================================
# Load libraries
library(ggplot2)
library(dplyr)
library(GGally)
# -------------------------------
# Load your data
# -------------------------------
# Reads SM_data.csv from the working directory. The file is expected to
# contain a column named "MF" with values "M" and "F"; every other column
# is later used as a predictor.
data <- read.csv("SM_data.csv")
str(data)
# -------------------------------
# Convert response to factor
# -------------------------------
# Recode MF as a 0/1 factor: "M" -> "1", any other non-missing value -> "0".
# Level "0" (female) is the baseline, so the model estimates P(male).
# Both levels are declared explicitly: as.factor() would drop a level that
# does not occur in the file, and glm() would then treat the single
# remaining level as "failure" regardless of which sex it stands for.
data$MF <- factor(as.integer(data$MF == "M"), levels = c(0L, 1L))
# Optional: the baseline is already Female ("0"). To model P(female)
# instead, switch the reference level:
# data$MF <- relevel(data$MF, ref = "1")
# -------------------------------
# Cross Plot (Pairwise)
# -------------------------------
# Pairwise plot matrix over every column of `data`, coloured by MF:
# scatter points above the diagonal, loess smooths (no confidence band)
# below it, and density curves on the diagonal.
pair_mapping <- aes(color = MF, alpha = 0.7)
upper_panels <- list(continuous = wrap("points", size = 1.2))
lower_panels <- list(
  continuous = wrap("smooth", method = "loess", se = FALSE)
)
diag_panels <- list(continuous = "densityDiag")
p_cross <- ggpairs(
  data,
  mapping = pair_mapping,
  upper = upper_panels,
  lower = lower_panels,
  diag = diag_panels
) + theme_minimal()
print(p_cross)
# -------------------------------
# Fit Logistic Regression Model
# -------------------------------
# Binomial GLM of MF on every other column present in `data` at this point
# (the `.` in the formula expands to all remaining columns).
model <- glm(
  formula = MF ~ .,
  family = binomial,
  data = data
)
print(summary(model))
# -------------------------------
# Predictions
# -------------------------------
# Fitted probability of the second MF level ("1") for every row of `data`.
# newdata = data keeps the result aligned with the rows of `data`: glm()
# drops rows with missing values while fitting, so predict(model) alone can
# return fewer values than nrow(data) and the column assignment below would
# fail. Rows with missing predictors get NA instead.
prob <- predict(model, newdata = data, type = "response")
data$Prob_Male <- prob
# Convert to class (threshold = 0.5)
# Declare both levels explicitly so the factor keeps "0" and "1" even when
# every row is predicted as the same class.
data$Predicted <- factor(ifelse(prob > 0.5, "1", "0"), levels = c("0", "1"))
# -------------------------------
# Accuracy
# -------------------------------
# Compare as character: `==` on two factors errors when their level sets
# differ. Rows without a prediction or without a label are excluded from
# the accuracy instead of turning it into NA.
is_correct <- as.character(data$Predicted) == as.character(data$MF)
accuracy <- mean(is_correct, na.rm = TRUE)
print(paste("Accuracy:", round(accuracy, 4)))
# Confusion matrix: rows are predicted classes, columns are actual classes.
print(table(Predicted = data$Predicted, Actual = data$MF))
# -------------------------------
# Visualization 1: Scatter (2 variables)
# -------------------------------
# Change variables depending on your dataset
# (defaults to the 2nd and 3rd columns of `data`).
x_var <- names(data)[2]
y_var <- names(data)[3]
# Map columns through the .data pronoun rather than data[, i]: the values
# are then taken from the data frame given to ggplot(), and the axes can be
# labelled with the real column names instead of "data[, 2]".
p1 <- ggplot(
  data,
  aes(x = .data[[x_var]], y = .data[[y_var]], color = MF)
) +
  geom_point(size = 2) +
  labs(title = "True Classes", x = x_var, y = y_var) +
  theme_minimal()
print(p1)
# -------------------------------
# Visualization 2: Predicted Classes
# -------------------------------
# Same two columns as the true-class scatter (2nd and 3rd columns of
# `data`), coloured by the model's predicted class.
x_var <- names(data)[2]
y_var <- names(data)[3]
# Map columns through the .data pronoun rather than data[, i]: the values
# are then taken from the data frame given to ggplot(), and the axes can be
# labelled with the real column names instead of "data[, 2]".
p2 <- ggplot(
  data,
  aes(x = .data[[x_var]], y = .data[[y_var]], color = Predicted)
) +
  geom_point(size = 2) +
  labs(
    title = "Predicted Classes (Logistic Regression)",
    x = x_var,
    y = y_var
  ) +
  theme_minimal()
print(p2)
# -------------------------------
# Visualization 3: Decision Boundary
# -------------------------------
# Use two important variables (modify if needed)
x_var <- names(data)[2]
y_var <- names(data)[3]
# 200 x 200 grid spanning the observed range of both variables.
# na.rm = TRUE so a missing value does not turn the range into NA.
x_range <- range(data[[x_var]], na.rm = TRUE)
y_range <- range(data[[y_var]], na.rm = TRUE)
grid <- expand.grid(
  x = seq(x_range[1], x_range[2], length.out = 200),
  y = seq(y_range[1], y_range[2], length.out = 200)
)
colnames(grid) <- c(x_var, y_var)
# Hold every other predictor at a typical value: the mean for numeric
# columns, the most frequent observed value otherwise (mean() of a factor
# or character column is NA, which would make predict() fail).
other_vars <- setdiff(
  names(data),
  c("MF", x_var, y_var, "Predicted", "Prob_Male")
)
for (v in other_vars) {
  column <- data[[v]]
  if (is.numeric(column)) {
    grid[[v]] <- mean(column, na.rm = TRUE)
  } else {
    modal <- names(which.max(table(column)))
    # Index the original column so its type (and factor levels) carry over.
    grid[[v]] <- column[match(modal, as.character(column))]
  }
}
# Predict on grid
grid$Prob <- predict(model, newdata = grid, type = "response")
grid$Class <- factor(ifelse(grid$Prob > 0.5, "M", "F"), levels = c("F", "M"))
# Label the observed points with the same F/M coding as the background
# tiles, so both legends use one set of labels (MF itself is coded 0/1).
observed <- data
observed$Sex <- factor(ifelse(data$MF == "1", "M", "F"), levels = c("F", "M"))
# Plot decision boundary
# aes_string() is deprecated; the .data pronoun is its replacement for
# column names held in strings.
p3 <- ggplot() +
  geom_tile(
    data = grid,
    aes(x = .data[[x_var]], y = .data[[y_var]], fill = Class),
    alpha = 0.3
  ) +
  geom_point(
    data = observed,
    aes(x = .data[[x_var]], y = .data[[y_var]], color = Sex),
    size = 1.5
  ) +
  labs(
    title = "Decision Boundary (Binary Logistic)",
    x = x_var,
    y = y_var,
    fill = "Predicted",
    color = "Actual"
  ) +
  theme_minimal()
print(p3)
```