set.seed(123)
train_index <- createDataPartition(
iris2$Species,
p = 0.7,
list = FALSE
)
train_data <- iris2[train_index, ]
test_data <- iris2[-train_index, ]
logit_quad <- glm(
Species ~ Petal.Length + Petal.Width +
I(Petal.Length^2) + I(Petal.Width^2) +
Petal.Length:Petal.Width,
data = train_data,
family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_quad)
##
## Call:
## glm(formula = Species ~ Petal.Length + Petal.Width + I(Petal.Length^2) +
## I(Petal.Width^2) + Petal.Length:Petal.Width, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1474 230864 0.006 0.995
## Petal.Length -5779 807482 -0.007 0.994
## Petal.Width 13591 1894776 0.007 0.994
## I(Petal.Length^2) 1471 205855 0.007 0.994
## I(Petal.Width^2) 3249 474290 0.007 0.995
## Petal.Length:Petal.Width -4933 693997 -0.007 0.994
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 97.0406 on 69 degrees of freedom
## Residual deviance: 3.8191 on 64 degrees of freedom
## AIC: 15.819
##
## Number of Fisher Scoring iterations: 25
logit_prob <- predict(
logit_quad,
newdata = test_data,
type = "response"
)
logit_pred <- ifelse(
logit_prob > 0.5,
levels(train_data$Species)[2],
levels(train_data$Species)[1]
)
logit_pred <- factor(logit_pred, levels = levels(test_data$Species))
logit_accuracy <- mean(logit_pred == test_data$Species)
logit_accuracy
## [1] 0.8666667
qda_model <- qda(
Species ~ Petal.Length + Petal.Width,
data = train_data
)
qda_pred <- predict(qda_model, newdata = test_data)
qda_class <- qda_pred$class
# Accuracy
qda_accuracy <- mean(qda_class == test_data$Species)
qda_accuracy
## [1] 1
confusionMatrix(logit_pred, test_data$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction versicolor virginica
## versicolor 14 3
## virginica 1 12
##
## Accuracy : 0.8667
## 95% CI : (0.6928, 0.9624)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.974e-05
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9333
## Specificity : 0.8000
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.9231
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : versicolor
##
confusionMatrix(qda_class, test_data$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction versicolor virginica
## versicolor 15 0
## virginica 0 15
##
## Accuracy : 1
## 95% CI : (0.8843, 1)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 9.313e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0
## Specificity : 1.0
## Pos Pred Value : 1.0
## Neg Pred Value : 1.0
## Prevalence : 0.5
## Detection Rate : 0.5
## Detection Prevalence : 0.5
## Balanced Accuracy : 1.0
##
## 'Positive' Class : versicolor
##
#Plot 1: Original Data
ggplot(iris2, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
geom_point(size = 3, alpha = 0.8) +
labs(
title = "Iris Data: Versicolor vs Virginica",
x = "Petal Length",
y = "Petal Width",
color = "Species"
) +
theme_minimal(base_size = 14)
#Plot 2: Logistic Regression with Quadratic Features
# Create grid for decision boundary
grid <- expand.grid(
Petal.Length = seq(
min(iris2$Petal.Length) - 0.2,
max(iris2$Petal.Length) + 0.2,
length.out = 300
),
Petal.Width = seq(
min(iris2$Petal.Width) - 0.2,
max(iris2$Petal.Width) + 0.2,
length.out = 300
)
)
# Predict class probabilities on grid
grid$logit_prob <- predict(
logit_quad,
newdata = grid,
type = "response"
)
grid$logit_class <- ifelse(
grid$logit_prob > 0.5,
levels(train_data$Species)[2],
levels(train_data$Species)[1]
)
grid$logit_class <- factor(grid$logit_class, levels = levels(iris2$Species))
# Plot decision boundary
ggplot() +
geom_tile(
data = grid,
aes(
x = Petal.Length,
y = Petal.Width,
fill = logit_class
),
alpha = 0.35
) +
geom_contour(
data = grid,
aes(
x = Petal.Length,
y = Petal.Width,
z = logit_prob
),
breaks = 0.5,
color = "black",
linewidth = 1
) +
geom_point(
data = iris2,
aes(
x = Petal.Length,
y = Petal.Width,
color = Species
),
size = 3
) +
scale_fill_manual(
values = c("versicolor" = "skyblue", "virginica" = "pink")
) +
scale_color_manual(
values = c("versicolor" = "blue", "virginica" = "red")
) +
labs(
title = "Logistic Regression with Quadratic Features",
subtitle = "Black curve shows decision boundary",
x = "Petal Length",
y = "Petal Width",
fill = "Predicted Class",
color = "Actual Species"
) +
theme_minimal(base_size = 14)
Logistic Regression with Quadratic Features adds squared terms such as
Petal.Length^2 and Petal.Width^2, plus an interaction term. This allows
logistic regression to create a curved decision boundary instead of only
a straight line.
#Plot 3: QDA Decision Boundary
# Predict QDA class and posterior probability on grid
qda_grid_pred <- predict(qda_model, newdata = grid)
grid$qda_class <- qda_grid_pred$class
grid$qda_prob <- qda_grid_pred$posterior[, 2]
# Plot QDA decision boundary
ggplot() +
geom_tile(
data = grid,
aes(
x = Petal.Length,
y = Petal.Width,
fill = qda_class
),
alpha = 0.35
) +
geom_contour(
data = grid,
aes(
x = Petal.Length,
y = Petal.Width,
z = qda_prob
),
breaks = 0.5,
color = "black",
linewidth = 1
) +
geom_point(
data = iris2,
aes(
x = Petal.Length,
y = Petal.Width,
color = Species
),
size = 3
) +
scale_fill_manual(
values = c("versicolor" = "lightgreen", "virginica" = "orange")
) +
scale_color_manual(
values = c("versicolor" = "darkgreen", "virginica" = "darkorange")
) +
labs(
title = "Quadratic Discriminant Analysis Decision Boundary",
subtitle = "Black curve shows QDA classification boundary",
x = "Petal Length",
y = "Petal Width",
fill = "Predicted Class",
color = "Actual Species"
) +
theme_minimal(base_size = 14)
QDA naturally creates curved decision boundaries because it allows each class to have its own covariance structure. This makes QDA more flexible than linear discriminant analysis.
#Plot 4: Accuracy Comparison
accuracy_df <- data.frame(
Model = c(
"Logistic Regression\nwith Quadratic Features",
"QDA"
),
Accuracy = c(logit_accuracy, qda_accuracy)
)
ggplot(accuracy_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_col(width = 0.6) +
geom_text(
aes(label = round(Accuracy, 3)),
vjust = -0.5,
size = 5
) +
scale_fill_manual(
values = c(
"Logistic Regression\nwith Quadratic Features" = "purple",
"QDA" = "goldenrod"
)
) +
ylim(0, 1.1) +
labs(
title = "Model Accuracy Comparison",
x = "Model",
y = "Accuracy"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none")