================================

QDA vs Logistic Regression

with Quadratic Features

================================

2. Train-test split

set.seed(123)

train_index <- createDataPartition(
  iris2$Species,
  p = 0.7,
  list = FALSE
)

train_data <- iris2[train_index, ]
test_data  <- iris2[-train_index, ]

3. Logistic Regression

with Quadratic Features

logit_quad <- glm(
  Species ~ Petal.Length + Petal.Width +
    I(Petal.Length^2) + I(Petal.Width^2) +
    Petal.Length:Petal.Width,
  data = train_data,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_quad)
## 
## Call:
## glm(formula = Species ~ Petal.Length + Petal.Width + I(Petal.Length^2) + 
##     I(Petal.Width^2) + Petal.Length:Petal.Width, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                  1474     230864   0.006    0.995
## Petal.Length                -5779     807482  -0.007    0.994
## Petal.Width                 13591    1894776   0.007    0.994
## I(Petal.Length^2)            1471     205855   0.007    0.994
## I(Petal.Width^2)             3249     474290   0.007    0.995
## Petal.Length:Petal.Width    -4933     693997  -0.007    0.994
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 97.0406  on 69  degrees of freedom
## Residual deviance:  3.8191  on 64  degrees of freedom
## AIC: 15.819
## 
## Number of Fisher Scoring iterations: 25

Predict probabilities

logit_prob <- predict(
  logit_quad,
  newdata = test_data,
  type = "response"
)

Convert probabilities into class labels

logit_pred <- ifelse(
  logit_prob > 0.5,
  levels(train_data$Species)[2],
  levels(train_data$Species)[1]
)

logit_pred <- factor(logit_pred, levels = levels(test_data$Species))

Accuracy

logit_accuracy <- mean(logit_pred == test_data$Species)
logit_accuracy
## [1] 0.8666667

4. Quadratic Discriminant Analysis

qda_model <- qda(
  Species ~ Petal.Length + Petal.Width,
  data = train_data
)

qda_pred <- predict(qda_model, newdata = test_data)

qda_class <- qda_pred$class

# Accuracy
qda_accuracy <- mean(qda_class == test_data$Species)
qda_accuracy
## [1] 1

5. Confusion Matrices

confusionMatrix(logit_pred, test_data$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         14         3
##   virginica           1        12
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.6928, 0.9624)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 2.974e-05       
##                                           
##                   Kappa : 0.7333          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.8667          
##                                           
##        'Positive' Class : versicolor      
## 
confusionMatrix(qda_class, test_data$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         15         0
##   virginica           0        15
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8843, 1)
##     No Information Rate : 0.5        
##     P-Value [Acc > NIR] : 9.313e-10  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0        
##             Specificity : 1.0        
##          Pos Pred Value : 1.0        
##          Neg Pred Value : 1.0        
##              Prevalence : 0.5        
##          Detection Rate : 0.5        
##    Detection Prevalence : 0.5        
##       Balanced Accuracy : 1.0        
##                                      
##        'Positive' Class : versicolor 
## 

#Plot 1: Original Data

ggplot(iris2, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "Iris Data: Versicolor vs Virginica",
    x = "Petal Length",
    y = "Petal Width",
    color = "Species"
  ) +
  theme_minimal(base_size = 14)

#Plot 2: Logistic Regression with Quadratic Features

# Create grid for decision boundary
grid <- expand.grid(
  Petal.Length = seq(
    min(iris2$Petal.Length) - 0.2,
    max(iris2$Petal.Length) + 0.2,
    length.out = 300
  ),
  Petal.Width = seq(
    min(iris2$Petal.Width) - 0.2,
    max(iris2$Petal.Width) + 0.2,
    length.out = 300
  )
)

# Predict class probabilities on grid
grid$logit_prob <- predict(
  logit_quad,
  newdata = grid,
  type = "response"
)

grid$logit_class <- ifelse(
  grid$logit_prob > 0.5,
  levels(train_data$Species)[2],
  levels(train_data$Species)[1]
)

grid$logit_class <- factor(grid$logit_class, levels = levels(iris2$Species))

# Plot decision boundary
ggplot() +
  geom_tile(
    data = grid,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      fill = logit_class
    ),
    alpha = 0.35
  ) +
  geom_contour(
    data = grid,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      z = logit_prob
    ),
    breaks = 0.5,
    color = "black",
    linewidth = 1
  ) +
  geom_point(
    data = iris2,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      color = Species
    ),
    size = 3
  ) +
  scale_fill_manual(
    values = c("versicolor" = "skyblue", "virginica" = "pink")
  ) +
  scale_color_manual(
    values = c("versicolor" = "blue", "virginica" = "red")
  ) +
  labs(
    title = "Logistic Regression with Quadratic Features",
    subtitle = "Black curve shows decision boundary",
    x = "Petal Length",
    y = "Petal Width",
    fill = "Predicted Class",
    color = "Actual Species"
  ) +
  theme_minimal(base_size = 14)

Logistic Regression with Quadratic Features adds squared terms such as Petal.Length^2 and Petal.Width^2, plus an interaction term. This allows logistic regression to create a curved decision boundary instead of only a straight line.

#Plot 3: QDA Decision Boundary

# Predict QDA class and posterior probability on grid
qda_grid_pred <- predict(qda_model, newdata = grid)

grid$qda_class <- qda_grid_pred$class
grid$qda_prob <- qda_grid_pred$posterior[, 2]

# Plot QDA decision boundary
ggplot() +
  geom_tile(
    data = grid,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      fill = qda_class
    ),
    alpha = 0.35
  ) +
  geom_contour(
    data = grid,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      z = qda_prob
    ),
    breaks = 0.5,
    color = "black",
    linewidth = 1
  ) +
  geom_point(
    data = iris2,
    aes(
      x = Petal.Length,
      y = Petal.Width,
      color = Species
    ),
    size = 3
  ) +
  scale_fill_manual(
    values = c("versicolor" = "lightgreen", "virginica" = "orange")
  ) +
  scale_color_manual(
    values = c("versicolor" = "darkgreen", "virginica" = "darkorange")
  ) +
  labs(
    title = "Quadratic Discriminant Analysis Decision Boundary",
    subtitle = "Black curve shows QDA classification boundary",
    x = "Petal Length",
    y = "Petal Width",
    fill = "Predicted Class",
    color = "Actual Species"
  ) +
  theme_minimal(base_size = 14)

QDA naturally creates curved decision boundaries because it allows each class to have its own covariance structure. This makes QDA more flexible than linear discriminant analysis.

#Plot 4: Accuracy Comparison

accuracy_df <- data.frame(
  Model = c(
    "Logistic Regression\nwith Quadratic Features",
    "QDA"
  ),
  Accuracy = c(logit_accuracy, qda_accuracy)
)

ggplot(accuracy_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_col(width = 0.6) +
  geom_text(
    aes(label = round(Accuracy, 3)),
    vjust = -0.5,
    size = 5
  ) +
  scale_fill_manual(
    values = c(
      "Logistic Regression\nwith Quadratic Features" = "purple",
      "QDA" = "goldenrod"
    )
  ) +
  ylim(0, 1.1) +
  labs(
    title = "Model Accuracy Comparison",
    x = "Model",
    y = "Accuracy"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")