This dataset is the CDC Diabetes Health Indicators Dataset
(BRFSS 2015), which contains health survey data from the
Behavioral Risk Factor Surveillance System. The response variable
Diabetes_012 has three ordered categories:
0 = No Diabetes | 1 = Pre-Diabetes | 2 = Diabetes
Since the target variable has more than two categories, multiclass Discriminant Analysis is applied. Six assumptions are verified before modeling.
Variables used in this analysis:

- BMI (continuous)
- GenHlth (1–5)
- MentHlth (0–30), PhysHlth (0–30)
- Age (1–13), Education (1–6), Income (1–8)
- Diabetes_012 — target, 3 classes (0, 1, 2)

pkgs <- c("MASS", "car", "ggplot2", "tidyverse", "corrplot",
          "gridExtra", "nnet", "psych", "biotools", "kableExtra")
for (p in pkgs) {
  if (!require(p, character.only = TRUE)) install.packages(p, dependencies = TRUE)
  library(p, character.only = TRUE)
}
# Load dataset
df_raw <- read.csv("diabetes_012_health_indicators_BRFSS2015.csv")
# Select only variables used in analysis
used_vars <- c("Diabetes_012", "BMI", "GenHlth", "MentHlth",
               "PhysHlth", "Age", "Education", "Income")
df <- df_raw[, used_vars]
# Define variable roles
cont_var <- "BMI"                                  # 1 continuous
ord_vars <- c("GenHlth", "MentHlth", "PhysHlth",   # 6 ordinal
              "Age", "Education", "Income")
all_pred <- c(cont_var, ord_vars)                  # all predictors
# Target as ordered factor
df$Diabetes_012 <- factor(df$Diabetes_012,
                          levels = c(0, 1, 2),
                          labels = c("No_Diabetes", "Pre_Diabetes", "Diabetes"),
                          ordered = TRUE)
cat("Full dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")## Full dataset dimensions: 253680 rows x 8 columns
##
## Target class distribution (full data):
##
## No_Diabetes Pre_Diabetes Diabetes
## 213703 4631 35346
##
## Missing values:
## Diabetes_012 BMI GenHlth MentHlth PhysHlth Age
## 0 0 0 0 0 0
## Education Income
## 0 0
glimpse(df)
## Rows: 253,680
## Columns: 8
## $ Diabetes_012 <ord> No_Diabetes, No_Diabetes, No_Diabetes, No_Diabetes, No_Di…
## $ BMI <dbl> 40, 25, 28, 27, 24, 25, 30, 25, 30, 24, 25, 34, 26, 28, 3…
## $ GenHlth <dbl> 5, 3, 5, 2, 2, 2, 3, 3, 5, 2, 3, 3, 3, 4, 4, 2, 3, 2, 2, …
## $ MentHlth <dbl> 18, 0, 30, 0, 3, 0, 0, 0, 30, 0, 0, 0, 0, 0, 30, 5, 0, 0,…
## $ PhysHlth <dbl> 15, 0, 30, 0, 0, 2, 14, 0, 30, 0, 0, 30, 15, 0, 28, 0, 0,…
## $ Age <dbl> 9, 7, 9, 11, 11, 10, 9, 11, 9, 8, 13, 10, 7, 11, 4, 6, 10…
## $ Education <dbl> 4, 6, 4, 3, 5, 6, 6, 4, 5, 4, 6, 5, 5, 4, 6, 6, 4, 5, 6, …
## $ Income <dbl> 3, 1, 8, 6, 4, 8, 7, 4, 1, 3, 8, 1, 7, 6, 2, 8, 3, 6, 7, …
The following preprocessing pipeline is applied before assumption testing:
library(MASS)
# Step 1: Remove duplicates
df_clean <- df[!duplicated(df[, all_pred]), ]
cat("After removing duplicates:", nrow(df_clean), "rows\n")## After removing duplicates: 130533 rows
# Step 2: Balanced stratified sampling — 70 per class
set.seed(42)
n_per_class <- 70
df_balanced <- df_clean %>%
  group_by(Diabetes_012) %>%
  slice_sample(n = n_per_class) %>%
  ungroup()
cat("\nClass distribution after balanced sampling:\n")##
## Class distribution after balanced sampling:
##
## No_Diabetes Pre_Diabetes Diabetes
## 70 70 70
## Total rows: 210
# Step 3: Yeo-Johnson transformation on all predictors
yj_transform <- function(x) {
  # Yeo-Johnson handles any real value (including 0 and negatives):
  # estimate lambda by maximum likelihood, then apply the transform
  pt <- car::powerTransform(x ~ 1, family = "yjPower")
  car::yjPower(x, pt$lambda)
}
df_trans <- df_balanced
for (v in all_pred) {
  df_trans[[v]] <- tryCatch(
    yj_transform(df_balanced[[v]]),
    error = function(e) df_balanced[[v]]  # fallback: keep original values
  )
}
cat("\nYeo-Johnson transformation applied to:", paste(all_pred, collapse = ", "), "\n")##
## Yeo-Johnson transformation applied to: BMI, GenHlth, MentHlth, PhysHlth, Age, Education, Income
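As a quick sanity check on the transformation (a minimal sketch, not part of the original pipeline), per-variable skewness before and after can be compared with psych::skew(), which is already loaded:

# Hypothetical check: skewness before vs. after Yeo-Johnson
skew_check <- data.frame(
  Before = sapply(df_balanced[, all_pred], psych::skew),
  After  = sapply(df_trans[, all_pred], psych::skew)
)
print(round(skew_check, 3))  # values near 0 indicate roughly symmetric data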
# Step 4: Multivariate outlier removal using Mahalanobis distance
X_mat <- as.matrix(df_trans[, all_pred])
mah_dist <- mahalanobis(X_mat, colMeans(X_mat), cov(X_mat))
cutoff <- qchisq(0.975, df = ncol(X_mat)) # chi-sq 97.5% threshold
n_before <- nrow(df_trans)
df_final <- df_trans[mah_dist <= cutoff, ]
n_after <- nrow(df_final)
cat(sprintf("\nOutliers removed (Mahalanobis): %d rows\n", n_before - n_after))##
## Outliers removed (Mahalanobis): 3 rows
## Final dataset: 207 rows
##
## Final class distribution:
##
## No_Diabetes Pre_Diabetes Diabetes
## 69 68 70
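As a complementary visual check (a minimal sketch, not in the original pipeline), the squared Mahalanobis distances of the retained rows can be compared with the chi-square distribution they should follow under approximate multivariate normality:

# Hypothetical diagnostic: chi-square Q-Q plot of squared Mahalanobis distances
mah_chk <- mahalanobis(as.matrix(df_final[, all_pred]),
                       colMeans(df_final[, all_pred]),
                       cov(df_final[, all_pred]))
qqplot(qchisq(ppoints(length(mah_chk)), df = length(all_pred)), mah_chk,
       xlab = "Chi-square quantiles", ylab = "Squared Mahalanobis distance")
abline(0, 1, col = "red")  # points near the line support approximate MVN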
The linearity assumption requires an approximately linear relationship between the log-odds of the outcome and each continuous/ordinal predictor. The Box-Tidwell approach adds an interaction term X × ln(X) for each predictor to a multinomial logistic model; if the interaction term is not significant (p > 0.05), the relationship is considered linear.
df_bt <- df_final
# Create interaction terms X * ln(X) for Box-Tidwell
for (v in all_pred) {
  x <- df_bt[[v]]
  # shift to strictly positive values so ln(x) is defined
  if (any(x <= 0, na.rm = TRUE)) x <- x - min(x, na.rm = TRUE) + 0.001
  df_bt[[paste0(v, "_ln")]] <- x * log(x)
}
# Unordered target for multinom
df_bt$target_unord <- relevel(
  factor(as.character(df_bt$Diabetes_012)),
  ref = "No_Diabetes"
)
bt_formula <- as.formula(paste(
  "target_unord ~",
  paste(all_pred, collapse = " + "), "+",
  paste(paste0(all_pred, "_ln"), collapse = " + ")
))
set.seed(42)
model_bt <- multinom(bt_formula, data = df_bt, trace = FALSE, maxit = 500)
# Wald z-statistics and two-sided p-values for each coefficient
z_bt <- summary(model_bt)$coefficients / summary(model_bt)$standard.errors
p_bt <- 2 * (1 - pnorm(abs(z_bt)))
ln_terms <- paste0(all_pred, "_ln")
# take the smaller p-value across the two logits (most conservative)
p_ln <- apply(p_bt[, ln_terms, drop = FALSE], 2, min)
bt_result <- data.frame(
  Variable = all_pred,
  p_value = round(p_ln, 4),
  Interpretation = ifelse(p_ln > 0.05, "✅ Satisfied", "❌ Not Satisfied")
)
rownames(bt_result) <- NULL
knitr::kable(bt_result, caption = "Box-Tidwell Test Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

| Variable | p_value | Interpretation |
|---|---|---|
| BMI | 0.0018 | ❌ Not Satisfied |
| GenHlth | 0.2861 | ✅ Satisfied |
| MentHlth | 0.1683 | ✅ Satisfied |
| PhysHlth | 0.0345 | ❌ Not Satisfied |
| Age | 0.0426 | ❌ Not Satisfied |
| Education | 0.0274 | ❌ Not Satisfied |
| Income | 0.2115 | ✅ Satisfied |
n_pass_lin <- sum(p_ln > 0.05)
cat(sprintf("\nVariables satisfying linearity: %d / %d\n",
            n_pass_lin, length(all_pred)))
##
## Variables satisfying linearity: 3 / 7
lin_status <- ifelse(n_pass_lin == length(all_pred), "✅ Satisfied",
                     ifelse(n_pass_lin >= length(all_pred) * 0.5,
                            "⚠️ Partially Satisfied", "❌ Not Satisfied"))

Interpretation: 3 out of 7 variables satisfy the linearity assumption (p > 0.05) after the Yeo-Johnson transformation. The linearity assumption is considered ❌ Not Satisfied.
n_obs <- nrow(df_final)
n_dup <- sum(duplicated(df_final[, all_pred]))
cat("Number of observations :", n_obs, "\n")## Number of observations : 207
## Number of duplicates : 0
Interpretation: The dataset is a cross-sectional health survey where each row represents a unique individual respondent. Duplicates were removed during preprocessing (duplicates remaining = 0). Therefore, the independence assumption is satisfied ✅.
cor_mat <- cor(df_final[, all_pred], use = "complete.obs")
corrplot(cor_mat,
         method = "color",
         type = "upper",
         addCoef.col = "black",
         number.cex = 0.75,
         tl.cex = 0.85,
         col = colorRampPalette(c("#2166AC", "white", "#D6604D"))(200),
         title = "Correlation Matrix – Predictors (After Yeo-Johnson Transformation)",
         mar = c(0, 0, 2, 0))

df_vif <- df_final
df_vif$target_num <- as.numeric(df_final$Diabetes_012)
vif_formula <- as.formula(paste(
  "target_num ~", paste(all_pred, collapse = " + ")
))
lm_vif <- lm(vif_formula, data = df_vif)
vif_val <- vif(lm_vif)
vif_df <- data.frame(
  Variable = names(vif_val),
  VIF = round(vif_val, 3),
  Interpretation = ifelse(vif_val < 5, "✅ Safe",
                          ifelse(vif_val < 10, "⚠️ Moderate", "❌ High"))
)
rownames(vif_df) <- NULL
knitr::kable(vif_df, caption = "VIF Values of All Predictors") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

| Variable | VIF | Interpretation |
|---|---|---|
| BMI | 1.115 | ✅ Safe |
| GenHlth | 1.673 | ✅ Safe |
| MentHlth | 1.134 | ✅ Safe |
| PhysHlth | 1.435 | ✅ Safe |
| Age | 1.141 | ✅ Safe |
| Education | 1.248 | ✅ Safe |
| Income | 1.216 | ✅ Safe |
max_vif <- max(vif_val)
cat(sprintf("\nMaximum VIF: %.3f\n", max_vif))
##
## Maximum VIF: 1.673
mc_status <- ifelse(max_vif < 5, "✅ Satisfied",
                    ifelse(max_vif < 10, "⚠️ Moderate concern", "❌ Not Satisfied"))

Interpretation: Maximum VIF = 1.673. All VIF values are below 5, indicating no harmful multicollinearity. The no-multicollinearity assumption is ✅ Satisfied.
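For intuition, any individual VIF can be reproduced from first principles as 1 / (1 − R²), where R² comes from regressing that predictor on all the others. A hypothetical cross-check for BMI:

# Hypothetical cross-check: manual VIF for BMI
r2_bmi <- summary(lm(BMI ~ GenHlth + MentHlth + PhysHlth + Age +
                       Education + Income, data = df_vif))$r.squared
1 / (1 - r2_bmi)  # should reproduce vif_val["BMI"] = 1.115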
# Recheck on final data
X_final <- as.matrix(df_final[, all_pred])
mah_final <- mahalanobis(X_final, colMeans(X_final), cov(X_final))
cutoff <- qchisq(0.975, df = ncol(X_final))
n_out <- sum(mah_final > cutoff)
cat(sprintf("Mahalanobis distance cutoff (chi-sq 97.5%%): %.3f\n", cutoff))## Mahalanobis distance cutoff (chi-sq 97.5%): 16.013
## Remaining outliers after removal: 0
# IQR check per variable
detect_outliers <- function(x, var_name) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  n_out <- sum(x < lower | x > upper, na.rm = TRUE)
  data.frame(Variable = var_name, Lower = round(lower, 3), Upper = round(upper, 3),
             Outliers = n_out, Percent = round(n_out / length(x) * 100, 2))
}
outlier_tbl <- do.call(rbind, lapply(all_pred,
                                     function(v) detect_outliers(df_final[[v]], v)))
rownames(outlier_tbl) <- NULL
knitr::kable(outlier_tbl,
             caption = "IQR Outlier Check After Yeo-Johnson + Mahalanobis Removal") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

| Variable | Lower | Upper | Outliers | Percent |
|---|---|---|---|---|
| BMI | 1.504 | 1.696 | 2 | 0.97 |
| GenHlth | 1.536 | 6.665 | 10 | 4.83 |
| MentHlth | -2.195 | 3.658 | 0 | 0.00 |
| PhysHlth | -3.207 | 5.344 | 0 | 0.00 |
| Age | 0.932 | 34.532 | 0 | 0.00 |
| Education | -3.689 | 33.049 | 0 | 0.00 |
| Income | -4.631 | 16.971 | 0 | 0.00 |
plots_out <- lapply(all_pred, function(v) {
  ggplot(df_final, aes(y = .data[[v]])) +
    geom_boxplot(fill = "#74C476", outlier.color = "red", outlier.size = 1.5) +
    labs(title = v, x = NULL, y = NULL) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, size = 10))
})
grid.arrange(grobs = plots_out, ncol = 4,
             top = "Boxplot After Yeo-Johnson Transformation + Mahalanobis Outlier Removal")
total_out <- sum(outlier_tbl$Outliers)
# Assumed status rule (not shown in the source): zero outliers = satisfied;
# a small share (< 10% of rows) = partially satisfied
out_status <- ifelse(total_out == 0, "✅ Satisfied",
                     ifelse(total_out / nrow(df_final) < 0.10,
                            "⚠️ Partially Satisfied", "❌ Not Satisfied"))

Interpretation: After the Yeo-Johnson transformation and Mahalanobis-based outlier removal, 12 univariate (IQR) outliers remain in total. The no-outliers assumption is ⚠️ Partially Satisfied.
Discriminant Analysis assumes that predictors follow a multivariate normal distribution within each group. Mardia’s test checks two components (skewness and kurtosis). If both p-values > 0.05, normality holds.
The test is conducted within each class using all 7 predictors (BMI + 6 ordinal).
groups <- levels(df_final$Diabetes_012)
safe_mardia <- function(data_sub, group_name) {
  sub <- data_sub[, all_pred, drop = FALSE]
  # Remove near-constant columns
  non_const <- sapply(sub, function(x) var(x, na.rm = TRUE) > 1e-10)
  sub <- sub[, non_const, drop = FALSE]
  if (ncol(sub) < 2) {
    return(data.frame(Group = group_name, n = nrow(data_sub),
                      Skew_p = NA, Kurt_p = NA,
                      Skewness = "⚠️ Skipped", Kurtosis = "⚠️ Skipped"))
  }
  # Guard against an ill-conditioned covariance matrix
  S <- cov(sub)
  if (kappa(S, exact = TRUE) > 1e12) {
    return(data.frame(Group = group_name, n = nrow(data_sub),
                      Skew_p = NA, Kurt_p = NA,
                      Skewness = "⚠️ Singular matrix", Kurtosis = "⚠️ Singular matrix"))
  }
  res <- tryCatch(psych::mardia(sub, plot = FALSE), error = function(e) NULL)
  if (is.null(res)) {
    return(data.frame(Group = group_name, n = nrow(data_sub),
                      Skew_p = NA, Kurt_p = NA,
                      Skewness = "⚠️ Error", Kurtosis = "⚠️ Error"))
  }
  p_skew <- res$p.skew
  # res$kurtosis is approximately standard normal under H0,
  # so a two-sided p-value follows from the normal distribution
  p_kurt <- 2 * (1 - pnorm(abs(res$kurtosis)))
  data.frame(
    Group = group_name,
    n = nrow(data_sub),
    Skew_p = round(p_skew, 4),
    Kurt_p = round(p_kurt, 4),
    Skewness = ifelse(p_skew >= 0.05, "✅ Normal", "❌ Not Normal"),
    Kurtosis = ifelse(p_kurt >= 0.05, "✅ Normal", "❌ Not Normal")
  )
}
mvn_results <- lapply(groups, function(g) {
  safe_mardia(df_final[df_final$Diabetes_012 == g, ], g)
})
mvn_df <- do.call(rbind, mvn_results)
rownames(mvn_df) <- NULL
knitr::kable(mvn_df,
             caption = "Mardia's Test — Within-Group Multivariate Normality") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

| Group | n | Skew_p | Kurt_p | Skewness | Kurtosis |
|---|---|---|---|---|---|
| No_Diabetes | 69 | 0.7049 | 0.0445 | ✅ Normal | ❌ Not Normal |
| Pre_Diabetes | 68 | 0.9807 | 0.0060 | ✅ Normal | ❌ Not Normal |
| Diabetes | 70 | 0.6000 | 0.0044 | ✅ Normal | ❌ Not Normal |
valid <- mvn_df[!grepl("Skipped|Error|Singular", mvn_df$Skewness), ]
n_pass_skew <- sum(valid$Skewness == "✅ Normal")
n_pass_kurt <- sum(valid$Kurtosis == "✅ Normal")
cat(sprintf("\nGroups tested : %d / %d\n", nrow(valid), length(groups)))##
## Groups tested : 3 / 3
## Groups passing Skewness : 3 / 3
## Groups passing Kurtosis : 0 / 3
norm_status <- ifelse(n_pass_skew == nrow(valid) & n_pass_kurt == nrow(valid),
"✅ Satisfied",
ifelse(n_pass_skew >= nrow(valid) * 0.6 | n_pass_kurt >= nrow(valid) * 0.6,
"⚠️ Partially Satisfied", "❌ Not Satisfied"))Interpretation: 3/3 groups pass skewness and 0/3 groups pass kurtosis normality test. The multivariate normality assumption is ⚠️ Partially Satisfied. The small sample size per group (n = 70) combined with Yeo-Johnson transformation and Mahalanobis outlier removal helps achieve approximate normality within each class.
LDA assumes equal variance-covariance matrices across all groups, tested with Box’s M. H₀: all groups share a common covariance matrix; if p > 0.05, the assumption is satisfied.
A practical note: Box’s M is notoriously oversensitive in large samples, flagging even trivial covariance differences. With only n ≈ 70 per class (total = 207), the test is far less prone to that oversensitivity, though its power to detect real differences is also lower.
boxm_result <- biotools::boxM(df_final[, all_pred], df_final$Diabetes_012)
p_boxm <- boxm_result$p.value
hom_status <- ifelse(p_boxm > 0.05, "✅ Satisfied", "❌ Not Satisfied")  # for summary table
cat("Box's M Test Result:\n")
print(boxm_result)
cat(sprintf("\np-value : %.4f\n", p_boxm))
cat(sprintf("Status  : %s\n",
            ifelse(p_boxm > 0.05,
                   "✅ Homogeneity satisfied (p > 0.05)",
                   "❌ Homogeneity violated (p <= 0.05)")))
## Box's M Test Result:
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df_final[, all_pred]
## Chi-Sq (approx.) = 53.145, df = 56, p-value = 0.5836
##
## p-value : 0.5836
## Status : ✅ Homogeneity satisfied (p > 0.05)
Interpretation: Box’s M test p-value = 0.5836. The homogeneity of covariance assumption is ✅ Satisfied. With a small balanced sample (n = 70 per class) and Yeo-Johnson-transformed predictors, the covariance matrices of the three diabetes groups are statistically indistinguishable.
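With all six checks summarized below, the discriminant model itself is the natural next step. A minimal forward-looking sketch (the actual modeling is outside this assumption-checking section; names here are illustrative):

# Hypothetical next step: fit LDA on the final data and inspect
# resubstitution accuracy (an optimistic estimate, shown for illustration)
lda_fit <- MASS::lda(Diabetes_012 ~ BMI + GenHlth + MentHlth + PhysHlth +
                       Age + Education + Income, data = df_final)
lda_pred <- predict(lda_fit)
table(Predicted = lda_pred$class, Actual = df_final$Diabetes_012)
mean(lda_pred$class == df_final$Diabetes_012)  # resubstitution accuracy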
summary_df <- data.frame(
  No = 1:6,
  Assumption = c("Linearity", "Independence", "No Multicollinearity",
                 "No Outliers", "Multivariate Normality", "Homogeneity of Variance"),
  Method = c("Box-Tidwell Test", "Cross-sectional survey design",
             "VIF + Correlation Matrix",
             "Mahalanobis Distance + IQR Check",
             "Mardia's Test (within-group)",
             "Box's M Test"),
  Treatment = c(
    "Yeo-Johnson transformation applied to all predictors",
    "Duplicates removed; each row = unique respondent",
    "No high correlation; all VIF < 5",
    "Yeo-Johnson + Mahalanobis outlier removal",
    "Yeo-Johnson + outlier removal + balanced sampling (n=70/class)",
    "Balanced stratified sampling (n=70/class) + Yeo-Johnson"
  ),
  Result = c(
    sprintf("%d/%d variables pass", n_pass_lin, length(all_pred)),
    sprintf("0 duplicates, n = %d", n_obs),
    sprintf("All VIF < 5 (max = %.2f)", max_vif),
    sprintf("%d total outliers remaining", total_out),
    sprintf("Skewness: %d/%d, Kurtosis: %d/%d pass",
            n_pass_skew, nrow(valid), n_pass_kurt, nrow(valid)),
    sprintf("p-value = %.4f", p_boxm)
  ),
  Status = c(lin_status, "✅ Satisfied", mc_status,
             out_status, norm_status, hom_status)
)
knitr::kable(summary_df,
             caption = "Summary of All 6 Assumption Tests – Discriminant Analysis") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered"),
                full_width = TRUE) %>%
  column_spec(6, bold = TRUE,
              color = ifelse(grepl("✅", summary_df$Status), "#1e8449",
                             ifelse(grepl("⚠️", summary_df$Status), "#d35400",
                                    "#c0392b")))

| No | Assumption | Method | Treatment | Result | Status |
|---|---|---|---|---|---|
| 1 | Linearity | Box-Tidwell Test | Yeo-Johnson transformation applied to all predictors | 3/7 variables pass | ❌ Not Satisfied |
| 2 | Independence | Cross-sectional survey design | Duplicates removed; each row = unique respondent | 0 duplicates, n = 207 | ✅ Satisfied |
| 3 | No Multicollinearity | VIF + Correlation Matrix | No high correlation; all VIF < 5 | All VIF < 5 (max = 1.67) | ✅ Satisfied |
| 4 | No Outliers | Mahalanobis Distance + IQR Check | Yeo-Johnson + Mahalanobis outlier removal | 12 total outliers remaining | ⚠️ Partially Satisfied |
| 5 | Multivariate Normality | Mardia’s Test (within-group) | Yeo-Johnson + outlier removal + balanced sampling (n=70/class) | Skewness: 3/3, Kurtosis: 0/3 pass | ⚠️ Partially Satisfied |
| 6 | Homogeneity of Variance | Box’s M Test | Balanced stratified sampling (n=70/class) + Yeo-Johnson | p-value = 0.5836 | ✅ Satisfied |
| Item | Detail |
|---|---|
| Dataset | CDC Diabetes Health Indicators (BRFSS 2015) |
| Source | Kaggle / UCI |
| File used | diabetes_012_health_indicators_BRFSS2015.csv |
| Continuous predictor | BMI |
| Ordinal predictors | GenHlth, MentHlth, PhysHlth, Age, Education, Income |
| Target | Diabetes_012 (3 classes: No Diabetes, Pre-Diabetes, Diabetes) |
| Final n (after preprocessing) | 207 rows |
| n per class | 70 |
Prepared for the Generalized Linear Models practicum – Data Science, UNESA