Setup and Data

library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(car)
library(gtsummary)
brfss_mlr <- readRDS("brfss_slr_2020.rds")

Round 1: Student A Drives, Student B Navigates


Task 1: Fitting Models and ANOVA Tables (15 points)

1a. Fit the three-predictor model (5 pts)

We fit the model:

\[\text{menthlth\_days} = \beta_0 + \beta_1 \cdot \text{physhlth\_days} + \beta_2 \cdot \text{sleep\_hrs} + \beta_3 \cdot \text{age} + \varepsilon\]

# Fit the 3-predictor model
m_3pred <-brfss_mlr %>%  lm(phys_days + sleep_hrs + age,
              data = brfss_mlr)

# Display coefficients with confidence intervals
tidy(m_3pred, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption = "Table 1a. Three-Predictor Model Coefficients",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic",
                  "p-value", "95% CI Lower", "95% CI Upper")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 1a. Three-Predictor Model Coefficients
Term Estimate Std. Error t-statistic p-value 95% CI Lower 95% CI Upper
(Intercept) 27.6840 0.5663 48.8892 0.0000 26.5737 28.7943
age -0.0013 0.0068 -0.1899 0.8494 -0.0147 0.0121
sexMale -0.1078 0.2248 -0.4795 0.6317 -0.5486 0.3330
phys_days 0.1367 0.0102 13.3962 0.0000 0.1167 0.1568
sleep_hrs 0.0397 0.0680 0.5832 0.5598 -0.0937 0.1731
# Extract coefficients for fitted equation
coefs <- round(coef(m_3pred), 4)
cat("Fitted equation:\n")
## Fitted equation:
cat("menthlth_days =", coefs[1],
    "+", coefs[2], "* physhlth_days",
    "+", coefs[3], "* sleep_hrs",
    "+", coefs[4], "* age\n")
## menthlth_days = 27.684 + -0.0013 * physhlth_days + -0.1078 * sleep_hrs + 0.1367 * age

Fitted Equation:

\[\widehat{\text{menthlth\_days}} = \hat{\beta}_0 + \hat{\beta}_1 \cdot \text{physhlth\_days} + \hat{\beta}_2 \cdot \text{sleep\_hrs} + \hat{\beta}_3 \cdot \text{age}\]

(Rounded coefficient values will print above from your data.)

Interpretation example: For each additional hour of sleep, mental health days decrease by \(\hat{\beta}_2\) days (holding physical health days and age constant).


1b. Type I (Sequential) Sums of Squares via anova() (5 pts)

# Type I ANOVA table
anova_t1 <- anova(m_3pred)

anova_t1 %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  kable(
    caption = "Table 1b. Type I (Sequential) ANOVA — anova(m_3pred)",
    col.names = c("Source", "df", "Sum of Sq", "Mean Sq", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Predictors tested sequentially in entry order.")
Table 1b. Type I (Sequential) ANOVA — anova(m_3pred)
Source df Sum of Sq Mean Sq F value p-value
age 1 625.24 625.24 16.99 0.00
sex 1 89.01 89.01 2.42 0.12
phys_days 1 6615.62 6615.62 179.77 0.00
sleep_hrs 1 12.52 12.52 0.34 0.56
Residuals 2995 110220.21 36.80 NA NA
Note:
Predictors tested sequentially in entry order.
# Verify: sum of predictor Type I SS = SSR (Model SS)
ss_vals   <- anova_t1$`Sum Sq`
ssr_model <- sum(ss_vals[1:3])   # SS for the 3 predictors
sse       <- ss_vals[4]          # Residual SS
ssy       <- ssr_model + sse

cat("SS(physhlth_days):           ", round(ss_vals[1], 2), "\n")
## SS(physhlth_days):            625.24
cat("SS(sleep_hrs | physhlth):    ", round(ss_vals[2], 2), "\n")
## SS(sleep_hrs | physhlth):     89.01
cat("SS(age | physhlth, sleep):   ", round(ss_vals[3], 2), "\n")
## SS(age | physhlth, sleep):    6615.62
cat("-------------------------------\n")
## -------------------------------
cat("SSR (sum of predictor SS):   ", round(ssr_model, 2), "\n")
## SSR (sum of predictor SS):    7329.87
cat("SSE (Residual):              ", round(sse, 2), "\n")
## SSE (Residual):               12.52
cat("SSY (Total):                 ", round(ssy, 2), "\n")
## SSY (Total):                  7342.39
cat("R² = SSR / SSY =             ", round(ssr_model / ssy, 4), "\n")
## R² = SSR / SSY =              0.9983
cat("\nVerification from glance():\n")
## 
## Verification from glance():
glance(m_3pred) %>%
  select(r.squared, adj.r.squared, nobs) %>%
  mutate(across(everything(), ~ round(., 4))) %>%
  print()
## # A tibble: 1 × 3
##   r.squared adj.r.squared  nobs
##       <dbl>         <dbl> <dbl>
## 1    0.0625        0.0612  3000

1c. Type III (Partial) Sums of Squares and Comparison (5 pts)

# Type III ANOVA table
anova_t3 <- Anova(m_3pred, type = "III")

anova_t3 %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption = "Table 1c. Type III (Partial) ANOVA — car::Anova(Three-Predictor Model, type='III')",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Each variable tested after adjusting for ALL other predictors.")
Table 1c. Type III (Partial) ANOVA — car::Anova(Three-Predictor Model, type=‘III’)
Source Sum of Sq df F value p-value
(Intercept) 87961.1036 1 2390.1561 0.0000
age 1.3276 1 0.0361 0.8494
sex 8.4596 1 0.2299 0.6317
phys_days 6604.2651 1 179.4569 0.0000
sleep_hrs 12.5165 1 0.3401 0.5598
Residuals 110220.2077 2995 NA NA
Note:
Each variable tested after adjusting for ALL other predictors.
# Side-by-side comparison of Type I vs Type III SS
tibble(
  Variable      = c("physhlth_days", "sleep_hrs", "age"),
  `Type I SS`   = round(anova_t1$`Sum Sq`[1:3], 2),
  `Type III SS` = round(anova_t3$`Sum Sq`[2:4], 2),
  `Difference`  = round(anova_t1$`Sum Sq`[1:3] - anova_t3$`Sum Sq`[2:4], 2)
) %>%
  kable(caption = "Table 1c-2. Comparison of Type I vs. Type III SS") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "The last variable entered (age) has identical Type I and Type III SS.")
Table 1c-2. Comparison of Type I vs. Type III SS
Variable Type I SS Type III SS Difference
physhlth_days 625.24 1.33 623.91
sleep_hrs 89.01 8.46 80.55
age 6615.62 6604.27 11.35
Note:
The last variable entered (age) has identical Type I and Type III SS.

##Which variable’s SS is the same in both tables?

age — the last variable entered in the model — has identical Type I and Type III SS. This is because:

Since age is the last predictor, both conditioning sets are identical, producing the same SS value. For physhlth_days and sleep_hrs, Type I SS does not condition on variables entered later (e.g., Type I SS for physhlth_days ignores sleep_hrs and age), which is why their Type I and Type III values differ.


Task 2: Type I vs. Type III Sums of Squares (15 points)

2a. Reversed variable order — effect on Type I SS (5 pts)

# Fit the SAME model with REVERSED variable order
m_3pred_rev <- brfss_mlr %>% lm( age + sleep_hrs + phys_days,
                  data = brfss_mlr)

# Type I ANOVA on the reversed model
anova_rev <- anova(m_3pred_rev)

anova_rev %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  kable(
    caption = "Table 2a. Type I ANOVA — Reversed Order (age → sleep_hrs → physhlth_days)",
    col.names = c("Source", "df", "Sum of Sq", "Mean Sq", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 2a. Type I ANOVA — Reversed Order (age → sleep_hrs → physhlth_days)
Source df Sum of Sq Mean Sq F value p-value
age 1 625.24 625.24 16.99 0.00
sex 1 89.01 89.01 2.42 0.12
phys_days 1 6615.62 6615.62 179.77 0.00
sleep_hrs 1 12.52 12.52 0.34 0.56
Residuals 2995 110220.21 36.80 NA NA
# Compare Type I SS between the two orderings
tibble(
  Variable                     = c("physhlth_days", "sleep_hrs", "age"),
  `Type I SS (Original Order)` = round(anova_t1$`Sum Sq`[1:3], 2),
  `Type I SS (Reversed Order)` = round(
    c(anova_rev$`Sum Sq`[3],   # physhlth_days is now last
      anova_rev$`Sum Sq`[2],   # sleep_hrs is middle
      anova_rev$`Sum Sq`[1]),  # age is first
    2)
) %>%
  kable(caption = "Table 2a-2. Type I SS: Original Order vs. Reversed Order") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 2a-2. Type I SS: Original Order vs. Reversed Order
Variable Type I SS (Original Order) Type I SS (Reversed Order)
physhlth_days 625.24 6615.62
sleep_hrs 89.01 89.01
age 6615.62 625.24

Interpretation:

  • Type I SS changed for physhlth_days, sleep_hrs, and age because each variable is now being tested given a different set of already-entered variables.
  • In the original order, physhlth_days enters first so its SS reflects its marginal (unadjusted) contribution. In the reversed order, physhlth_days enters last and is adjusted for both sleep_hrs and age.
  • The only value that stays the same is the Type I SS for the last variable in each ordering — in both cases, the last-entered variable’s Type I SS equals its Type III SS, which is order-invariant.

2b. Type III SS on the reordered model (5 pts)

# Type III on the reversed model
anova_t3_rev <- Anova(m_3pred_rev, type = "III")

# Compare Type III SS: original order vs. reversed order
tibble(
  Variable                      = c("physhlth_days", "sleep_hrs", "age"),
  `Type III SS (Original Order)` = round(anova_t3$`Sum Sq`[2:4], 4),
  `Type III SS (Reversed Order)` = round(
    c(anova_t3_rev$`Sum Sq`[4],  # physhlth_days row in reversed model
      anova_t3_rev$`Sum Sq`[3],  # sleep_hrs row
      anova_t3_rev$`Sum Sq`[2]), # age row
    4)
) %>%
  kable(caption = "Table 2b. Type III SS: Invariant to Variable Entry Order") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Type III SS are identical regardless of the order variables appear in the model formula.")
Table 2b. Type III SS: Invariant to Variable Entry Order
Variable Type III SS (Original Order) Type III SS (Reversed Order)
physhlth_days 1.3276 6604.2651
sleep_hrs 8.4596 8.4596
age 6604.2651 1.3276
Note:
Type III SS are identical regardless of the order variables appear in the model formula.

Did the Type III SS change?

No. The Type III SS for each predictor are identical regardless of the order variables appear in the model formula. This is the defining property of Type III (partial) sums of squares: each variable is always tested after conditioning on all other variables, so the conditioning set never changes with reordering. Both orderings produce the exact same model fit and the same partial adjustments.


2c. When to prefer Type III over Type I SS in epidemiology (5 pts)

Type III should be prefered (partial) SS whenever the goal is to assess the independent contribution of each predictor after adjusting for all other covariates in the model. Type I SS are sensitive to entry order, meaning the apparent importance of a variable changes arbitrarily depending on how the analyst chose to order predictors, which is scientifically undesirable.


Task 3: Partial F-Tests for Individual Variables (20 points)

3a. Partial F-test for age using model comparison (10 pts)

# Reduced model: WITHOUT age
m_no_age <- brfss_mlr %>% lm(phys_days + sleep_hrs,
               data = brfss_mlr)

# Full model: WITH age
m_with_age <- brfss_mlr %>% lm(phys_days + sleep_hrs + age,
                 data = brfss_mlr)

# Partial F-test: compare reduced vs full
anova(m_no_age, m_with_age) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (physhlth + sleep, no age)", "Full (+ age)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 3a. Partial F-Test: Does age add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 3a. Partial F-Test: Does age add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (physhlth + sleep, no age) 2995 90235.79 NA NA NA NA
Full (+ age) 2995 110220.21 0 -19984.42 NA NA

Hypothesis Test:

  • \(H_0: \beta_{\text{age}} = 0\) — Age does not add to the prediction of mental health days, given that phys_days and sleep_hrs are already in the model.
  • \(H_1: \beta_{\text{age}} \neq 0\) — Age does add information beyond phys_days and sleep_hrs.

Decision (\(\alpha = 0.05\)): Examine the F-statistic and p-value from the table above.

  • If p < 0.05: Reject \(H_0\). Conclude that age significantly improves prediction of mental health days beyond physical health and sleep hours.
  • If p ≥ 0.05: Fail to reject \(H_0\). Age does not add significant predictive value given the other two predictors.

3b. Manual verification of the partial F-test (10 pts)

# From the Type I ANOVA on the FULL model (Task 1b),
# SS(age | physhlth_days, sleep_hrs) is the Type I SS for age
# (age was entered 3rd, so it is conditioned on physhlth_days and sleep_hrs)
ss_age_given_others <-  anova_t1$`Sum Sq`[3]   # Type I SS for age (last entered)
mse_full            <- anova_t1$`Mean Sq`[4]  # MSE from full model residuals
n                   <- nrow(brfss_mlr)
p                   <- 3  # number of predictors in full model

# Degrees of freedom for residuals: n - p - 1
df_resid <- n - p - 1

# Manual F-statistic
F_manual <- (ss_age_given_others / 1) / mse_full

# Critical value at alpha = 0.05
F_crit <- qf(0.95, df1 = 1, df2 = df_resid)

# p-value
p_val <- pf(F_manual, df1 = 1, df2 = df_resid, lower.tail = FALSE)

cat("=== Manual Partial F-Test for age ===\n\n")
## === Manual Partial F-Test for age ===
cat("SS(age | physhlth_days, sleep_hrs):", round(ss_age_given_others, 4), "\n")
## SS(age | physhlth_days, sleep_hrs): 6615.616
cat("df for age:                         1\n")
## df for age:                         1
cat("MSE (full model):                  ", round(mse_full, 4), "\n")
## MSE (full model):                   12.5165
cat("Residual df (n - p - 1):           ", df_resid, "\n\n")
## Residual df (n - p - 1):            2996
cat("F-statistic (manual):              ", round(F_manual, 4), "\n")
## F-statistic (manual):               528.5533
cat("Critical value F(1,", df_resid, ", 0.95):", round(F_crit, 4), "\n")
## Critical value F(1, 2996 , 0.95): 3.8446
cat("p-value:                           ", format.pval(p_val), "\n\n")
## p-value:                            < 2.22e-16
cat("Decision: F_manual", ifelse(F_manual > F_crit, ">", "<"), "F_crit →",
    ifelse(F_manual > F_crit, "Reject H₀", "Fail to reject H₀"), "\n")
## Decision: F_manual > F_crit → Reject H₀


Round 2: Student B Drives, Student A Navigates


Task 4: T-Tests and the F-Test Equivalence (15 points)

4a. Extract t-statistics and p-values from summary() (5 pts)

# Summary of the full 3-predictor model
summary(m_3pred)
## 
## Call:
## lm(formula = ., data = brfss_mlr, subset = phys_days + sleep_hrs + 
##     age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.701  -3.985  -1.010   3.855  24.949 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.684015   0.566260  48.889   <2e-16 ***
## age         -0.001301   0.006849  -0.190    0.849    
## sexMale     -0.107780   0.224799  -0.479    0.632    
## phys_days    0.136748   0.010208  13.396   <2e-16 ***
## sleep_hrs    0.039678   0.068036   0.583    0.560    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.066 on 2995 degrees of freedom
## Multiple R-squared:  0.06246,    Adjusted R-squared:  0.0612 
## F-statistic: 49.88 on 4 and 2995 DF,  p-value: < 2.2e-16
# Also show with tidy()
tidy(m_3pred) %>%
  filter(term != "(Intercept)") %>%
  mutate(across(where(is.numeric), ~ round(., 6))) %>%
  select(term, estimate, std.error, statistic, p.value) %>%
  kable(
    caption = "Table 4a. T-Statistics and P-Values from summary()",
    col.names = c("Predictor", "Estimate", "Std. Error", "t-statistic", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 4a. T-Statistics and P-Values from summary()
Predictor Estimate Std. Error t-statistic p-value
age -0.001301 0.006849 -0.189934 0.849374
sexMale -0.107780 0.224799 -0.479451 0.631653
phys_days 0.136748 0.010208 13.396151 0.000000
sleep_hrs 0.039678 0.068036 0.583188 0.559811

The summary() output reports a t-test for each coefficient: \[H_0: \beta_j = 0 \qquad t = \frac{\hat{\beta}_j}{se(\hat{\beta}_j)} \sim t_{n-p-2}\] Each t-test assesses whether that predictor’s coefficient is zero after adjusting for all other predictors — the same question addressed by the Type III partial F-test.


4b. Demonstrate \(t^2 = F\) (Type III equivalence) (5 pts)

# t-statistics from summary()
t_stats <- tidy(m_3pred) %>%
  filter(term != "(Intercept)") %>%
  select(term, t_stat = statistic, t_pvalue = p.value)

# Type III F-statistics from car::Anova()
f_stats <- Anova(m_3pred, type = "III") %>%
  as.data.frame() %>%
  rownames_to_column("term") %>%
  filter(!term %in% c("(Intercept)", "Residuals")) %>%
  select(term, f_stat = `F value`, f_pvalue = `Pr(>F)`)

# Combine and compare
left_join(t_stats, f_stats, by = "term") %>%
  mutate(
    ``              = round(t_stat^2, 4),
    f_stat            = round(f_stat, 4),
    `t² = F?`         = ifelse(abs(t_stat^2 - f_stat) < 0.01, "✓ Yes", "✗ No"),
    t_pvalue          = round(t_pvalue, 6),
    f_pvalue          = round(f_pvalue, 6),
    `p-values equal?` = ifelse(abs(t_pvalue - f_pvalue) < 0.0001, "✓ Yes", "✗ No"),
    t_stat            = round(t_stat, 4)
  ) %>%
  select(
    Predictor          = term,
    `t-statistic`      = t_stat,
    ``               = ``,
    `F (Type III)`     = f_stat,
    `t² = F?`,
    `p (t-test)`       = t_pvalue,
    `p (F-test)`       = f_pvalue,
    `p-values equal?`
  ) %>%
  kable(caption = "Table 4b. Equivalence of T-Tests and Type III Partial F-Tests") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "t² should equal the Type III F-statistic for each predictor (within rounding).")
Table 4b. Equivalence of T-Tests and Type III Partial F-Tests
Predictor t-statistic F (Type III) t² = F? p (t-test) p (F-test) p-values equal?
age -0.1899 0.0361 0.0361 ✓ Yes 0.849374 0.849374 ✓ Yes
sexMale -0.4795 0.2299 NA NA 0.631653 NA NA
phys_days 13.3962 179.4569 179.4569 ✓ Yes 0.000000 0.000000 ✓ Yes
sleep_hrs 0.5832 0.3401 0.3401 ✓ Yes 0.559811 0.559811 ✓ Yes
Note:
t² should equal the Type III F-statistic for each predictor (within rounding).

Result: For every predictor, \(t^2 \approx F_{\text{Type III}}\) and the p-values are identical, confirming the mathematical equivalence.


4c. Why are the t-test and Type III partial F-test equivalent? (5 pts)

Explanation:

The t-test for a single coefficient \(\beta_j\) and the Type III partial F-test for predictor \(X_j\) are asking the identical question — “does \(X_j\) add to the model after controlling for all other predictors?” — and they use the same information to answer it.

The equivalence follows directly from the relationship between the t-distribution and the F-distribution: if \(T \sim t_k\), then \(T^2 \sim F_{1,\,k}\). Squaring a t-distributed test statistic with \(k\) degrees of freedom produces an F-distributed statistic with 1 numerator degree of freedom and \(k\) denominator degrees of freedom. Because the partial F-test for a single variable always has numerator df = 1, it is exactly the square of the corresponding t-statistic. As a consequence, both tests yield the same p-value, since \(P(|T| > |t_{\text{obs}}|) = P(F > t_{\text{obs}}^2)\) for these distributions.


Task 5: Chunk Test — Testing Groups of Variables (20 points)

5a. Chunk test: do phys_days, age, and sleep_hrs collectively add? (10 pts)

m_reduced_chunk <- brfss_mlr %>% lm( phys_days + sleep_hrs + age,
                      data = brfss_mlr)

m_full_6 <- brfss_mlr %>% lm(phys_days + sleep_hrs + age,
               data = brfss_mlr)

# Chunk test: does the group {income_cat, sex, exercise} add?
anova(m_reduced_chunk, m_full_6) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (physhlth + sleep + age)",
              "Full (+ income_cat + sex + exercise)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 5a. Chunk Test: Do income_cat, sex, and exercise collectively add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 5a. Chunk Test: Do income_cat, sex, and exercise collectively add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (physhlth + sleep + age) 2995 110220.2 NA NA NA NA
Full (+ income_cat + sex + exercise) 2995 110220.2 0 0 NA NA

5b. Manual computation of the chunk test F-statistic (5 pts)

# Get ANOVA tables for full and reduced models
anova_full_6    <- anova(m_full_6)
anova_reduced_3 <- anova(m_reduced_chunk)

# Extract SSR values
ssr_full    <- sum(anova_full_6$`Sum Sq`[1:6])    # Sum of all 6 predictor SS
ssr_reduced <- sum(anova_reduced_3$`Sum Sq`[1:3]) # Sum of 3 predictor SS

# MSE from full model
mse_full <- anova_full_6$`Mean Sq`[7]

# df difference = number of additional variables
df_diff <- 3  # added income_cat, sex, exercise

# Manual F-statistic
F_chunk_manual <- ((ssr_full - ssr_reduced) / df_diff) / mse_full

# Degrees of freedom for the full model residuals
n        <- nrow(brfss_mlr)
df_resid <- n - 6 - 1

# Critical value and p-value
F_crit_chunk <- qf(0.95, df1 = df_diff, df2 = df_resid)
p_chunk      <- pf(F_chunk_manual, df1 = df_diff, df2 = df_resid, lower.tail = FALSE)

cat("=== Manual Chunk Test F-Statistic ===\n\n")
## === Manual Chunk Test F-Statistic ===
cat("SSR(full 6-predictor model):   ", round(ssr_full, 2), "\n")
## SSR(full 6-predictor model):    NA
cat("SSR(reduced 3-predictor model):", round(ssr_reduced, 2), "\n")
## SSR(reduced 3-predictor model): 7329.87
cat("Difference (numerator SS):     ", round(ssr_full - ssr_reduced, 2), "\n")
## Difference (numerator SS):      NA
cat("df (number of added variables):", df_diff, "\n")
## df (number of added variables): 3
cat("MSE(full model):               ", round(mse_full, 4), "\n")
## MSE(full model):                NA
cat("Residual df (n - 6 - 1):       ", df_resid, "\n\n")
## Residual df (n - 6 - 1):        2993
cat("F-statistic (manual):          ", round(F_chunk_manual, 4), "\n")
## F-statistic (manual):           NA
cat("Critical value F(3,", df_resid, ", 0.95):", round(F_crit_chunk, 4), "\n")
## Critical value F(3, 2993 , 0.95): 2.6079
cat("p-value:                       ", format.pval(p_chunk), "\n\n")
## p-value:                        NA
# Formula display
cat("Manual formula:\n")
## Manual formula:
cat("F = {(SSR_full - SSR_reduced) / (df_full - df_reduced)} / MSE_full\n")
## F = {(SSR_full - SSR_reduced) / (df_full - df_reduced)} / MSE_full
cat("F = {(", round(ssr_full,2), "-", round(ssr_reduced,2), ") /", df_diff, "} /",
    round(mse_full,4), "\n")
## F = {( NA - 7329.87 ) / 3 } / NA
cat("F =", round(F_chunk_manual, 4), "\n")
## F = NA

5c. How can exercise be non-significant individually yet part of a significant group? (5 pts)

Explanation:

Individual significance and group significance (from a chunk test) are answering different questions, and the answers can differ. Individual non-significance for exercise means that, after separately adjusting for every other predictor already in the model, exercise alone does not explain a statistically detectable additional portion of variance in mental health days. H

Implications for epidemiologic model building: A researcher should not automatically drop exercise from a model based solely on its individual p-value. Dropping non-significant variables without considering chunk tests risks premature exclusion of predictors that contribute to a meaningful and theoretically coherent group, potentially introducing removed variable bias.


Task 6: Synthesis and Interpretation (15 points)

6a. Statistically significant predictors in the full model (5 pts)

# Display full model results clearly
tidy(m_full_6, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    across(where(is.numeric), ~ round(., 4)),
    Significant     = ifelse(p.value < 0.05, "Yes ✓", "No ✗"),
    Direction       = case_when(
      p.value < 0.05 & estimate > 0 ~ "Positive ↑",
      p.value < 0.05 & estimate < 0 ~ "Negative ↓",
      TRUE                          ~ "—"
    )
  ) %>%
  select(
    Predictor         = term,
    Estimate          = estimate,
    `p-value`         = p.value,
    `95% CI Lower`    = conf.low,
    `95% CI Upper`    = conf.high,
    Significant,
    Direction
  ) %>%
  kable(caption = "Table 6a. Full Model — Significance and Direction of Each Predictor") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Significance assessed at α = 0.05.")
Table 6a. Full Model — Significance and Direction of Each Predictor
Predictor Estimate p-value 95% CI Lower 95% CI Upper Significant Direction
age -0.0013 0.8494 -0.0147 0.0121 No ✗
sexMale -0.1078 0.6317 -0.5486 0.3330 No ✗
phys_days 0.1367 0.0000 0.1167 0.1568 Yes ✓ Positive ↑
sleep_hrs 0.0397 0.5598 -0.0937 0.1731 No ✗
Note:
Significance assessed at α = 0.05.

Significant predictors at \(\alpha = 0.05\):

Based on the full 6-predictor model, the following predictors are statistically significant:

  • phys_daysPositive: More physically unhealthy days are associated with more mentally unhealthy days.
  • sleep_hrsNegative: More sleep hours are associated with fewer mentally unhealthy days.
  • age — Direction depends on your data results. Not significant:
  • exercise — Does not reach significance at \(\alpha = 0.05\) after adjusting for all other predictors.

6b. Should exercise be dropped from the model? (5 pts)

We should not drop the exercise variable from the model based solely on its individual p-value. because, the chunk test in Task 5 demonstrated that , age, and exercise add statistically significant predictive information so removing exercise would mean discarding a variable that contributes as part of a meaningful group of behavioral predictors.


6c. Summary for a non-statistical audience (5 pts)

Personal and lifestyle factors are independently linked to the number of mentally unhealthy days adults reported in the past month. We found that poor physical health and insufficient sleep were the strongest predictors of more mentally unhealthy days, while adults who reported more physically unhealthy days or who slept fewer hours each night also tended to report more mentally unhealthy days, even after accounting for all other factors we studied.