Q632: Assignment 7 Appendix

Install packages and load dataset.

# knitted document: http://rpubs.com/jasohosk/1426671

# Load libraries
packages <- c("tidyverse", "psych", "gmodels", "lattice", "lme4", "lmerTest")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)

# Import dataset
url <- "https://github.com/JasonMHoskin/q632-assignment-7/raw/refs/heads/main/data_assn7.rda"
download.file(url, destfile = "file.RDA", mode = "wb")
load("file.RDA")
df1 <- df

# Remove unnecessary variables
rm(installed, packages, url, df)

Part A

Standardize ‘bmi’. Check for missing data across all the variables. If there are, report the number of missing data by variable. Delete cases with missing data.

df1$bmi_z <- as.numeric(scale(df1$bmi))
colSums(is.na(df1))

##    bmi    sex   race    edu health    aag    msg  state  bmi_z 
##   6129    244      0    466      0      0      0      0   6129

# Conduct listwise deletion
df2 <- na.omit(df1)
# Confirm NAs are removed
colSums(is.na(df2))

##    bmi    sex   race    edu health    aag    msg  state  bmi_z 
##      0      0      0      0      0      0      0      0      0

Rename ‘sex’ as ‘female’. Code male as 0 and female as 1. Designate ‘female’, ‘race’, and ‘edu’ as factor (categorical) variables and ‘health’ as a numeric (continuous) variable.

df3 <- df2 |>
  mutate(female = case_when(sex == "1.male" ~ 0, sex == "2.female" ~ 1))

# Factor 'female', 'race', 'edu'
df3$female <- as.factor(df3$female)
df3$race <- as.factor(df3$race)
df3$edu <- as.factor(df3$edu)

# Designate 'health' as a numeric variable
df3$health <- as.numeric(df3$health)

# Confirm dataframe classes
sapply(df3, class)

##         bmi         sex        race         edu      health         aag 
##   "numeric" "character"    "factor"    "factor"   "numeric"   "numeric" 
##         msg       state       bmi_z      female 
##   "numeric"   "integer"   "numeric"    "factor"

Draw histogram of DV (z-BMI) and visually evaluate if there are potential outliers. Then check for univariate outliers (use criterion: z = ±3.3). If there are, delete the cases with univariate outliers. After deleting outliers, draw a histogram of z-BMI with a normal curve superimposed to check improvement of data fit to the normality.

# Draw a histogram of bmi_z scores
ggplot(df3, aes(x = bmi_z)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.5,
    fill = "#00BE67",
    color = "black"
  ) +
  stat_function(fun = dnorm, args = list(mean = mean(df3$bmi_z), sd = sd(df3$bmi_z))) +
  labs(title = "Histogram of scaled BMI scores")

# Univariate outlier test for bmi_z
df3$outlier <- ifelse(df3$bmi_z > 3.3 | df3$bmi_z < -3.3,
                      1,
                      ifelse(df3$bmi_z <= 3.3 &
                               df3$bmi_z >= -3.3, 0, NA))

# Calculate number of outliers
df3 |>
  filter(outlier == 1) |>
  summarize(freq_n = n())

##   freq_n
## 1    514

# Remove outliers and simplify dataset
df4 <- df3 |>
  filter(!(outlier == 1)) |>
  dplyr::select(bmi_z, race, edu, health, aag, msg, state, female)

# Draw new histogram after removal of outliers
ggplot(df4, aes(x = bmi_z)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.5,
    fill = "#00BE67",
    color = "black"
  ) +
  stat_function(fun = dnorm, args = list(mean = mean(df4$bmi_z), sd = sd(df4$bmi_z))) +
  labs(title = "Histogram of scaled BMI scores after outlier removal")

Report frequencies and proportions for each individual-level factor IV and create dot plots of z-BMI by ‘female’,‘race’, and ‘edu’. Then, test mean differences of z-BMI between different levels of these three factor variables using either t-test or ANOVA. Interpret the results.

# Run frequencies and proportions for 'female','race', 'edu'
CrossTable(df4$female)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##           |         0 |         1 | 
##           |-----------|-----------|
##           |     28675 |     30877 | 
##           |     0.482 |     0.518 | 
##           |-----------|-----------|
## 
## 
## 
##

CrossTable(df4$race)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##            |    1.white |    2.black | 3.hispanic |   4.others | 
##            |------------|------------|------------|------------|
##            |      44627 |       5530 |       4750 |       4645 | 
##            |      0.749 |      0.093 |      0.080 |      0.078 | 
##            |------------|------------|------------|------------|
## 
## 
## 
##

CrossTable(df4$edu)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##                 |           0.<HS |            1.HS |  2.col attended | 3.col graduated | 
##                 |-----------------|-----------------|-----------------|-----------------|
##                 |            4178 |           14618 |           15526 |           25230 | 
##                 |           0.070 |           0.245 |           0.261 |           0.424 | 
##                 |-----------------|-----------------|-----------------|-----------------|
## 
## 
## 
##

# Create dot plot of 'bmi_z' by 'female"
dotplot(
  female ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Sex",
  main = "Dotplot of sex"
)

# Create dot plot of 'bmi_z' by 'race'
dotplot(
  race ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Race",
  main = "Dotplot of race"
)

# Create dot plot of 'bmi_z' by 'edu'
dotplot(
  edu ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Education",
  main = "Dotplot of education"
)

# Test mean differences using anova
t.test(bmi_z ~ female, data = df4, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  bmi_z by female
## t = 13.988, df = 59550, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.08841578 0.11723048
## sample estimates:
## mean in group 0 mean in group 1 
##      0.01536388     -0.08745924

anova((lm(bmi_z ~ race, data = df4)))

## Analysis of Variance Table
## 
## Response: bmi_z
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## race          3    533 177.664   222.9 < 2.2e-16 ***
## Residuals 59548  47463   0.797                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova((lm(bmi_z ~ edu, data = df4)))

## Analysis of Variance Table
## 
## Response: bmi_z
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## edu           3    901 300.377  379.81 < 2.2e-16 ***
## Residuals 59548  47095   0.791                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Draw a scatterplot between z-BMI and ‘health’ (y = z-BMI and x = ‘health’) by state with prediction lines for each sex for the first six states (Hint: There are six states coded less than 9). Interpret the plots.

df4 |>
  filter(state < 9) |>
  ggplot(aes(
    x = health,
    y = bmi_z,
    color = factor(female)
  )) +
  geom_jitter(
    width = .2,
    height = 0,
    alpha = 0.15,
    size = 1
  ) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~ state) +
  theme_bw() +
  scale_color_manual(
    values = c("0" = "#2C7FB8", "1" = "#E8743B"),
    labels = c("0" = "Male", "1" = "Female"),
    name = "Sex"
  ) +
  labs(
    title = "z-BMI vs. Self-Reported Health by State",
    subtitle = "Linear prediction lines by sex, first six states",
    x = "Self-reported health (1 = excellent, 5 = poor)",
    y = "z-BMI"
  ) +
  theme(legend.position = "bottom")

## `geom_smooth()` using formula = 'y ~ x'

Report the number of state-level units (i.e., the frequency of ‘state’), and the minimum and maximum cell size of the ‘state’ (i.e., the smallest and the largest number of state residents). [Although we already know we have enough numbers of level-2 units and huge sample sizes in this specific dataset, you are expected to check the frequency of level-2 units and their cell sizes for a typical MLM analysis].

df4 |>
  count(state) |>
  summarise(n_states = n(),
            min_n = min(n),
            max_n = max(n))

##   n_states min_n max_n
## 1       51   376  5202

Part B

Fit a cross-sectional two-level multilevel model without predictors (i.e., null model or Model m0) and interpret the results. Make sure you calculate and interpret intraclass correlation coefficient (ICC) without using a package.

# Fit null model
m0 <- lme4::lmer(bmi_z ~ 1 + (1 | state), data = df4)
summary(m0)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ 1 + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 155728.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4362 -0.7095 -0.1469  0.5441  3.8395 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.008912 0.0944  
##  Residual             0.798409 0.8935  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -0.03356    0.01385  -2.424

# Calculate ICC manually
0.008912 / (0.008912 + 0.798409)

## [1] 0.01103898

# Confirm results using 'performance' package
performance::icc(m0)

## # Intraclass Correlation Coefficient
## 
##     Adjusted ICC: 0.011
##   Unadjusted ICC: 0.011

Do the grand-mean centering for the variable ‘health’ and name it as ‘Chealth’). Fit a random-coefficients regression model with only ‘Chealth’ as the predictor (Model m1). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m0 and m1 models.

# Grand mean centering of 'health'
df4$Chealth <- df4$health - mean(df4$health)

# Fit random-coefficients model
m1 <- lme4::lmer(bmi_z ~ Chealth + (1 | state), data = df4)
summary(m1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 151458.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9512 -0.6753 -0.1241  0.5516  4.3643 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.005554 0.07452 
##  Residual             0.743282 0.86214 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -0.035848   0.011162  -3.212
## Chealth      0.217502   0.003264  66.640
## 
## Correlation of Fixed Effects:
##         (Intr)
## Chealth -0.003

# Compute 95% profile confidence intervals
confint(m1, method = c("profile"))

## Computing profile confidence intervals ...

##                   2.5 %      97.5 %
## .sig01       0.05931246  0.09300225
## .sigma       0.85725557  0.86705198
## (Intercept) -0.05794233 -0.01378489
## Chealth      0.21111232  0.22391196

# Compare model fit between m0 and m1
anova(m0, m1)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m0: bmi_z ~ 1 + (1 | state)
## m1: bmi_z ~ Chealth + (1 | state)
##    npar    AIC    BIC logLik -2*log(L) Chisq Df Pr(>Chisq)    
## m0    3 155728 155755 -77861    155722                        
## m1    4 151450 151486 -75721    151442  4280  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the individual-level (level-1) predictors (i.e., ‘Chealth’, ‘female’, ‘race’, and ‘edu’) (Model m2). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m1 and m2 models.

# Conduct model fitting
m2 <- lme4::lmer(bmi_z ~ Chealth + female + race + edu + (1 | state), data = df4)
summary(m2)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 150496.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0734 -0.6729 -0.1295  0.5418  4.2823 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.004516 0.0672  
##  Residual             0.730944 0.8550  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)        -0.081923   0.017586  -4.658
## Chealth             0.206993   0.003412  60.668
## female1            -0.108255   0.007048 -15.359
## race2.black         0.259984   0.012890  20.169
## race3.hispanic      0.041393   0.014108   2.934
## race4.others       -0.064772   0.013802  -4.693
## edu1.HS             0.114337   0.015475   7.389
## edu2.col attended   0.160948   0.015555  10.347
## edu3.col graduated  0.022390   0.015343   1.459
## 
## Correlation of Fixed Effects:
##             (Intr) Chelth femal1 rc2.bl rc3.hs rc4.th ed1.HS ed2.ca
## Chealth     -0.131                                                 
## female1     -0.182 -0.020                                          
## race2.black -0.099 -0.047 -0.024                                   
## race3.hspnc -0.257 -0.043  0.000  0.109                            
## race4.othrs -0.101 -0.060  0.012  0.099  0.129                     
## edu1.HS     -0.716  0.104  0.001  0.002  0.193  0.023              
## ed2.clattnd -0.720  0.143 -0.038  0.022  0.218  0.024  0.796       
## ed3.clgrdtd -0.749  0.238 -0.045  0.044  0.236  0.027  0.820  0.834

# Calculate profile confidence intervals
confint(m2, method = c("profile"))

## Computing profile confidence intervals ...

##                           2.5 %      97.5 %
## .sig01              0.052900782  0.08435017
## .sigma              0.850061684  0.85977633
## (Intercept)        -0.116358628 -0.04750581
## Chealth             0.200314011  0.21369272
## female1            -0.122060417 -0.09443147
## race2.black         0.234703328  0.28522563
## race3.hispanic      0.013539627  0.06901182
## race4.others       -0.091927415 -0.03776165
## edu1.HS             0.084003011  0.14465944
## edu2.col attended   0.130452243  0.19142265
## edu3.col graduated -0.007725382  0.05242989

# Compare model fit between m1 and m2
anova(m1, m2)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m1: bmi_z ~ Chealth + (1 | state)
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m1    4 151450 151486 -75721    151442                         
## m2   11 150452 150550 -75215    150430 1012.1  7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with only the state-level (level-2) predictors (i.e.,‘aag’ and ‘msg’) (Model m3). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m2 and m3 models.

# Conduct model fitting
m3 <- lme4::lmer(bmi_z ~ aag + msg + (1 | state), data = df4)
summary(m3)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ aag + msg + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 155696.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4366 -0.7102 -0.1467  0.5434  3.8406 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.002732 0.05227 
##  Residual             0.798409 0.89354 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.734878   0.082819   8.873
## aag         -0.009595   0.002674  -3.588
## msg         -0.009547   0.004255  -2.244
## 
## Correlation of Fixed Effects:
##     (Intr) aag   
## aag -0.458       
## msg -0.156 -0.805

# Calculate profile confidence intervals
confint(m3, method = c("profile"))

## Computing profile confidence intervals ...

##                   2.5 %       97.5 %
## .sig01       0.03875176  0.065242747
## .sigma       0.88848455  0.898637766
## (Intercept)  0.57431779  0.895310618
## aag         -0.01477491 -0.004408447
## msg         -0.01779655 -0.001304096

# Compare model fit between m2 and m3
anova(m2, m3)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m3: bmi_z ~ aag + msg + (1 | state)
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m3    5 155678 155723 -77834    155668                         
## m2   11 150452 150550 -75215    150430 5238.9  6  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the level-1 and level-2 predictors and an interaction between ‘female’ and ‘race’ (Model m4). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m2 and m4 models.

# Conduct model fitting
m4 <- lme4::lmer(bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (1 | state), data = df4)
summary(m4)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 150178.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1122 -0.6693 -0.1328  0.5416  4.2656 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.001922 0.04384 
##  Residual             0.727060 0.85268 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                         Estimate Std. Error t value
## (Intercept)             0.436763   0.073499   5.942
## Chealth                 0.205538   0.003404  60.385
## aag                    -0.003475   0.002330  -1.492
## msg                    -0.010679   0.003711  -2.878
## female1                -0.164610   0.008105 -20.308
## race2.black             0.020163   0.018650   1.081
## race3.hispanic         -0.027083   0.019059  -1.421
## race4.others           -0.101261   0.018862  -5.369
## edu1.HS                 0.113210   0.015431   7.336
## edu2.col attended       0.159068   0.015510  10.256
## edu3.col graduated      0.020050   0.015302   1.310
## female1:race2.black     0.429358   0.024484  17.536
## female1:race3.hispanic  0.139295   0.026050   5.347
## female1:race4.others    0.074050   0.026310   2.815

## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

# Compute 95% profile confidence intervals
confint(m4, method = c("profile"))

## Computing profile confidence intervals ...

##                               2.5 %       97.5 %
## .sig01                  0.031591431  0.055340411
## .sigma                  0.847778528  0.857467005
## (Intercept)             0.294435955  0.579009995
## Chealth                 0.198875801  0.212217348
## aag                    -0.007983798  0.001042552
## msg                    -0.017873485 -0.003498097
## female1                -0.180478591 -0.148708222
## race2.black            -0.016653781  0.056491131
## race3.hispanic         -0.064666993  0.010070029
## race4.others           -0.138390599 -0.064456890
## edu1.HS                 0.083014535  0.143499982
## edu2.col attended       0.128720362  0.189516013
## edu3.col graduated     -0.009942108  0.050032155
## female1:race2.black     0.381385403  0.477351717
## female1:race3.hispanic  0.088224611  0.190329414
## female1:race4.others    0.022509419  0.125633973

# Compare model fit between m2 and m4
anova(m2, m4)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
## m4: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (1 | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m2   11 150452 150550 -75215    150430                         
## m4   16 150105 150249 -75036    150073 356.62  5  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the variables you entered in Model m4 and add a cross-level interaction between ‘female’ and ‘aag’ (Model m5). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m4 and m5 models.

# Conduct model
m5 <- lme4::lmer(
  bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + female * aag + (1 | state), data = df4)
summary(m5)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + female * aag + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 150182.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1058 -0.6679 -0.1326  0.5415  4.2658 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.001927 0.04389 
##  Residual             0.726989 0.85264 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                         Estimate Std. Error t value
## (Intercept)             0.333091   0.083682   3.980
## Chealth                 0.205509   0.003404  60.379
## aag                    -0.001464   0.002457  -0.596
## msg                    -0.010676   0.003714  -2.874
## female1                 0.034045   0.076879   0.443
## race2.black             0.024021   0.018708   1.284
## race3.hispanic         -0.030517   0.019105  -1.597
## race4.others           -0.103878   0.018888  -5.500
## edu1.HS                 0.113138   0.015431   7.332
## edu2.col attended       0.159175   0.015510  10.263
## edu3.col graduated      0.020042   0.015301   1.310
## female1:race2.black     0.421823   0.024654  17.110
## female1:race3.hispanic  0.145927   0.026173   5.575
## female1:race4.others    0.079223   0.026384   3.003
## aag:female1            -0.003865   0.001487  -2.598

## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

# Calculate profile confidence intervals
confint(m5, method = c("profile"))

## Computing profile confidence intervals ...

##                               2.5 %        97.5 %
## .sig01                  0.031641926  0.0554033176
## .sigma                  0.847729804  0.8574177252
## (Intercept)             0.171143469  0.4951172742
## Chealth                 0.198846933  0.2121877856
## aag                    -0.006218845  0.0032956259
## msg                    -0.017876329 -0.0034883794
## female1                -0.116689012  0.1846403865
## race2.black            -0.012913622  0.0604600637
## race3.hispanic         -0.068188629  0.0067244005
## race4.others           -0.141059461 -0.0670231238
## edu1.HS                 0.082944654  0.1434268961
## edu2.col attended       0.128828522  0.1896210630
## edu3.col graduated     -0.009947625  0.0500232874
## female1:race2.black     0.373520880  0.4701524733
## female1:race3.hispanic  0.094611948  0.1971997842
## female1:race4.others    0.027534917  0.1309482623
## aag:female1            -0.006778197 -0.0009483422

# Compare model fit between m4 and m5
anova(m4, m5)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m4: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (1 | state)
## m5: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + female * aag + (1 | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)   
## m4   16 150105 150249 -75036    150073                        
## m5   17 150100 150253 -75033    150066 6.7475  1   0.009388 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check if residuals are normally distributed for the final model by plotting a histogram with density curve.

hist(
  scale(resid(m5)),
  freq = FALSE,
  ylim = c(0, .5),
  xlim = c(-3.5, 3.5),
  main = "Histogram of Standardized Residuals from model m5",
  xlab = "Standardized Residuals"
)
lines(density(scale(resid(m5))))
box()

Q632: Assignment 7 Appendix

Jason Hoskin

2026-05-08

Part A

Part B