Q632: Assignment 7 Appendix

Install packages and load dataset.

# Load libraries
packages <- c("tidyverse", "psych", "gmodels", "lattice", "lme4", "lmerTest")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)

# Import dataset
url <- "https://github.com/JasonMHoskin/q632-assignment-7/raw/refs/heads/main/data_assn7.rda"
download.file(url, destfile = "file.RDA", mode = "wb")
load("file.RDA")
df1 <- df

# Remove unnecessary variables
rm(installed, packages, url, df)

Part A

Standardize ‘bmi’. Check for missing data across all the variables. If there are, report the number of missing data by variable. Delete cases with missing data.

df1$bmi_z <- as.numeric(scale(df1$bmi))
colSums(is.na(df1))

##    bmi    sex   race    edu health    aag    msg  state  bmi_z 
##   6129    244      0    466      0      0      0      0   6129

# Conduct listwise deletion
df2 <- na.omit(df1)
# Confirm NAs are removed
colSums(is.na(df2))

##    bmi    sex   race    edu health    aag    msg  state  bmi_z 
##      0      0      0      0      0      0      0      0      0

Rename ‘sex’ as ‘female’. Code male as 0 and female as 1. Designate ‘female’, ‘race’, and ‘edu’ as factor (categorical) variables and ‘health’ as a numeric (continuous) variable.

df3 <- df2 |>
  mutate(female = case_when(sex == "1.male" ~ 0, sex == "2.female" ~ 1))

# Factor 'female', 'race', 'edu'
df3$female <- as.factor(df3$female)
df3$race <- as.factor(df3$race)
df3$edu <- as.factor(df3$edu)

# Designate 'health' as a numeric variable
df3$health <- as.numeric(df3$health)

# Confirm dataframe classes
sapply(df3, class)

##         bmi         sex        race         edu      health         aag 
##   "numeric" "character"    "factor"    "factor"   "numeric"   "numeric" 
##         msg       state       bmi_z      female 
##   "numeric"   "integer"   "numeric"    "factor"

Draw histogram of DV (z-BMI) and visually evaluate if there are potential outliers. Then check for univariate outliers (use criterion: z = ±3.3). If there are, delete the cases with univariate outliers. After deleting outliers, draw a histogram of z-BMI with a normal curve superimposed to check improvement of data fit to the normality.

# Draw a histogram of bmi_z scores
ggplot(df3, aes(x = bmi_z)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.5,
    fill = "#00BE67",
    color = "black"
  ) +
  labs(title = "Histogram of scaled BMI scores")

# Univariate outlier test for bmi_z
df3$outlier <- ifelse(df3$bmi_z > 3.3 | df3$bmi_z < -3.3,
                      1,
                      ifelse(df3$bmi_z <= 3.3 &
                               df3$bmi_z >= -3.3, 0, NA))

# Calculate number of outliers
df3 |>
  filter(outlier == 1) |>
  summarize(freq_n = n())

##   freq_n
## 1    514

# Remove outliers and simplify dataset
df4 <- df3 |>
  filter(!(outlier == 1)) |>
  dplyr::select(bmi_z, race, edu, health, aag, msg, state, female)

# Draw new histogram after removal of outliers
ggplot(df4, aes(x = bmi_z)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.5,
    fill = "#00BE67",
    color = "black"
  ) +
  stat_function(fun = dnorm, args = list(mean = mean(df2$bmi_z), sd = sd(df2$bmi_z))) +
  labs(title = "Histogram of scaled BMI scores after outlier removal")

Report frequencies and proportions for each individual-level factor IV and create dot plots of z-BMI by ‘female’,‘race’, and ‘edu’. Then, test mean differences of z-BMI between different levels of these three factor variables using either t-test or ANOVA. Interpret the results.

# Run frequencies and proportions for 'female','race', 'edu'
CrossTable(df4$female)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##           |         0 |         1 | 
##           |-----------|-----------|
##           |     28675 |     30877 | 
##           |     0.482 |     0.518 | 
##           |-----------|-----------|
## 
## 
## 
##

CrossTable(df4$race)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##            |    1.white |    2.black | 3.hispanic |   4.others | 
##            |------------|------------|------------|------------|
##            |      44627 |       5530 |       4750 |       4645 | 
##            |      0.749 |      0.093 |      0.080 |      0.078 | 
##            |------------|------------|------------|------------|
## 
## 
## 
##

CrossTable(df4$edu)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  59552 
## 
##  
##                 |           0.<HS |            1.HS |  2.col attended | 3.col graduated | 
##                 |-----------------|-----------------|-----------------|-----------------|
##                 |            4178 |           14618 |           15526 |           25230 | 
##                 |           0.070 |           0.245 |           0.261 |           0.424 | 
##                 |-----------------|-----------------|-----------------|-----------------|
## 
## 
## 
##

# Create dot plot of 'bmi_z' by 'female"
dotplot(
  female ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Race",
  main = "Dotplot of sex"
)

# Create dot plot of 'bmi_z' by 'race'
dotplot(
  race ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Race",
  main = "Dotplot of race"
)

# Create dot plot of 'bmi_z' by 'edu'
dotplot(
  edu ~ bmi_z,
  data = df4,
  jitter.y = TRUE,
  ylab = "Race",
  main = "Dotplot of education"
)

# Test mean differences using anova
t.test(bmi_z ~ female, data = df4, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  bmi_z by female
## t = 13.988, df = 59550, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.08841578 0.11723048
## sample estimates:
## mean in group 0 mean in group 1 
##      0.01536388     -0.08745924

anova((lm(bmi_z ~ race, data = df4)))

## Analysis of Variance Table
## 
## Response: bmi_z
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## race          3    533 177.664   222.9 < 2.2e-16 ***
## Residuals 59548  47463   0.797                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova((lm(bmi_z ~ edu, data = df4)))

## Analysis of Variance Table
## 
## Response: bmi_z
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## edu           3    901 300.377  379.81 < 2.2e-16 ***
## Residuals 59548  47095   0.791                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Draw a scatterplot between z-BMI and ‘health’ (y = z-BMI and x = ‘health’) by state with prediction lines for each sex for the first six states (Hint: There are six states coded less than 9). Interpret the plots.

df4 |>
  filter(state %in% 1:9) |>
  ggplot(aes(
    x = health,
    y = bmi_z,
    color = factor(female)
  )) +
  geom_jitter(
    width = .2,
    height = 0,
    alpha = 0.15,
    size = 1
  ) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~ state) +
  theme_bw() +
  scale_color_manual(
    values = c("0" = "#2C7FB8", "1" = "#E8743B"),
    labels = c("0" = "Male", "1" = "Female"),
    name = "Sex"
  ) +
  labs(
    title = "z-BMI vs. Self-Reported Health by State",
    subtitle = "Linear prediction lines by sex, first six states",
    x = "Self-reported health (1 = excellent, 5 = poor)",
    y = "z-BMI"
  ) +
  theme(legend.position = "bottom")

## `geom_smooth()` using formula = 'y ~ x'

Report the number of state-level units (i.e., the frequency of ‘state’), and the minimum and maximum cell size of the ‘state’ (i.e., the smallest and the largest number of state residents). [Although we already know we have enough numbers of level-2 units and huge sample sizes in this specific dataset, you are expected to check the frequency of level-2 units and their cell sizes for a typical MLM analysis].

# Calculate the number of state-level units
df4 |>
  count(state) |>
  summarize(freq_n = n())

##   freq_n
## 1     51

# Calculate the minimum cell size
df4 |>
  count(state) |>
  summarise(min_n = min(n))

##   min_n
## 1   376

# Calculate the maximum cell size
df4 |>
  count(state) |>
  summarise(max_n = max(n))

##   max_n
## 1  5202

Part B

Fit a cross-sectional two-level multilevel model without predictors (i.e., null model or Model m0) and interpret the results. Make sure you calculate and interpret intraclass correlation coefficient (ICC) without using a package.

# Fit null model
m0 <- lme4::lmer(bmi_z ~ 1 + (1 | state), data = df4)
summary(m0)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ 1 + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 155728.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4362 -0.7095 -0.1469  0.5441  3.8395 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.008912 0.0944  
##  Residual             0.798409 0.8935  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -0.03356    0.01385  -2.424

m0.1 <- lmerTest::lmer(bmi_z ~ 1 + (1 | state), data = df4)
summary(m0.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ 1 + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 155728.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4362 -0.7095 -0.1469  0.5441  3.8395 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.008912 0.0944  
##  Residual             0.798409 0.8935  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept) -0.03356    0.01385 49.35874  -2.424   0.0191 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Calculate ICC manually
0.009 / (0.009 + 0.798)

## [1] 0.01115242

# Confirm results using 'performance' package
performance::icc(m0)

## # Intraclass Correlation Coefficient
## 
##     Adjusted ICC: 0.011
##   Unadjusted ICC: 0.011

Do the grand-mean centering for the variable ‘health’ and name it as ‘Chealth’). Fit a random-coefficients regression model with only ‘Chealth’ as the predictor (Model m1). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m0 and m1 models.

# Grand mean centering of 'health'
Chealth <- df4$health - mean(df4$health)

# Fit random-coefficients model
m1 <- lme4::lmer(bmi_z ~ Chealth + (1 | state), data = df4)
summary(m1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 151458.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9512 -0.6753 -0.1241  0.5516  4.3643 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.005554 0.07452 
##  Residual             0.743282 0.86214 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -0.035848   0.011162  -3.212
## Chealth      0.217502   0.003264  66.640
## 
## Correlation of Fixed Effects:
##         (Intr)
## Chealth -0.003

m1.1 <- lmerTest::lmer(bmi_z ~ Chealth + (1 | state), data = df4)
summary(m1.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ Chealth + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 151458.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9512 -0.6753 -0.1241  0.5516  4.3643 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.005554 0.07452 
##  Residual             0.743282 0.86214 
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept) -3.585e-02  1.116e-02  4.917e+01  -3.212  0.00233 ** 
## Chealth      2.175e-01  3.264e-03  5.951e+04  66.640  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## Chealth -0.003

# Compute 95% profile confidence intervals
confint(m1, method = c("Wald"))

##                  2.5 %      97.5 %
## .sig01              NA          NA
## .sigma              NA          NA
## (Intercept) -0.0577245 -0.01397093
## Chealth      0.2111054  0.22389943

# Compare model fit between m0 and m1
anova(m0, m1)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m0: bmi_z ~ 1 + (1 | state)
## m1: bmi_z ~ Chealth + (1 | state)
##    npar    AIC    BIC logLik -2*log(L) Chisq Df Pr(>Chisq)    
## m0    3 155728 155755 -77861    155722                        
## m1    4 151450 151486 -75721    151442  4280  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the individual-level (level-1) predictors (i.e., ‘Chealth’, ‘female’, ‘race’, and ‘edu’) (Model m2). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m1 and m2 models.

# Conduct model fitting
m2 <- lme4::lmer(bmi_z ~ Chealth + female + race + edu + (1 |
                                                            state), data = df4)
summary(m2)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 150496.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0734 -0.6729 -0.1295  0.5418  4.2823 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.004516 0.0672  
##  Residual             0.730944 0.8550  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)        -0.081923   0.017586  -4.658
## Chealth             0.206993   0.003412  60.668
## female1            -0.108255   0.007048 -15.359
## race2.black         0.259984   0.012890  20.169
## race3.hispanic      0.041393   0.014108   2.934
## race4.others       -0.064772   0.013802  -4.693
## edu1.HS             0.114337   0.015475   7.389
## edu2.col attended   0.160948   0.015555  10.347
## edu3.col graduated  0.022390   0.015343   1.459
## 
## Correlation of Fixed Effects:
##             (Intr) Chelth femal1 rc2.bl rc3.hs rc4.th ed1.HS ed2.ca
## Chealth     -0.131                                                 
## female1     -0.182 -0.020                                          
## race2.black -0.099 -0.047 -0.024                                   
## race3.hspnc -0.257 -0.043  0.000  0.109                            
## race4.othrs -0.101 -0.060  0.012  0.099  0.129                     
## edu1.HS     -0.716  0.104  0.001  0.002  0.193  0.023              
## ed2.clattnd -0.720  0.143 -0.038  0.022  0.218  0.024  0.796       
## ed3.clgrdtd -0.749  0.238 -0.045  0.044  0.236  0.027  0.820  0.834

m2.1 <- lmerTest::lmer(bmi_z ~ Chealth + female + race + edu + (1 |
                                                                  state), data = df4)
summary(m2.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    Data: df4
## 
## REML criterion at convergence: 150496.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0734 -0.6729 -0.1295  0.5418  4.2823 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.004516 0.0672  
##  Residual             0.730944 0.8550  
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                      Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)        -8.192e-02  1.759e-02  4.230e+02  -4.658 4.28e-06 ***
## Chealth             2.070e-01  3.412e-03  5.954e+04  60.668  < 2e-16 ***
## female1            -1.083e-01  7.048e-03  5.953e+04 -15.359  < 2e-16 ***
## race2.black         2.600e-01  1.289e-02  4.475e+04  20.169  < 2e-16 ***
## race3.hispanic      4.139e-02  1.411e-02  5.128e+04   2.934  0.00335 ** 
## race4.others       -6.477e-02  1.380e-02  4.976e+04  -4.693 2.70e-06 ***
## edu1.HS             1.143e-01  1.547e-02  5.954e+04   7.389 1.50e-13 ***
## edu2.col attended   1.609e-01  1.555e-02  5.954e+04  10.347  < 2e-16 ***
## edu3.col graduated  2.239e-02  1.534e-02  5.952e+04   1.459  0.14450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Chelth femal1 rc2.bl rc3.hs rc4.th ed1.HS ed2.ca
## Chealth     -0.131                                                 
## female1     -0.182 -0.020                                          
## race2.black -0.099 -0.047 -0.024                                   
## race3.hspnc -0.257 -0.043  0.000  0.109                            
## race4.othrs -0.101 -0.060  0.012  0.099  0.129                     
## edu1.HS     -0.716  0.104  0.001  0.002  0.193  0.023              
## ed2.clattnd -0.720  0.143 -0.038  0.022  0.218  0.024  0.796       
## ed3.clgrdtd -0.749  0.238 -0.045  0.044  0.236  0.027  0.820  0.834

# Calculate profile confidence intervals
confint(m2, method = c("Wald"))

##                           2.5 %      97.5 %
## .sig01                       NA          NA
## .sigma                       NA          NA
## (Intercept)        -0.116391036 -0.04745466
## Chealth             0.200306189  0.21368054
## female1            -0.122069373 -0.09444105
## race2.black         0.234720032  0.28524807
## race3.hispanic      0.013742167  0.06904325
## race4.others       -0.091823934 -0.03772053
## edu1.HS             0.084007323  0.14466690
## edu2.col attended   0.130461180  0.19143404
## edu3.col graduated -0.007682054  0.05246108

# Compare model fit between m1 and m2
anova(m1, m2)

## refitting model(s) with ML (instead of REML)

## Data: df4
## Models:
## m1: bmi_z ~ Chealth + (1 | state)
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m1    4 151450 151486 -75721    151442                         
## m2   11 150452 150550 -75215    150430 1012.1  7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with only the state-level (level-2) predictors (i.e.,‘aag’ and ‘msg’) (Model m3). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m2 and m3 models.

# Conduct model fitting
m3 <- lme4::lmer(bmi_z ~ aag + msg + (aag + msg |
                                        state), data = df4)

## boundary (singular) fit: see help('isSingular')

summary(m3)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ aag + msg + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 155724.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4362 -0.7090 -0.1484  0.5444  3.8485 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr        
##  state    (Intercept) 0.9302915 0.96452              
##           aag         0.0002193 0.01481   1.00       
##           msg         0.0034622 0.05884  -1.00 -1.00 
##  Residual             0.7981317 0.89338              
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.877970   0.181802   4.829
## aag         -0.006434   0.004344  -1.481
## msg         -0.020328   0.010339  -1.966
## 
## Correlation of Fixed Effects:
##     (Intr) aag   
## aag  0.098       
## msg -0.670 -0.804
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

m3.1 <- lmerTest::lmer(bmi_z ~ aag + msg + (aag + msg |
                                              state), data = df4)

## boundary (singular) fit: see help('isSingular')

## Warning: Model failed to converge with 3 negative eigenvalues: -2.3e+02
## -4.9e+04 -1.9e+06

summary(m3.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ aag + msg + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 155724.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4362 -0.7090 -0.1484  0.5444  3.8485 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr        
##  state    (Intercept) 0.9302915 0.96452              
##           aag         0.0002193 0.01481   1.00       
##           msg         0.0034622 0.05884  -1.00 -1.00 
##  Residual             0.7981317 0.89338              
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   0.877970   0.181802 160.159234   4.829 3.18e-06 ***
## aag          -0.006434   0.004344  59.008409  -1.481   0.1440    
## msg          -0.020328   0.010339 129.944740  -1.966   0.0514 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##     (Intr) aag   
## aag  0.098       
## msg -0.670 -0.804
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# Calculate profile confidence intervals
confint(m3, method = c("Wald"))

##                   2.5 %        97.5 %
## .sig01               NA            NA
## .sig02               NA            NA
## .sig03               NA            NA
## .sig04               NA            NA
## .sig05               NA            NA
## .sig06               NA            NA
## .sigma               NA            NA
## (Intercept)  0.52164375  1.234296e+00
## aag         -0.01494862  2.081253e-03
## msg         -0.04059221 -6.341367e-05

# Compare model fit between m2 and m3
anova(m2, m3)

## refitting model(s) with ML (instead of REML)

## Warning in optwrap(optimizer, devfun, rho$mkPar(rho$pp$theta), lower =
## getLower(x), : convergence code 1 from bobyqa: bobyqa -- maximum number of
## function evaluations exceeded

## Data: df4
## Models:
## m3: bmi_z ~ aag + msg + (aag + msg | state)
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
##    npar    AIC    BIC logLik -2*log(L) Chisq Df Pr(>Chisq)    
## m3   10 155691 155781 -77836    155671                        
## m2   11 150452 150550 -75215    150430  5242  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the level-1 and level-2 predictors and an interaction between ‘female’ and ‘race’ (Model m4). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m2 and m4 models.

# Conduct model fitting
m4 <- lme4::lmer(bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (aag + msg |
                                                                                        state),
                 data = df4)

## boundary (singular) fit: see help('isSingular')

summary(m4)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 150197.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1082 -0.6686 -0.1322  0.5410  4.2659 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr        
##  state    (Intercept) 7.833e-01 0.885037             
##           aag         4.277e-06 0.002068 -1.00       
##           msg         6.864e-04 0.026200 -1.00  0.99 
##  Residual             7.268e-01 0.852553             
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                         Estimate Std. Error t value
## (Intercept)             0.539671   0.166499   3.241
## Chealth                 0.205470   0.003404  60.369
## aag                    -0.003540   0.002526  -1.402
## msg                    -0.014114   0.005736  -2.461
## female1                -0.164701   0.008105 -20.321
## race2.black             0.022058   0.018709   1.179
## race3.hispanic         -0.026319   0.019085  -1.379
## race4.others           -0.100095   0.018927  -5.289
## edu1.HS                 0.113110   0.015433   7.329
## edu2.col attended       0.158771   0.015512  10.235
## edu3.col graduated      0.020305   0.015303   1.327
## female1:race2.black     0.429316   0.024481  17.537
## female1:race3.hispanic  0.139532   0.026047   5.357
## female1:race4.others    0.073973   0.026307   2.812

## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

m4.1 <- lmerTest::lmer(bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (aag + msg |
                                                                                              state),
                       data = df4)

## boundary (singular) fit: see help('isSingular')

## Warning: Model failed to converge with 1 negative eigenvalue: -4.8e+02

summary(m4.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 150197.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1082 -0.6686 -0.1322  0.5410  4.2659 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr        
##  state    (Intercept) 7.833e-01 0.885037             
##           aag         4.277e-06 0.002068 -1.00       
##           msg         6.864e-04 0.026200 -1.00  0.99 
##  Residual             7.268e-01 0.852553             
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)             5.397e-01  1.665e-01  1.965e+02   3.241  0.00140 ** 
## Chealth                 2.055e-01  3.404e-03  5.950e+04  60.369  < 2e-16 ***
## aag                    -3.540e-03  2.526e-03  1.669e+01  -1.402  0.17935    
## msg                    -1.411e-02  5.736e-03  7.509e+01  -2.461  0.01616 *  
## female1                -1.647e-01  8.105e-03  5.951e+04 -20.321  < 2e-16 ***
## race2.black             2.206e-02  1.871e-02  5.748e+04   1.179  0.23840    
## race3.hispanic         -2.632e-02  1.909e-02  5.665e+04  -1.379  0.16789    
## race4.others           -1.001e-01  1.893e-02  5.589e+04  -5.289 1.24e-07 ***
## edu1.HS                 1.131e-01  1.543e-02  5.951e+04   7.329 2.34e-13 ***
## edu2.col attended       1.588e-01  1.551e-02  5.950e+04  10.235  < 2e-16 ***
## edu3.col graduated      2.031e-02  1.530e-02  5.944e+04   1.327  0.18457    
## female1:race2.black     4.293e-01  2.448e-02  5.953e+04  17.537  < 2e-16 ***
## female1:race3.hispanic  1.395e-01  2.605e-02  5.953e+04   5.357 8.49e-08 ***
## female1:race4.others    7.397e-02  2.631e-02  5.952e+04   2.812  0.00493 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# Compute 95% profile confidence intervals
confint(m4, method = c("Wald"))

##                               2.5 %       97.5 %
## .sig01                           NA           NA
## .sig02                           NA           NA
## .sig03                           NA           NA
## .sig04                           NA           NA
## .sig05                           NA           NA
## .sig06                           NA           NA
## .sigma                           NA           NA
## (Intercept)             0.213339157  0.866002264
## Chealth                 0.198799233  0.212141060
## aag                    -0.008491056  0.001410291
## msg                    -0.025356124 -0.002872411
## female1                -0.180586923 -0.148815789
## race2.black            -0.014610718  0.058726917
## race3.hispanic         -0.063724505  0.011086857
## race4.others           -0.137190727 -0.062998823
## edu1.HS                 0.082862750  0.143358058
## edu2.col attended       0.128367626  0.189173493
## edu3.col graduated     -0.009688684  0.050298358
## female1:race2.black     0.381333690  0.477297592
## female1:race3.hispanic  0.088481302  0.190581924
## female1:race4.others    0.022412141  0.125533925

# Compare model fit between m2 and m4
anova(m2, m4)

## refitting model(s) with ML (instead of REML)

## Warning in optwrap(optimizer, devfun, rho$mkPar(rho$pp$theta), lower =
## getLower(x), : convergence code 1 from bobyqa: bobyqa -- maximum number of
## function evaluations exceeded

## Data: df4
## Models:
## m2: bmi_z ~ Chealth + female + race + edu + (1 | state)
## m4: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (aag + msg | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m2   11 150452 150550 -75215    150430                         
## m4   21 150120 150309 -75039    150078 351.43 10  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fit a random-coefficients regression model with all the variables you entered in Model m4 and add a cross-level interaction between ‘female’ and ‘aag’ (Model m5). Compute 95% profile confidence interval. Interpret the results including both fixed and random effects. Compare model fit between m4 and m5 models.

# Conduct model
m5 <- lme4::lmer(
  bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + female * aag + (aag + msg |
                                                                                        state),
  data = df4
)

## boundary (singular) fit: see help('isSingular')

summary(m5)

## Linear mixed model fit by REML ['lmerMod']
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + female * aag + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 150249.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1081 -0.6691 -0.1323  0.5408  4.2697 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr        
##  state    (Intercept) 0.852573 0.92335              
##           aag         0.002169 0.04658   0.97       
##           msg         0.013148 0.11466  -0.98 -1.00 
##  Residual             0.726619 0.85242              
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                         Estimate Std. Error t value
## (Intercept)             0.419957   0.191372   2.194
## Chealth                 0.205305   0.003404  60.316
## aag                    -0.006436   0.008305  -0.775
## msg                    -0.004932   0.018374  -0.268
## female1                 0.033680   0.076881   0.438
## race2.black             0.025151   0.018779   1.339
## race3.hispanic         -0.028355   0.019161  -1.480
## race4.others           -0.102875   0.018931  -5.434
## edu1.HS                 0.112919   0.015432   7.317
## edu2.col attended       0.159123   0.015510  10.259
## edu3.col graduated      0.020793   0.015303   1.359
## female1:race2.black     0.421902   0.024650  17.116
## female1:race3.hispanic  0.146146   0.026170   5.585
## female1:race4.others    0.079326   0.026379   3.007
## aag:female1            -0.003865   0.001487  -2.599

## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

m5.1 <- lmerTest::lmer(
  bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + female * aag + (aag + msg |
                                                                                        state),
  data = df4
)

## boundary (singular) fit: see help('isSingular')

## Warning: Model failed to converge with 2 negative eigenvalues: -5.1e+02
## -3.0e+06

summary(m5.1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: bmi_z ~ Chealth + aag + msg + female + race + edu + female *  
##     race + female * aag + (aag + msg | state)
##    Data: df4
## 
## REML criterion at convergence: 150249.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1081 -0.6691 -0.1323  0.5408  4.2697 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr        
##  state    (Intercept) 0.852573 0.92335              
##           aag         0.002169 0.04658   0.97       
##           msg         0.013148 0.11466  -0.98 -1.00 
##  Residual             0.726619 0.85242              
## Number of obs: 59552, groups:  state, 51
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)             4.200e-01  1.914e-01  2.317e+02   2.194  0.02920 *  
## Chealth                 2.053e-01  3.404e-03  5.949e+04  60.316  < 2e-16 ***
## aag                    -6.436e-03  8.305e-03  2.037e+02  -0.775  0.43929    
## msg                    -4.932e-03  1.837e-02  5.275e+02  -0.268  0.78847    
## female1                 3.368e-02  7.688e-02  5.953e+04   0.438  0.66133    
## race2.black             2.515e-02  1.878e-02  5.869e+04   1.339  0.18049    
## race3.hispanic         -2.836e-02  1.916e-02  5.828e+04  -1.480  0.13892    
## race4.others           -1.029e-01  1.893e-02  4.177e+04  -5.434 5.54e-08 ***
## edu1.HS                 1.129e-01  1.543e-02  5.949e+04   7.317 2.56e-13 ***
## edu2.col attended       1.591e-01  1.551e-02  5.950e+04  10.259  < 2e-16 ***
## edu3.col graduated      2.079e-02  1.530e-02  5.940e+04   1.359  0.17424    
## female1:race2.black     4.219e-01  2.465e-02  5.953e+04  17.116  < 2e-16 ***
## female1:race3.hispanic  1.461e-01  2.617e-02  5.953e+04   5.585 2.35e-08 ***
## female1:race4.others    7.933e-02  2.638e-02  5.952e+04   3.007  0.00264 ** 
## aag:female1            -3.865e-03  1.487e-03  5.953e+04  -2.599  0.00936 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# Calculate profile confidence intervals
confint(m5, method = c("Wald"))

##                               2.5 %        97.5 %
## .sig01                           NA            NA
## .sig02                           NA            NA
## .sig03                           NA            NA
## .sig04                           NA            NA
## .sig05                           NA            NA
## .sig06                           NA            NA
## .sigma                           NA            NA
## (Intercept)             0.044875428  0.7950383036
## Chealth                 0.198633391  0.2119760622
## aag                    -0.022713340  0.0098420126
## msg                    -0.040944394  0.0310798662
## female1                -0.117004406  0.1843637957
## race2.black            -0.011656552  0.0619576419
## race3.hispanic         -0.065909146  0.0091993053
## race4.others           -0.139979352 -0.0657697873
## edu1.HS                 0.082673187  0.1431638679
## edu2.col attended       0.128723492  0.1895224407
## edu3.col graduated     -0.009201126  0.0507865165
## female1:race2.black     0.373589317  0.4702148127
## female1:race3.hispanic  0.094854369  0.1974383055
## female1:race4.others    0.027624843  0.1310269517
## aag:female1            -0.006780642 -0.0009501011

# Compare model fit between m4 and m5
anova(m4, m5)

## refitting model(s) with ML (instead of REML)

## Warning in optwrap(optimizer, devfun, rho$mkPar(rho$pp$theta), lower =
## getLower(x), : convergence code 1 from bobyqa: bobyqa -- maximum number of
## function evaluations exceeded

## Warning in optwrap(optimizer, devfun, rho$mkPar(rho$pp$theta), lower =
## getLower(x), : convergence code 1 from bobyqa: bobyqa -- maximum number of
## function evaluations exceeded

## Data: df4
## Models:
## m4: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + (aag + msg | state)
## m5: bmi_z ~ Chealth + aag + msg + female + race + edu + female * race + female * aag + (aag + msg | state)
##    npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m4   21 150120 150309 -75039    150078                         
## m5   22 150110 150308 -75033    150066 12.028  1  0.0005242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check if residuals are normally distributed for the final model by plotting a histogram with density curve.

hist(
  scale(resid(m5)),
  freq = FALSE,
  ylim = c(0, .5),
  xlim = c(-3.5, 3.5),
  main = "Histogram of Standardized Residuals from Model m5",
  xlab = "Standardized Residuals"
)
lines(density(scale(resid(m4))))
box()

Q632: Assignment 7 Appendix

Jason Hoskin

2026-05-08

Part A

Part B