Lab 5: Multiple linear least squares regression

Exercise 1

Question: Use the table1 package to create a table with appropriate descriptive information about the data set. Use the examples provided in the following vignette to help you build a table.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom)
library(table1)

## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(easystats)

## # Attaching packages: easystats 0.6.0 (red = needs update)
## ✔ bayestestR  0.13.1   ✔ correlation 0.8.4 
## ✔ datawizard  0.9.0    ✔ effectsize  0.8.6 
## ✔ insight     0.19.6   ✔ modelbased  0.8.6 
## ✖ performance 0.10.5   ✖ parameters  0.21.2
## ✔ report      0.5.7    ✖ see         0.8.0 
## 
## Restart the R-Session and update packages in red with `easystats::easystats_update()`.

library(modelsummary)

## 
## Attaching package: 'modelsummary'
## 
## The following object is masked from 'package:parameters':
## 
##     supported_models
## 
## The following object is masked from 'package:insight':
## 
##     supported_models

natality <- read_rds("data/natality.rds")


title <- "Maternal demographics stratified by smoking"
label(natality$PRIORLIVE) <- "Prior Births (living)"
label(natality$PRIORDEAD) <- "Prior Births (dead)"


table1(~ DBWT + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + PRIORDEAD + BMI | CIG_REC, data = natality, overall = FALSE, caption = title)

Maternal demographics stratified by smoking
	No (N=1564)	Yes (N=124)
Birth Weight (g)
Mean (SD)	3250 (601)	3070 (597)
Median [Min, Max]	3290 [369, 5880]	3090 [630, 4220]
Risk Factors Reported
No	1088 (69.6%)	84 (67.7%)
Yes	476 (30.4%)	40 (32.3%)
Mother Age
Mean (SD)	29.0 (5.79)	27.8 (5.79)
Median [Min, Max]	29.0 [15.0, 49.0]	28.0 [17.0, 42.0]
Mother Race/Hispanic Origin
NH White	844 (54.0%)	94 (75.8%)
NH Black	266 (17.0%)	14 (11.3%)
NH Other	125 (8.0%)	10 (8.1%)
Hispanic	329 (21.0%)	6 (4.8%)
Marital Status
Married	1017 (65.0%)	40 (32.3%)
Unmarried	547 (35.0%)	84 (67.7%)
Mother Education
lower	177 (11.3%)	34 (27.4%)
HS	377 (24.1%)	45 (36.3%)
Some college	437 (27.9%)	37 (29.8%)
Uni	573 (36.6%)	8 (6.5%)
Prior Births (living)
0	601 (38.4%)	38 (30.6%)
1	516 (33.0%)	41 (33.1%)
2	265 (16.9%)	17 (13.7%)
3	93 (5.9%)	17 (13.7%)
4+	89 (5.7%)	11 (8.9%)
Prior Births (dead)
0	1551 (99.2%)	123 (99.2%)
1+	13 (0.8%)	1 (0.8%)
Mother BMI
Mean (SD)	26.9 (6.32)	27.3 (6.42)
Median [Min, Max]	25.6 [15.6, 58.4]	26.3 [16.5, 50.1]

Exercise 2

Question: Use the ggpairs function to explore the association between the different variables. Try to replicate the figure below, or create your own variation.

title <- "Natality in the United States (2018)"

ex2 <- ggpairs(natality, columns = c(1,3,4,6,10), title = title,
               ggplot2::aes(colour = CIG_REC))

ex2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 3

Question: Fit four linear models, each nested within the other: the null model, the unadjusted model, the full model and the optimal model. Use the step function find the optimal model using the stepwise method. Then use anova to compare the four models. What is your conclusion? Now compare your models using modelsummary function. What does the number in the (brackets) mean? How can you identify the optimal model just by looking at the R2 and the AIC?

M.NULL <- lm(DBWT ~ 1, natality)
M.UNADJUSTED <- lm(DBWT ~ CIG_REC, natality)
# The dot represents all variables except the outcome DBWT
M.FULL <- lm(DBWT ~ ., natality)  

# Finding the optimal model
step(M.NULL,
     scope = list(upper = M.FULL),
     direction="both",
     data=natality)

## Start:  AIC=21609.55
## DBWT ~ 1
## 
##             Df Sum of Sq       RSS   AIC
## + DMAR       1  28672236 583166851 21530
## + MRACEHISP  3  28557357 583281730 21535
## + PRIORLIVE  4   7485192 604353895 21597
## + MEDUC      3   5189885 606649203 21601
## + CIG_REC    1   3685187 608153901 21601
## + risks      1   3277949 608561138 21602
## + BMI        1   3170730 608668358 21603
## + MAGER      1   2941408 608897679 21603
## + PRIORDEAD  1    849628 610989460 21609
## <none>                   611839087 21610
## 
## Step:  AIC=21530.53
## DBWT ~ DMAR
## 
##             Df Sum of Sq       RSS   AIC
## + MRACEHISP  3  15359037 567807814 21492
## + risks      1   4395289 578771562 21520
## + BMI        1   3607980 579558871 21522
## + PRIORLIVE  4   4500359 578666492 21526
## + CIG_REC    1    978350 582188501 21530
## + PRIORDEAD  1    818812 582348039 21530
## <none>                   583166851 21530
## + MAGER      1     82475 583084376 21532
## + MEDUC      3    313468 582853384 21536
## - DMAR       1  28672236 611839087 21610
## 
## Step:  AIC=21491.48
## DBWT ~ DMAR + MRACEHISP
## 
##             Df Sum of Sq       RSS   AIC
## + BMI        1   4532441 563275373 21480
## + risks      1   3924653 563883161 21482
## + CIG_REC    1   2181695 565626119 21487
## + PRIORLIVE  4   3880005 563927809 21488
## + PRIORDEAD  1    846700 566961114 21491
## <none>                   567807814 21492
## + MAGER      1     52434 567755381 21493
## + MEDUC      3     98014 567709800 21497
## - MRACEHISP  3  15359037 583166851 21530
## - DMAR       1  15473916 583281730 21535
## 
## Step:  AIC=21479.95
## DBWT ~ DMAR + MRACEHISP + BMI
## 
##             Df Sum of Sq       RSS   AIC
## + risks      1   6059709 557215664 21464
## + CIG_REC    1   2362075 560913299 21475
## + PRIORLIVE  4   3457999 559817375 21478
## + PRIORDEAD  1    896345 562379028 21479
## <none>                   563275373 21480
## + MAGER      1    125890 563149483 21482
## + MEDUC      3    115009 563160364 21486
## - BMI        1   4532441 567807814 21492
## - MRACEHISP  3  16283498 579558871 21522
## - DMAR       1  15325756 578601129 21523
## 
## Step:  AIC=21463.69
## DBWT ~ DMAR + MRACEHISP + BMI + risks
## 
##             Df Sum of Sq       RSS   AIC
## + PRIORLIVE  4   5580134 551635530 21455
## + CIG_REC    1   2199370 555016294 21459
## + PRIORDEAD  1   1025923 556189741 21463
## <none>                   557215664 21464
## + MAGER      1     22588 557193076 21466
## + MEDUC      3     69675 557145989 21470
## - risks      1   6059709 563275373 21480
## - BMI        1   6667497 563883161 21482
## - MRACEHISP  3  15921403 573137067 21505
## - DMAR       1  16431163 573646827 21511
## 
## Step:  AIC=21454.7
## DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE
## 
##             Df Sum of Sq       RSS   AIC
## + CIG_REC    1   2707991 548927539 21448
## + PRIORDEAD  1    921529 550714001 21454
## <none>                   551635530 21455
## + MAGER      1    280671 551354859 21456
## + MEDUC      3    173640 551461890 21460
## - PRIORLIVE  4   5580134 557215664 21464
## - BMI        1   6486820 558122350 21472
## - risks      1   8181845 559817375 21478
## - MRACEHISP  3  15145908 566781438 21494
## - DMAR       1  14456460 566091990 21496
## 
## Step:  AIC=21448.4
## DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC
## 
##             Df Sum of Sq       RSS   AIC
## + PRIORDEAD  1    922493 548005046 21448
## <none>                   548927539 21448
## + MAGER      1    337056 548590483 21449
## + MEDUC      3     75431 548852108 21454
## - CIG_REC    1   2707991 551635530 21455
## - PRIORLIVE  4   6088755 555016294 21459
## - BMI        1   6662694 555590232 21467
## - risks      1   8131760 557059299 21471
## - DMAR       1  10955889 559883427 21480
## - MRACEHISP  3  16576572 565504111 21493
## 
## Step:  AIC=21447.56
## DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC + 
##     PRIORDEAD
## 
##             Df Sum of Sq       RSS   AIC
## <none>                   548005046 21448
## - PRIORDEAD  1    922493 548927539 21448
## + MAGER      1    361028 547644018 21448
## + MEDUC      3     82260 547922786 21453
## - CIG_REC    1   2708955 550714001 21454
## - PRIORLIVE  4   5984714 553989760 21458
## - BMI        1   6749558 554754604 21466
## - risks      1   8234365 556239411 21471
## - DMAR       1  10970232 558975278 21479
## - MRACEHISP  3  16594210 564599256 21492

## 
## Call:
## lm(formula = DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + 
##     CIG_REC + PRIORDEAD, data = natality)
## 
## Coefficients:
##       (Intercept)      DMARUnmarried  MRACEHISPNH Black  MRACEHISPNH Other  
##           3080.56            -184.27            -292.14            -121.70  
## MRACEHISPHispanic                BMI           risksYes         PRIORLIVE1  
##            -74.49              10.26            -158.81             130.69  
##        PRIORLIVE2         PRIORLIVE3        PRIORLIVE4+         CIG_RECYes  
##            107.17             138.78             131.95            -160.08  
##       PRIORDEAD1+  
##            258.43

# Assigning optimal model the optimal model status
M.OPT <- lm(formula = DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC + PRIORDEAD, data = natality)

# Displaying the four models by each other
modelsummary(
  list(
    null = M.NULL, 
    unadjasted = M.UNADJUSTED, 
    opt = M.OPT,
    full = M.FULL), 
  stars = TRUE
  )

	null	unadjasted	opt	full
(Intercept)	3234.746***	3247.903***	3080.559***	3162.373***
	(14.658)	(15.187)	(64.625)	(107.021)
CIG_RECYes		−179.096**	−160.083**	−156.593**
		(56.032)	(55.632)	(56.325)
DMARUnmarried			−184.275***	−189.782***
			(31.823)	(35.084)
MRACEHISPNH Black			−292.140***	−290.175***
			(41.690)	(41.829)
MRACEHISPNH Other			−121.695*	−120.774*
			(52.796)	(52.960)
MRACEHISPHispanic			−74.492+	−72.464+
			(38.075)	(39.120)
BMI			10.261***	10.493***
			(2.259)	(2.277)
risksYes			−158.808***	−152.902***
			(31.655)	(32.055)
PRIORLIVE1			130.686***	140.483***
			(33.814)	(34.725)
PRIORLIVE2			107.170*	123.385**
			(41.746)	(43.880)
PRIORLIVE3			138.776*	161.298*
			(59.972)	(63.149)
PRIORLIVE4+			131.949*	162.508*
			(62.333)	(67.562)
PRIORDEAD1+			258.434+	265.467+
			(153.905)	(154.184)
MAGER				−3.836
				(3.085)
MEDUCHS				8.448
				(49.909)
MEDUCSome college				−0.240
				(50.838)
MEDUCUni				31.610
				(56.902)
Num.Obs.	1688	1688	1688	1688
R2	0.000	0.006	0.104	0.105
R2 Adj.	0.000	0.005	0.098	0.097
AIC	26401.9	26393.7	26239.9	26246.1
BIC	26412.7	26410.0	26315.9	26343.8
Log.Lik.	−13198.942	−13193.843	−13105.946	−13105.039
F		10.217	16.259	12.291
RMSE	602.05	600.23	569.78	569.47
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001

# ANOVA
anova(M.NULL, M.UNADJUSTED, M.OPT, M.FULL)

## Analysis of Variance Table
## 
## Model 1: DBWT ~ 1
## Model 2: DBWT ~ CIG_REC
## Model 3: DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC + 
##     PRIORDEAD
## Model 4: DBWT ~ CIG_REC + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + 
##     PRIORDEAD + BMI
##   Res.Df       RSS Df Sum of Sq       F    Pr(>F)    
## 1   1687 611839087                                   
## 2   1686 608153901  1   3685187 11.2491 0.0008144 ***
## 3   1675 548005046 11  60148855 16.6914 < 2.2e-16 ***
## 4   1671 547416336  4    588710  0.4493 0.7730024    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: Looking at the ANOVA output, we see that model 2 differs significantly in its explanatory power from model 1. Model 3 also differs significantly from model 2. Model 4 does not differ significantly from model 3, however. Given these results, we can say that Model 3 is the best fit for the data. Adding more explanatory variables (like in model 4) does not yield substantial improvement in predicting the outcome variable. From the model summary output we can conclude that out of all models, model 3 explains the most variance (Adjusted R^2 = 0.098) in the data. The number in the brackets next to the estimated coefficients reflects the respective standard errors. By looking at the AIC we see that the number for model 3 is the lowest (AIC = 26239.9). This can be interpreted as the model having the least prediction error.

Exercise 4

Question: Center and scale all the continuous variables, and create a forest plot for the optimal model.

# Scaling
natality.ctr <- natality |>
  mutate_if(is.numeric, scale) |>
  mutate_if(is.numeric, ~ as.numeric(.x / 2))

# Fitting the model
M.OPT.ctr <- lm(
  DBWT ~ DMAR + MRACEHISP + BMI + 
    risks + PRIORLIVE + CIG_REC + 
    PRIORDEAD, 
  data = natality.ctr
  )

# creating a forest plot
M.OPT.ctr %>% 
  model_parameters() %>% 
  plot()

# Fitting the FULL model
M.FULL.ctr <- lm(
  DBWT ~ ., 
  data = natality.ctr)
  
  # creating a forest plot for FULL model
M.FULL.ctr %>% 
  model_parameters() %>% 
  plot()

summary(M.OPT.ctr)

## 
## Call:
## lm(formula = DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + 
##     CIG_REC + PRIORDEAD, data = natality.ctr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2845 -0.2487  0.0243  0.2949  1.9881 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.10140    0.02385   4.252 2.24e-05 ***
## DMARUnmarried     -0.15299    0.02642  -5.791 8.36e-09 ***
## MRACEHISPNH Black -0.24255    0.03461  -7.007 3.51e-12 ***
## MRACEHISPNH Other -0.10104    0.04383  -2.305 0.021288 *  
## MRACEHISPHispanic -0.06185    0.03161  -1.956 0.050577 .  
## BMI                0.10778    0.02373   4.542 5.97e-06 ***
## risksYes          -0.13185    0.02628  -5.017 5.81e-07 ***
## PRIORLIVE1         0.10850    0.02807   3.865 0.000115 ***
## PRIORLIVE2         0.08898    0.03466   2.567 0.010339 *  
## PRIORLIVE3         0.11522    0.04979   2.314 0.020788 *  
## PRIORLIVE4+        0.10955    0.05175   2.117 0.034420 *  
## CIG_RECYes        -0.13291    0.04619  -2.878 0.004059 ** 
## PRIORDEAD1+        0.21457    0.12778   1.679 0.093304 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4749 on 1675 degrees of freedom
## Multiple R-squared:  0.1043, Adjusted R-squared:  0.09791 
## F-statistic: 16.26 on 12 and 1675 DF,  p-value: < 2.2e-16

Exercise 5

Question: How do the effect estimate of smoking, the standard error of the estimate, its 95% confidence interval, and its statistical significance change between the four models?

M.UNADJUSTED |> tidy(conf.int = TRUE)

## # A tibble: 2 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)    3248.      15.2    214.   0          3218.    3278. 
## 2 CIG_RECYes     -179.      56.0     -3.20 0.00142    -289.     -69.2

M.FULL |> tidy(conf.int = TRUE)

## # A tibble: 17 × 7
##    term              estimate std.error statistic   p.value conf.low conf.high
##    <chr>                <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)       3162.       107.    29.5     9.73e-155  2952.     3372.  
##  2 CIG_RECYes        -157.        56.3   -2.78    5.49e-  3  -267.      -46.1 
##  3 risksYes          -153.        32.1   -4.77    2.00e-  6  -216.      -90.0 
##  4 MAGER               -3.84       3.09  -1.24    2.14e-  1    -9.89      2.22
##  5 MRACEHISPNH Black -290.        41.8   -6.94    5.70e- 12  -372.     -208.  
##  6 MRACEHISPNH Other -121.        53.0   -2.28    2.27e-  2  -225.      -16.9 
##  7 MRACEHISPHispanic  -72.5       39.1   -1.85    6.41e-  2  -149.        4.26
##  8 DMARUnmarried     -190.        35.1   -5.41    7.24e-  8  -259.     -121.  
##  9 MEDUCHS              8.45      49.9    0.169   8.66e-  1   -89.4     106.  
## 10 MEDUCSome college   -0.240     50.8   -0.00472 9.96e-  1  -100.       99.5 
## 11 MEDUCUni            31.6       56.9    0.556   5.79e-  1   -80.0     143.  
## 12 PRIORLIVE1         140.        34.7    4.05    5.46e-  5    72.4     209.  
## 13 PRIORLIVE2         123.        43.9    2.81    4.98e-  3    37.3     209.  
## 14 PRIORLIVE3         161.        63.1    2.55    1.07e-  2    37.4     285.  
## 15 PRIORLIVE4+        163.        67.6    2.41    1.63e-  2    30.0     295.  
## 16 PRIORDEAD1+        265.       154.     1.72    8.53e-  2   -36.9     568.  
## 17 BMI                 10.5        2.28   4.61    4.36e-  6     6.03     15.0

M.OPT |> tidy(conf.int = TRUE)

## # A tibble: 13 × 7
##    term              estimate std.error statistic   p.value conf.low conf.high
##    <chr>                <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)         3081.      64.6      47.7  4.22e-314  2954.    3207.   
##  2 DMARUnmarried       -184.      31.8      -5.79 8.36e-  9  -247.    -122.   
##  3 MRACEHISPNH Black   -292.      41.7      -7.01 3.51e- 12  -374.    -210.   
##  4 MRACEHISPNH Other   -122.      52.8      -2.30 2.13e-  2  -225.     -18.1  
##  5 MRACEHISPHispanic    -74.5     38.1      -1.96 5.06e-  2  -149.       0.187
##  6 BMI                   10.3      2.26      4.54 5.97e-  6     5.83    14.7  
##  7 risksYes            -159.      31.7      -5.02 5.81e-  7  -221.     -96.7  
##  8 PRIORLIVE1           131.      33.8       3.86 1.15e-  4    64.4    197.   
##  9 PRIORLIVE2           107.      41.7       2.57 1.03e-  2    25.3    189.   
## 10 PRIORLIVE3           139.      60.0       2.31 2.08e-  2    21.1    256.   
## 11 PRIORLIVE4+          132.      62.3       2.12 3.44e-  2     9.69   254.   
## 12 CIG_RECYes          -160.      55.6      -2.88 4.06e-  3  -269.     -51.0  
## 13 PRIORDEAD1+          258.     154.        1.68 9.33e-  2   -43.4    560.

Answer:

Confidence Intervals for CIG_RECS_Yes - M.UNADJUSTED: estimate: -179, SE: 56.0, CI: {-289.; -69.2}, p = 0.00142 - M.FULL: estimate: -157, SE: 56.3, CI: {-267 ; -46.1}, p < 0.001 - M.OPT: estimate: -160, SE: 55.6, CI: {-269 ; -51.0}, p < 0.001

In the full and optimal model, the estimate is smaller indicating a smaller effect. This is because the first model overestimates the effect of smoking. In the full and optimal model other variables adjust for this, so that the estimate for smoking is more accurate. The standard errors are roughly the same across the models, which is why the confidence interval width is also relatively the same. The confidence intervals do move a little with the coefficient estimates. The p-values get progressively smaller but are all very significant. This means that smoking plays a significant role in predicting the outcome variable in all models.

Exercise 6

Question: Does the association between cigarette smoking during pregnancy and birth weight depend on mother’s education? Use the scaled Modify the FULL model appropriately to be able to answer this question. Use anova to test whether the model with the interaction is significantly better than the one without it. Regardless of statistical significance, create a dumbell plot illustrating the smoking effect at each level of mother’s education and estimate the cigarette smoking effect at each level of mother’s education. Hint: When creating the dataset for plotting, include the value zero for each predictor not included in the interaction. Picking the value zero in a centered predictor is like choosing the mean of that variable, which is a reasonable choice.

Note: We’re using a standardized unit of measurement so 0 means exactly equal to the mean. This means that we’re going to be interpreting the estimates in the unit of one half of a standard deviation.

M.FULL_int.ctr <- lm(formula = DBWT ~ . + CIG_REC * MEDUC, data = natality.ctr)

newdata <- data.frame(
  CIG_REC = c("No","Yes","No","Yes","No", "Yes","No", "Yes"),  
  MEDUC = c("lower", "lower", "HS", "HS", "Some college", "Some college", "Uni", "Uni") 
  ) |> 
  mutate(
    risks = "No", 
    MAGER = 0, 
    MRACEHISP = "NH White", 
    DMAR = "Married", 
    PRIORLIVE = "0", 
    PRIORDEAD = "0", 
    BMI = 0
    )

newdata |> 
  mutate(
    DBWT = predict(M.FULL_int.ctr, newdata)) |> select(CIG_REC, MEDUC, DBWT)

##   CIG_REC        MEDUC        DBWT
## 1      No        lower  0.06227469
## 2     Yes        lower  0.04447809
## 3      No           HS  0.08854779
## 4     Yes           HS -0.05014673
## 5      No Some college  0.08777361
## 6     Yes Some college -0.13236808
## 7      No          Uni  0.10674030
## 8     Yes          Uni  0.02404656

summary(M.FULL_int.ctr)

## 
## Call:
## lm(formula = DBWT ~ . + CIG_REC * MEDUC, data = natality.ctr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27704 -0.24697  0.02151  0.29953  1.98393 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.06227    0.05010   1.243  0.21402    
## CIG_RECYes                   -0.01780    0.09114  -0.195  0.84520    
## risksYes                     -0.12659    0.02664  -4.752 2.18e-06 ***
## MAGER                        -0.03616    0.02977  -1.215  0.22466    
## MRACEHISPNH Black            -0.23940    0.03476  -6.888 7.98e-12 ***
## MRACEHISPNH Other            -0.10225    0.04402  -2.323  0.02030 *  
## MRACEHISPHispanic            -0.05612    0.03260  -1.721  0.08539 .  
## DMARUnmarried                -0.15687    0.02915  -5.382 8.40e-08 ***
## MEDUCHS                       0.02627    0.04459   0.589  0.55580    
## MEDUCSome college             0.02550    0.04503   0.566  0.57128    
## MEDUCUni                      0.04447    0.04899   0.908  0.36418    
## PRIORLIVE1                    0.11737    0.02884   4.070 4.93e-05 ***
## PRIORLIVE2                    0.10388    0.03646   2.850  0.00443 ** 
## PRIORLIVE3                    0.12858    0.05261   2.444  0.01463 *  
## PRIORLIVE4+                   0.13510    0.05610   2.408  0.01615 *  
## PRIORDEAD1+                   0.22467    0.12811   1.754  0.07965 .  
## BMI                           0.10834    0.02394   4.526 6.45e-06 ***
## CIG_RECYes:MEDUCHS           -0.12090    0.11702  -1.033  0.30171    
## CIG_RECYes:MEDUCSome college -0.20235    0.12165  -1.663  0.09645 .  
## CIG_RECYes:MEDUCUni          -0.06490    0.19249  -0.337  0.73605    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4752 on 1668 degrees of freedom
## Multiple R-squared:  0.1068, Adjusted R-squared:  0.09665 
## F-statistic:  10.5 on 19 and 1668 DF,  p-value: < 2.2e-16

anova(M.FULL_int.ctr, M.FULL.ctr)

## Analysis of Variance Table
## 
## Model 1: DBWT ~ CIG_REC + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + 
##     PRIORDEAD + BMI + CIG_REC * MEDUC
## Model 2: DBWT ~ CIG_REC + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + 
##     PRIORDEAD + BMI
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1668 376.70                           
## 2   1671 377.34 -3  -0.64613 0.9537 0.4138

tribble(
  ~Education, ~No_Smoking, ~Smoking, 
  "Lower",    0.06227469,  0.04447809, 
  "HS",       0.08854779,  -0.05014673, 
  "Some College", 0.08777361, -0.13236808,
  "Uni", 0.10674030, 0.02404656
  ) |> ggplot() + 
  geom_segment(
    aes(
      x = "No Smoking", y = No_Smoking, 
      xend = "Smoking", yend = Smoking, 
      color = Education
      ),
    linewidth = 2
    ) + 
  geom_point(
    aes(x = "No Smoking", y = No_Smoking, color = Education), 
    size = 6
    ) + 
  geom_point(
    aes(x = "Smoking", y = Smoking, color = Education), 
    size = 6
    )

Answer: The Model with the interaction term is not significantly better than the one without. Although statistically insignificant, the effect of the interaction becomes obvious when we look at the dumbbell plot. Thus we conclude that while there might be an association between cigarette smoking in pregnancy and the mother’s education, including the interaction term in the regression model does not lead to statistically more explanatory power than the full model that does not include the interaction.

Exrecise 7

Question: Does the association between mother’s BMI and birth-weight differ between mothers of different race/ethnicity? Modify the OPTIMAL model appropriately to be able to answer this question, using the dataset with the centered variables. Regardless of statistical significance, estimate the BMI effect at each level of mother’s race/ethnicity and plot the BMI effect at each level of mother’s race/ethnicity using a dumbbell plot.

M.OPT_int.ctr <- lm(formula = DBWT ~ . + BMI * MRACEHISP, data = natality.ctr)

anova(M.OPT_int.ctr, M.OPT.ctr)

## Analysis of Variance Table
## 
## Model 1: DBWT ~ CIG_REC + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + 
##     PRIORDEAD + BMI + BMI * MRACEHISP
## Model 2: DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC + 
##     PRIORDEAD
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1668 376.67                           
## 2   1675 377.75 -7   -1.0822 0.6846 0.6852

newdata2 <- data.frame(
  BMI = c(0, 2, 0, 2, 0, 2, 0, 2),  
    MRACEHISP = c("NH White", "NH White", "NH Black", "NH Black", "NH Other","NH Other", "Hispanic", "Hispanic")
  ) |> 
  mutate(
    risks = "No", 
    MAGER = 0, 
    DMAR = "Married", 
    PRIORLIVE = "0", 
    PRIORDEAD = "0", 
    CIG_REC = "No"
    )

newdata2 |> 
  mutate(
    DBWT = predict(
      M.OPT_int.ctr, 
      newdata
      )
  ) |> select(BMI, MRACEHISP, DBWT)

##   BMI MRACEHISP        DBWT
## 1   0  NH White  0.08149527
## 2   2  NH White -0.05036478
## 3   0  NH Black  0.09048937
## 4   2  NH Black -0.04137068
## 5   0  NH Other  0.08182620
## 6   2  NH Other -0.05003385
## 7   0  Hispanic  0.11108875
## 8   2  Hispanic -0.02077130

tribble(
  ~Race, ~mean_bmi, ~bmi_sd, 
  "NH White",    0.08149527, -0.05036478, 
  "NH Black",       0.09048937, -0.04137068, 
  "NH Other",  0.08182620, -0.05003385, 
  "Hispanic",        0.11108875, -0.02077130                
  ) |> ggplot() +
  ylab("Birth weight (centred , scaled)") +
  xlab("") +
  geom_segment(
    aes(
      x = "mean(BMI)", y = mean_bmi, 
      xend = "mean(BMI) + sd (BMI)", yend = bmi_sd, 
      color = Race
      ),
    linewidth = 2
    ) + 
  geom_point(
    aes(x = "mean(BMI)", y = mean_bmi, color = Race), 
    size = 6
    ) + 
  geom_point(
    aes(x = "mean(BMI) + sd (BMI)", y = bmi_sd, color = Race), 
    size = 6
    )

Answer: The results from the ANOVA do not indicate a statistically significant difference in terms of the explanatory power of the two models. This means that we cannot conclude that including the interaction term leads to a better model. The dumbbell plots show the same pattern, in that the overall effect of BMI on birth weight does not differ for the different races.

Exercise 8

Question: Does the association between mother’s BMI and birthweight depend on mother’s age? Modify the adjusted model appropriately to be able to answer this question. Regardless of statistical significance, estimate the BMI effect for mothers age 20, 30, and 40 years and make a plot illustrating the BMI effect at each of these ages.

M.ADJUSTED_int.ctr <- lm(formula = DBWT ~ . + BMI * MAGER, data = natality.ctr)

anova(M.ADJUSTED_int.ctr, M.OPT.ctr)

## Analysis of Variance Table
## 
## Model 1: DBWT ~ CIG_REC + risks + MAGER + MRACEHISP + DMAR + MEDUC + PRIORLIVE + 
##     PRIORDEAD + BMI + BMI * MAGER
## Model 2: DBWT ~ DMAR + MRACEHISP + BMI + risks + PRIORLIVE + CIG_REC + 
##     PRIORDEAD
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1670 377.28                           
## 2   1675 377.75 -5  -0.46619 0.4127 0.8402

newdata3 <- data.frame(
  BMI = c(0, 2, 0, 2, 0, 2),  
    MAGER = c(20, 20, 30, 30, 40, 40)
  ) |> 
  mutate(
    risks = "No", 
    MRACEHISP = "NH White",
    MEDUC = "lower",
    DMAR = "Married", 
    PRIORLIVE = "0", 
    PRIORDEAD = "0", 
    CIG_REC = "No"
    )

newdata3 |> 
  mutate(
    DBWT = predict(
      M.ADJUSTED_int.ctr, 
      newdata3
      )
  ) |> select(BMI, MAGER, DBWT)

##   BMI MAGER       DBWT
## 1   0    20 -0.6575681
## 2   2    20 -1.4076355
## 3   0    30 -1.0279007
## 4   2    30 -2.2629360
## 5   0    40 -1.3982333
## 6   2    40 -3.1182366

#dumbbell plot
tribble(
  ~MAGER.ctr , ~mean_bmi, ~bmi_sd, 
  "Age 20", -0.6575681, -1.4076355, 
  "Age 30", -1.02790077, -2.2629360, 
  "Age 40", -1.3982333, -3.1182366, 
  ) |> ggplot() +
  ylab("Birth weight (centred , scaled)") +
  xlab("") +
  geom_segment(
    aes(
      x = "mean(BMI)", y = mean_bmi, 
      xend = "mean(BMI) + sd (BMI)", yend = bmi_sd, 
      color = MAGER.ctr
      ),
    linewidth = 2
    ) + 
  geom_point(
    aes(x = "mean(BMI)", y = mean_bmi, color = MAGER.ctr), 
    size = 6
    ) + 
  geom_point(
    aes(x = "mean(BMI) + sd (BMI)", y = bmi_sd, color = MAGER.ctr), 
    size = 6
    )

Answer: The results from the ANOVA do not indicate a statistically significant difference in terms of the explanatory power of the two models. This means that we cannot conclude that including the interaction term leads to a better model. The dumbbell plots however, do show somewhat different slopes for the age categories. This goes against the results of our ANOVA. We thus cannot conclude that the overall effect of BMI on birth weight differs for the different age categories enough to be statistically significant.

Lab 5: Multiple linear least squares regression - Group 2

Silke Hoekstra, Jesse Lindenhovius, Mengqian Zhao, Rina Sevcik

2023-11-10