**The following running R codes are mainly a combination of my edits and chaptgpt’s generated response. In some place I compared it with your codes published in datalab. Based on the output, my codes all worked, but they’re probably not the most efficient.
#if package not installed, install and also use pacman for ease
pacman::p_load(
tidyverse, easystats,
haven, curl,
janitor,
broom, broom.mixed,
ggeffects,
multcomp,
lmtest, sandwich,
lme4, lmerTest,
patchwork)
library(haven)
options(scipen=999)
# Load the dataset from STATA (.dta) file
data <- read_dta('D:/academic/UA/Spring 2025/BER 664/Fan, Jiani/Fan, Jiani/Homeworks/mlmus3/mlmus3/smoking.dta')
data_first <- subset(data, idx == 1) # Keep only first births
#^ kind of feel the above is more efficient than the following but more space for flexibility (%>%) in the below
## Filter only first children
#df1 <- df_raw %>%
# filter(idx %in% c(1))
#
#head(df1)
#OR download from url:
## Download the data from the URL to your local machine
#url <- "http://www.stata-press.com/data/mlmus3/smoking.dta"
#destfile <- "smoking.dta"
#curl::curl_download(url, destfile)
data_first$education <- with(data_first, ifelse(collgrad == 1, 3,
ifelse(somecoll == 1, 2,
ifelse(hsgrad == 1, 1, 0))))
#faster and no need for ifelse:
#df2 <- df1 %>%
# mutate(
# education = 1 * hsgrad + 2 * somecoll + 3 * collgrad
# )
#
#df2 %>%
# dplyr::select(hsgrad, somecoll, collgrad, education)
library(dplyr)
mean_sd_table <- data_first %>%
group_by(male, black, smoke, education) %>%
summarise(
mean_birwt = mean(birwt, na.rm = TRUE),
sd_birwt = sd(birwt, na.rm = TRUE),
.groups = "drop"
)
mean_sd_table
## # A tibble: 32 × 6
## male black smoke education mean_birwt sd_birwt
## <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl>
## 1 0 [Female] 0 [White] 0 [Nonsmoker] 0 3308. 456.
## 2 0 [Female] 0 [White] 0 [Nonsmoker] 1 3444. 473.
## 3 0 [Female] 0 [White] 0 [Nonsmoker] 2 3444. 468.
## 4 0 [Female] 0 [White] 0 [Nonsmoker] 3 3462. 483.
## 5 0 [Female] 0 [White] 1 [Smoker] 0 3075. 552.
## 6 0 [Female] 0 [White] 1 [Smoker] 1 3158. 506.
## 7 0 [Female] 0 [White] 1 [Smoker] 2 3266. 452.
## 8 0 [Female] 0 [White] 1 [Smoker] 3 3291. 491.
## 9 0 [Female] 1 [Black] 0 [Nonsmoker] 0 3147. 410.
## 10 0 [Female] 1 [Black] 0 [Nonsmoker] 1 3135. 530.
## # ℹ 22 more rows
#I like the idea of defining a factor structure here early and operates, makes things more clear. Mine skipped steps to show only subgroup analysis and did factor analysis below
## Assign a factor structure to education
#df3 <- data_first$education %>%
# dplyr::mutate(
# education_fac = factor(education,
# levels = c(0, 1, 2, 3),
# labels = c("Less than HS",
# "High School Grad",
# "Some College",
# "College Grad")),
# smoke_fac = factor(smoke,
# levels = c(0, 1),
# labels = c("Nonsmoker", "Smoker")),
# male_fac = factor(male,
# levels = c(0, 1),
# labels = c("Female", "Male")),
# black_fac = factor(black,
# levels = c(0, 1),
# labels = c("Nonblack", "Black"))
# )
#
#df3 %>%
# count(education_fac, education)
#
## Tabulation using janitor package
#df3 %>%
# janitor::tabyl(education_fac) %>%
# adorn_pct_formatting(digits = 1) %>%
# adorn_totals()
My observation: In general, smokers’ newborns tend to weigh less than non-smokers’ for mothers who are white regardless of education level. The white mother boy vs white mother girl groups do not seem to be too different from each other in terms of mean birthweight. However, the black mother boy vs black mother girl groups seem to be rather varied in birthweights, especially with black mother having some college degrees showing a weird pattern that newborn girls of non-smoker black mother tend to be heavier than newborn girls of smoker black mother, while it’s vise versa (but less of a mean difference) for new born boys of non-smoker vs smoker black mothers.Same thing happens in the opposite direction with black mothers graduated from college.
library(ggplot2)
# Convert grouping variables to factors for clarity in plotting
data_first$smoke <- factor(data_first$smoke, levels = c(0,1), labels = c("Non-Smoker","Smoker"))
data_first$male <- factor(data_first$male, levels = c(0,1), labels = c("Female","Male"))
data_first$black <- factor(data_first$black, levels = c(0,1), labels = c("White","Black"))
data_first$education <- factor(data_first$education,
levels = c(0,1,2,3),
labels = c("BelowHS","HSGrad","SomeColl","CollGrad"))
ggplot(data_first, aes(x = education, y = birwt, fill = smoke)) +
geom_boxplot(outlier.shape = NA) +
facet_grid(male ~ black) +
labs(x = "Education Level", y = "Birthweight (grams)", fill = "Smoking Status") +
theme_bw()
#a different box plot with slightly different grouping:
## Create a box plot with no outliers and limited y-axis range
#p <- df3 %>%
# ggplot(aes(y = birwt, group = education, fill = #education_fac)) +
# geom_boxplot(outlier.shape = NA) + # Remove outliers
# facet_grid(cols = vars(male_fac, smoke_fac, black_fac)) +
# ylim(1500, 5000) + # Limit y-axis range
# theme_bw()
#
#p
reg1 <- lm(birwt ~ smoke, data = data_first)
summary(reg1)
##
## Call:
## lm(formula = birwt ~ smoke, data = data_first)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2527.9 -302.9 9.1 321.1 1801.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3477.902 8.495 409.42 <0.0000000000000002 ***
## smokeSmoker -289.763 22.620 -12.81 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 496.6 on 3976 degrees of freedom
## Multiple R-squared: 0.03963, Adjusted R-squared: 0.03939
## F-statistic: 164.1 on 1 and 3976 DF, p-value: < 0.00000000000000022
reg2 <- lm(birwt ~ smoke + mage + male + black + hsgrad + somecoll + collgrad,
data = data_first)
summary(reg2)
##
## Call:
## lm(formula = birwt ~ smoke + mage + male + black + hsgrad + somecoll +
## collgrad, data = data_first)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2478.15 -303.91 5.54 316.14 1702.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3214.389 45.941 69.968 < 0.0000000000000002 ***
## smokeSmoker -239.292 23.758 -10.072 < 0.0000000000000002 ***
## mage 5.133 1.713 2.996 0.00276 **
## maleMale 102.218 15.463 6.610 0.0000000000434145 ***
## blackBlack -237.016 30.597 -7.746 0.0000000000000119 ***
## hsgrad 81.563 28.275 2.885 0.00394 **
## somecoll 92.746 30.559 3.035 0.00242 **
## collgrad 100.044 31.742 3.152 0.00163 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487.2 on 3970 degrees of freedom
## Multiple R-squared: 0.077, Adjusted R-squared: 0.07538
## F-statistic: 47.32 on 7 and 3970 DF, p-value: < 0.00000000000000022
#can also use easystats and broom packages, see datalab page
#to add comparison:
# Compare parameter estimates
compare_parameters(reg1, reg2) %>%
print()
## Parameter | reg1 | reg2
## ------------------------------------------------------------------------
## (Intercept) | 3477.90 (3461.25, 3494.56) | 3214.39 (3124.32, 3304.46)
## smoke [Smoker] | -289.76 (-334.11, -245.41) | -239.29 (-285.87, -192.71)
## mage | | 5.13 ( 1.77, 8.49)
## male [Male] | | 102.22 ( 71.90, 132.53)
## black [Black] | | -237.02 (-297.00, -177.03)
## hsgrad | | 81.56 ( 26.13, 137.00)
## somecoll | | 92.75 ( 32.83, 152.66)
## collgrad | | 100.04 ( 37.81, 162.28)
## ------------------------------------------------------------------------
## Observations | 3978 | 3978
# Compare performances
compare_performance(reg1, reg2) %>% print()
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2
## --------------------------------------------------------------------------
## reg1 | lm | 60681.6 (<.001) | 60681.6 (<.001) | 60700.4 (<.001) | 0.040
## reg2 | lm | 60535.7 (>.999) | 60535.8 (>.999) | 60592.3 (>.999) | 0.077
##
## Name | R2 (adj.) | RMSE | Sigma
## ------------------------------------
## reg1 | 0.039 | 496.436 | 496.561
## reg2 | 0.075 | 486.682 | 487.172
# Unadjusted mean difference
library(ggeffects) # Ensure ggeffects package is loaded for ggpredict
marginal_means <- ggpredict(reg1, term = c("smoke"))
marginal_means %>% print()
## # Predicted values of birwt
##
## smoke | Predicted | 95% CI
## -----------------------------------------
## Non-Smoker | 3477.90 | 3461.25, 3494.56
## Smoker | 3188.14 | 3147.04, 3229.24
# Adjusted mean difference
adjusted_means <- ggpredict(reg2, term = c("smoke"))
adjusted_means %>% print()
## # Predicted values of birwt
##
## smoke | Predicted | 95% CI
## -----------------------------------------
## Non-Smoker | 3436.19 | 3413.19, 3459.18
## Smoker | 3196.89 | 3151.45, 3242.34
##
## Adjusted for:
## * mage = 27.36
## * male = Female
## * black = White
## * hsgrad = 0.29
## * somecoll = 0.24
## * collgrad = 0.36
Difference: The newly added variables also explain a certain percentage of variance in Y (birth weight) thus we are seeing a change in the parameter of predictor smoke and criterion birth weight. As such, as also suggested in the data lab output, these newly added variables are control variable to consider when testing the relationship between mother smoking behavior and baby birth weight.
Answer: No it doesn’t (t = -1.193, df = 3969, p = .233 > .05). There is no statistically significant interaction effect of mother race and baby gender on baby birth weight.
output evidence: ## maleMale:blackBlack -71.638 60.055 -1.193 0.23299
reg3 <- lm(birwt ~ smoke + mage + male + black + hsgrad + somecoll + collgrad
+ male:black,
data = data_first)
summary(reg3)
##
## Call:
## lm(formula = birwt ~ smoke + mage + male + black + hsgrad + somecoll +
## collgrad + male:black, data = data_first)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2475.46 -304.88 6.18 316.27 1699.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3211.934 45.984 69.848 < 0.0000000000000002 ***
## smokeSmoker -239.026 23.758 -10.061 < 0.0000000000000002 ***
## mage 5.131 1.713 2.995 0.00276 **
## maleMale 107.309 16.041 6.690 0.0000000000255 ***
## blackBlack -202.243 42.259 -4.786 0.0000017652696 ***
## hsgrad 81.853 28.275 2.895 0.00381 **
## somecoll 92.227 30.560 3.018 0.00256 **
## collgrad 99.843 31.740 3.146 0.00167 **
## maleMale:blackBlack -71.638 60.055 -1.193 0.23299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487.1 on 3969 degrees of freedom
## Multiple R-squared: 0.07733, Adjusted R-squared: 0.07547
## F-statistic: 41.58 on 8 and 3969 DF, p-value: < 0.00000000000000022
#comparison:
compare_parameters(reg2, reg3) %>% print()
## Parameter | reg2 | reg3
## -------------------------------------------------------------------------------------
## (Intercept) | 3214.39 (3124.32, 3304.46) | 3211.93 (3121.78, 3302.09)
## smoke [Smoker] | -239.29 (-285.87, -192.71) | -239.03 (-285.61, -192.45)
## mage | 5.13 ( 1.77, 8.49) | 5.13 ( 1.77, 8.49)
## male [Male] | 102.22 ( 71.90, 132.53) | 107.31 ( 75.86, 138.76)
## black [Black] | -237.02 (-297.00, -177.03) | -202.24 (-285.10, -119.39)
## hsgrad | 81.56 ( 26.13, 137.00) | 81.85 ( 26.42, 137.29)
## somecoll | 92.75 ( 32.83, 152.66) | 92.23 ( 32.31, 152.14)
## collgrad | 100.04 ( 37.81, 162.28) | 99.84 ( 37.61, 162.07)
## male [Male] × black [Black] | | -71.64 (-189.38, 46.10)
## -------------------------------------------------------------------------------------
## Observations | 3978 | 3978
compare_performance(reg2, reg3) %>% print()
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2
## --------------------------------------------------------------------------
## reg2 | lm | 60535.7 (0.571) | 60535.8 (0.573) | 60592.3 (0.969) | 0.077
## reg3 | lm | 60536.3 (0.429) | 60536.3 (0.427) | 60599.2 (0.031) | 0.077
##
## Name | R2 (adj.) | RMSE | Sigma
## ------------------------------------
## reg2 | 0.075 | 486.682 | 487.172
## reg3 | 0.075 | 486.595 | 487.146