# Read the recitation dataset into the data frame CHR.
# The folder below is a path on one specific machine, so only switch to it
# when it actually exists; otherwise the CSV is read from the current working
# directory instead of the script aborting inside setwd().
recitation_dir <- "C:/Users/Chenh/OneDrive - Massachusetts Institute of Technology/000_2025_Fall/11.220/recitation 5"
if (dir.exists(recitation_dir)) {
  setwd(recitation_dir)
}
CHR <- read.csv("CHR_QR_2025.csv")
11.220 QR Recitation 5 Regression
1. Data Preparation
1.1 Read the dataset into RStudio
1.2 Check what variables we have in this dataset
# List the variable (column) names available in CHR.
names(CHR)
[1] "Name" "median_hhi"
[3] "food_env" "exercise_oppo"
[5] "adult_ob" "insufficient_sleep"
[7] "motor_vehicle_death" "social_association"
[9] "poor_mental_health_days" "rac_seg"
[11] "exercise_oppo_nominal"
1.3 Check what percentage of the data is missing for each variable
# Percentage of missing values per variable: is.na() flags every cell, and the
# column means of those flags (times 100) give the percent missing. The result
# is a one-column data frame, percent_missing, with one row per variable.
data.frame(percent_missing = colMeans(is.na(CHR)) * 100)
percent_missing
Name 0
median_hhi 0
food_env 0
exercise_oppo 0
adult_ob 0
insufficient_sleep 0
motor_vehicle_death 0
social_association 0
poor_mental_health_days 0
rac_seg 0
exercise_oppo_nominal 0
2. Regression
2.1 Regression with one continuous variable
2.1.1 Regress adult_ob on food_env
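Note: the order of the two variables in the regression does matter here! The variable to the left of ~ (adult_ob) is the outcome, and the variable to the right (food_env) is the predictor.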
# Fit a simple linear regression: adult_ob is the outcome (left of ~) and
# food_env is the single predictor.
model1 <- lm(formula = adult_ob ~ food_env, data = CHR)
# Render the regression table as HTML and also write it to
# model1_results.html; with notes.append = FALSE the custom note replaces
# stargazer's default note instead of being added to it.
stargazer(
  model1,
  type = "html",
  out = "model1_results.html",
  notes = "Significance: * p<0.1; ** p<0.05; *** p<0.01",
  notes.append = FALSE
)
| Dependent variable: | |
| adult_ob | |
| food_env | -0.029*** |
| (0.007) | |
| Constant | 0.497*** |
| (0.050) | |
| Observations | 51 |
| R2 | 0.268 |
| Adjusted R2 | 0.253 |
| Residual Std. Error | 0.031 (df = 49) |
| F Statistic | 17.944*** (df = 1; 49) |
| Note: | Significance: * p<0.1; ** p<0.05; *** p<0.01 |
2.1.2 Visualize the regression
# Make sure ggplot2 is installed (installing it on first use), then attach it.
# requireNamespace() only checks availability, so library() is called exactly
# once and stops with an error if the installation did not succeed.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
Loading required package: ggplot2
# Scatter plot of adult_ob against food_env, overlaid with the fitted linear
# regression line (orange, no confidence band).
ggplot(data = CHR, mapping = aes(x = food_env, y = adult_ob)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(
    title = "Food Environment and Obesity",
    x = "food_env",
    y = "adult_ob"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
2.1.3 Exercise: How would you regress rac_seg on food_env and visualize the regression?
# type your code here
2.2 Regression with one continuous variable + one categorical/grouping variable
2.2.1 Regress adult_ob on food_env and exercise_oppo_nominal
# Extend the food_env model with the categorical exercise_oppo_nominal
# variable as a second predictor of adult_ob.
model2 <- lm(adult_ob ~ food_env + exercise_oppo_nominal, data = CHR)
# Render the table as HTML and save it under a model-specific file name;
# reusing "model1_results.html" here would overwrite the model1 table.
stargazer(
  model2,
  type = "html",
  out = "model2_results.html",
  notes = c("Significance: * p<0.1; ** p<0.05; *** p<0.01"),
  notes.append = FALSE
)
| Dependent variable: | |
| adult_ob | |
| food_env | -0.017** |
| (0.007) | |
| exercise_oppo_nominallow | 0.028*** |
| (0.009) | |
| Constant | 0.396*** |
| (0.058) | |
| Observations | 51 |
| R2 | 0.382 |
| Adjusted R2 | 0.356 |
| Residual Std. Error | 0.029 (df = 48) |
| F Statistic | 14.826*** (df = 2; 48) |
| Note: | Significance: * p<0.1; ** p<0.05; *** p<0.01 |
2.2.2 Visualize the regression in a 2-D plot
2.2.3 Visualize the regression in a “3-D” plot
2.3 Regression with two continuous variables
2.3.1 Regress adult_ob on food_env and exercise_oppo
# Regress adult_ob on two continuous predictors: food_env and exercise_oppo.
model3 <- lm(adult_ob ~ food_env + exercise_oppo, data = CHR)
# Render the table as HTML and save it under a model-specific file name;
# reusing "model1_results.html" here would overwrite the model1 table.
stargazer(
  model3,
  type = "html",
  out = "model3_results.html",
  notes = c("Significance: * p<0.1; ** p<0.05; *** p<0.01"),
  notes.append = FALSE
)
| Dependent variable: | |
| adult_ob | |
| food_env | -0.010 |
| (0.007) | |
| exercise_oppo | -0.192*** |
| (0.040) | |
| Constant | 0.513*** |
| (0.042) | |
| Observations | 51 |
| R2 | 0.503 |
| Adjusted R2 | 0.482 |
| Residual Std. Error | 0.026 (df = 48) |
| F Statistic | 24.279*** (df = 2; 48) |
| Note: | Significance: * p<0.1; ** p<0.05; *** p<0.01 |
2.3.2 Visualize the regression in a 3-D plane
2.4 Compare and contrast regression models
# Put the three fitted models side by side in a single HTML table; the table
# is also written to models_1_3_results.html in the current working directory.
stargazer(model1, model2, model3,
          type = "html",
          out = "models_1_3_results.html",
          notes = "Significance: * p < 0.1; ** p < 0.05; *** p < 0.01",
          notes.append = FALSE)
| Dependent variable: | |||
| adult_ob | |||
| (1) | (2) | (3) | |
| food_env | -0.029*** | -0.017** | -0.010 |
| (0.007) | (0.007) | (0.007) | |
| exercise_oppo_nominallow | 0.028*** | ||
| (0.009) | |||
| exercise_oppo | -0.192*** | ||
| (0.040) | |||
| Constant | 0.497*** | 0.396*** | 0.513*** |
| (0.050) | (0.058) | (0.042) | |
| Observations | 51 | 51 | 51 |
| R2 | 0.268 | 0.382 | 0.503 |
| Adjusted R2 | 0.253 | 0.356 | 0.482 |
| Residual Std. Error | 0.031 (df = 49) | 0.029 (df = 48) | 0.026 (df = 48) |
| F Statistic | 17.944*** (df = 1; 49) | 14.826*** (df = 2; 48) | 24.279*** (df = 2; 48) |
| Note: | Significance: * p < 0.1; ** p < 0.05; *** p < 0.01 | ||