## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
GOAL: is to see how sugar intake is related to diet and demographic variables, and to understand the meaning of a multiple linear regression model
GOAL:Getting a sense of how sugar density behaves and how it relates to several continuing predictors.
#Histogram of sugar density
hist(
dat_core$sugar_den,
main = "Histogram of Sugar Density",
xlab = "Sugar Density"
)
#Selecting continuous variables and remove missing values
cont_vars=dat_core %>%
select(
sugar_den,
fiber_den,
satfat_den,
sodium_den,
age,
INDFMPIR,
DR1TKCAL
) %>%
na.omit()
#Correlation between sugar density and predictors
cor_sugar=cor(cont_vars$sugar_den,
cont_vars %>% select(-sugar_den))
cor_sugar
## fiber_den satfat_den sodium_den age INDFMPIR DR1TKCAL
## [1,] -0.06124463 -0.2634305 -0.1794895 0.008018612 -0.1170348 -0.04280339
#Correlation matrix from predictors
cor_predictors=cor(cont_vars %>% select(-sugar_den))
round(cor_predictors, 3)
## fiber_den satfat_den sodium_den age INDFMPIR DR1TKCAL
## fiber_den 1.000 -0.265 0.006 0.114 0.070 -0.173
## satfat_den -0.265 1.000 -0.001 0.002 0.034 0.152
## sodium_den 0.006 -0.001 1.000 -0.019 0.036 -0.084
## age 0.114 0.002 -0.019 1.000 0.056 -0.145
## INDFMPIR 0.070 0.034 0.036 0.056 1.000 0.018
## DR1TKCAL -0.173 0.152 -0.084 -0.145 0.018 1.000
#Conditional plot of sugar density vs saturated fat by income level
coplot(
sugar_den ~ satfat_den | INDFMPIR,
data = dat_core,
xlab = "Saturated Fat Density",
ylab = "Sugar Density"
)
##
## Missing rows: 27, 30, 31, 52, 65, 66, 68, 75, 91, 92, 93, 99, 103, 106, 114, 115, 118, 121, 126, 167, 176, 185, 213, 216, 225, 228, 232, 234, 236, 255, 261, 270, 272, 280, 283, 302, 309, 322, 332, 345, 369, 375, 379, 385, 405, 409, 410, 414, 431, 447, 453, 456, 465, 480, 511, 526, 537, 540, 547, 579, 585, 590, 591, 600, 620, 627, 629, 656, 663, 673, 676, 687, 692, 697, 701, 702, 711, 715, 719, 728, 740, 743, 744, 752, 758, 759, 770, 781, 791, 810, 822, 829, 838, 844, 846, 854, 866, 868, 872, 874, 882, 891, 904, 941, 964, 970, 972, 973, 989, 1012, 1014, 1018, 1048, 1049, 1055, 1060, 1066, 1074, 1108, 1121, 1124, 1129, 1139, 1163, 1178, 1180, 1184, 1191, 1193, 1195, 1206, 1225, 1230, 1237, 1243, 1246, 1247, 1249, 1251, 1253, 1262, 1265, 1266, 1283, 1323, 1327, 1366, 1379, 1380, 1381, 1383, 1388, 1393, 1420, 1421, 1425, 1437, 1447, 1453, 1455, 1456, 1459, 1462, 1468, 1477, 1479, 1495, 1502, 1504, 1516, 1538, 1541, 1544, 1557, 1579, 1587, 1593, 1595, 1596, 1603, 1604, 1605, 1626, 1628, 1638, 1639, 1649, 1653, 1661, 1682, 1685, 1691, 1717, 1718, 1729, 1744, 1754, 1767, 1772, 1790, 1794, 1799, 1813, 1836, 1849, 1850, 1853, 1868, 1872, 1875, 1898, 1912, 1913, 1916, 1922, 1926, 1935, 1942, 1943, 1946, 1956, 1958, 1960, 1971, 1980, 1996, 2018, 2028, 2040, 2048, 2053, 2063, 2070, 2076, 2097, 2106, 2108, 2110, 2116, 2119, 2120, 2130, 2135, 2136, 2137, 2138, 2139, 2143, 2144, 2149, 2151, 2168, 2175, 2190, 2192, 2204, 2207, 2224, 2233, 2235, 2244, 2249, 2250, 2252, 2256, 2265, 2269, 2271, 2274, 2282, 2286, 2300, 2308, 2313, 2317, 2326, 2327, 2331, 2336, 2337, 2338, 2350, 2353, 2357, 2377, 2381, 2387, 2388, 2401, 2405, 2417, 2424, 2429, 2432, 2447, 2453, 2468, 2480, 2488, 2489, 2492, 2496, 2497, 2501, 2508, 2514, 2518, 2524, 2527, 2530, 2535, 2548, 2556, 2557, 2567, 2576, 2579, 2593, 2597, 2600, 2606, 2626, 2634, 2645, 2646, 2648, 2658, 2662, 2677, 2684, 2686, 2693, 2715, 2723, 2729, 2732, 2744, 2755, 2757, 2762, 2780, 2781, 2789, 2794, 2798, 2800, 2804, 2809, 2811, 2818, 2827, 2837, 2860, 2861, 2867, 2880, 2897, 2900, 2904, 2907, 2908, 2913, 2916, 2918, 2924, 2940, 2946, 2966, 2971, 2977, 2982, 2984, 2998, 3000, 3009, 3020, 3027, 3031, 3032, 3034, 3047, 3049, 3056, 3074, 3111, 3112, 3143, 3146, 3164, 3166, 3179, 3213, 3214, 3220, 3222, 3241, 3250, 3257, 3262, 3278, 3294, 3304, 3305, 3318, 3341, 3343, 3347, 3349, 3369, 3370, 3372, 3383, 3386, 3388, 3395, 3419, 3424, 3443, 3444, 3452, 3455, 3460, 3467, 3473, 3479, 3484, 3486, 3487, 3488, 3490, 3496, 3500, 3519, 3535, 3547, 3550, 3552, 3563, 3573, 3595, 3598, 3599, 3601, 3617, 3653, 3664, 3668, 3670, 3692, 3697, 3705, 3712, 3732, 3763, 3774, 3775, 3782, 3790, 3791, 3794, 3809, 3812, 3815, 3818, 3819, 3820, 3842, 3845, 3846, 3848, 3880, 3886, 3910, 3912, 3915, 3925, 3927, 3945, 3957, 3962, 3968, 3970, 3974, 3978, 3988, 3990, 4003, 4008, 4026, 4029, 4031, 4040, 4092, 4093, 4100, 4103, 4112, 4146, 4167, 4168, 4169, 4178, 4191, 4193, 4195, 4216, 4222, 4229, 4246, 4260, 4261, 4277, 4278, 4280, 4303, 4307, 4312, 4324, 4333, 4347, 4359, 4363, 4372, 4383, 4415, 4462, 4463, 4470, 4477, 4484, 4486, 4491, 4493, 4501, 4512, 4530, 4540, 4574, 4579, 4591, 4607, 4615, 4622, 4623, 4638, 4640, 4648, 4658, 4665, 4667, 4669, 4675, 4681, 4689
title("Sugar Density vs. Saturated Fat Intake by Income Level")
GOAL: Use a multiple linear regression model to describe how your Sugar Density outcome changes with several predictors at once
#Multiple linear regression model
model_full=lm(
sugar_den ~ fiber_den + satfat_den + sodium_den +
age + sex + race + INDFMPIR + DR1TKCAL,
data = dat_core
)
model_full
##
## Call:
## lm(formula = sugar_den ~ fiber_den + satfat_den + sodium_den +
## age + sex + race + INDFMPIR + DR1TKCAL, data = dat_core)
##
## Coefficients:
## (Intercept)
## 84.0471173
## fiber_den
## -0.6429814
## satfat_den
## -1.7679756
## sodium_den
## -0.0040056
## age
## 0.0188055
## sexFemale
## 3.8716178
## raceOther Hispanic
## 1.7090347
## raceNon-Hispanic White
## 4.0922383
## raceNon-Hispanic Black
## 3.2310434
## raceNon-Hispanic Asian
## -4.5609113
## raceOther Race - Including Multi-Racial
## 4.2503907
## INDFMPIR
## -1.2447684
## DR1TKCAL
## -0.0002689
#Display of regression summary
summary(model_full)
##
## Call:
## lm(formula = sugar_den ~ fiber_den + satfat_den + sodium_den +
## age + sex + race + INDFMPIR + DR1TKCAL, data = dat_core)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.21 -16.28 -1.45 12.79 243.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.0471173 2.4796427 33.895 < 2e-16
## fiber_den -0.6429814 0.0875465 -7.344 2.48e-13
## satfat_den -1.7679756 0.0875072 -20.204 < 2e-16
## sodium_den -0.0040056 0.0003418 -11.718 < 2e-16
## age 0.0188055 0.0225105 0.835 0.40354
## sexFemale 3.8716178 0.7973600 4.856 1.25e-06
## raceOther Hispanic 1.7090347 1.6700095 1.023 0.30619
## raceNon-Hispanic White 4.0922383 1.2747591 3.210 0.00134
## raceNon-Hispanic Black 3.2310434 1.3564942 2.382 0.01727
## raceNon-Hispanic Asian -4.5609113 1.5264328 -2.988 0.00283
## raceOther Race - Including Multi-Racial 4.2503907 1.9899648 2.136 0.03275
## INDFMPIR -1.2447684 0.2430188 -5.122 3.16e-07
## DR1TKCAL -0.0002689 0.0003962 -0.679 0.49734
##
## (Intercept) ***
## fiber_den ***
## satfat_den ***
## sodium_den ***
## age
## sexFemale ***
## raceOther Hispanic
## raceNon-Hispanic White **
## raceNon-Hispanic Black *
## raceNon-Hispanic Asian **
## raceOther Race - Including Multi-Racial *
## INDFMPIR ***
## DR1TKCAL
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.31 on 4124 degrees of freedom
## (554 observations deleted due to missingness)
## Multiple R-squared: 0.1454, Adjusted R-squared: 0.1429
## F-statistic: 58.49 on 12 and 4124 DF, p-value: < 2.2e-16
NOTE: In write Up
#Extract components used by the model
yhat=fitted(model_full)
res=residuals(model_full)
y=yhat + res
#Compute SSE and SST
SSE=sum(res^2)
SST=sum((y - mean(y))^2)
#R-squared
R2=1 - SSE / SST
R2
## [1] 0.1454357
#Adjusted R-squared
n=nrow(dat_core)
p=length(coefficients(model_full))
Adjusted_R2= 1 - ((n-1)/(n-p)) * (SSE/SST)
Adjusted_R2
## [1] 0.1432435
# Compute residual standard error
RSE=sqrt(mean(residuals(model_full)^2))
RSE
## [1] 24.26785
#Create dataframe for residual plot
library(ggplot2)
df_res=data.frame(
fitted = fitted(model_full),
residuals = residuals(model_full)
)
#Plot of residuals vs fitted values
ggplot(df_res, aes(x = fitted, y = residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = "pink") +
labs(
title = "Residuals vs Fitted",
x = "Fitted Values",
y = "Residuals"
)
Goal: we treat obesity as a yes/no outcome and use logistic regression to study how obesity is related to sugar measures and other predictors. We will look both at the size and direction of associations and at how well the model classifies individuals as obese or not.
GOAL: Compute the proportion of participants who are obese
(obese = 1) in your analysis dataset.
Express this as a proportion and as a percentage. This gives a baseline:
if we always guessed “not
obese,” how often would we be wrong?
#Remove non obesity values
NUMBER_OF_OBESITY=dat_core %>%
filter(!is.na(obese))
## Percentage Obese
PORPORTION_OBESE=mean(NUMBER_OF_OBESITY$obese == 1)
PORPORTION_OBESE
## [1] 0.425922
#Percentage Obses
PERCENT_OBESE=PORPORTION_OBESE * 100
PERCENT_OBESE
## [1] 42.5922
GOAL: Fit a logistic regression model and interpret the main coefficients in terms of odds and log-odds
Fit a logistic regression model with obese as the target and the following predictors: your chosen sugar outcome, fiber_den, satfat_den, sodium_den, age, sex, race, INDFMPIR, and DR1TKCAL.
#Fit logistic regression model for obesity
logit_model=glm(
obese ~ sugar_den + fiber_den + satfat_den + sodium_den +
age + sex + race + INDFMPIR + DR1TKCAL,
data = dat_core,
family = "binomial"
)
#Logistic Regression Output
summary(logit_model)
##
## Call:
## glm(formula = obese ~ sugar_den + fiber_den + satfat_den + sodium_den +
## age + sex + race + INDFMPIR + DR1TKCAL, family = "binomial",
## data = dat_core)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.191e-01 2.690e-01 -1.186 0.235465
## sugar_den 3.799e-04 1.404e-03 0.271 0.786766
## fiber_den -4.123e-02 8.023e-03 -5.138 2.77e-07
## satfat_den 1.320e-02 7.880e-03 1.675 0.093858
## sodium_den 2.989e-04 6.315e-05 4.733 2.21e-06
## age -1.108e-03 1.931e-03 -0.574 0.566017
## sexFemale 2.537e-01 6.913e-02 3.669 0.000243
## raceOther Hispanic -3.643e-01 1.402e-01 -2.598 0.009372
## raceNon-Hispanic White -3.843e-01 1.068e-01 -3.597 0.000322
## raceNon-Hispanic Black -1.244e-01 1.133e-01 -1.098 0.272248
## raceNon-Hispanic Asian -1.889e+00 1.554e-01 -12.151 < 2e-16
## raceOther Race - Including Multi-Racial -2.162e-01 1.657e-01 -1.305 0.192042
## INDFMPIR 1.328e-02 2.109e-02 0.630 0.528974
## DR1TKCAL -9.031e-06 3.385e-05 -0.267 0.789610
##
## (Intercept)
## sugar_den
## fiber_den ***
## satfat_den .
## sodium_den ***
## age
## sexFemale ***
## raceOther Hispanic **
## raceNon-Hispanic White ***
## raceNon-Hispanic Black
## raceNon-Hispanic Asian ***
## raceOther Race - Including Multi-Racial
## INDFMPIR
## DR1TKCAL
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5643.9 on 4136 degrees of freedom
## Residual deviance: 5320.4 on 4123 degrees of freedom
## (554 observations deleted due to missingness)
## AIC: 5348.4
##
## Number of Fisher Scoring iterations: 4