Diet & Health Analysis with Health & Nutrition Survey Data

Final Project:

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

PART A: LINEAR REGRESSION WITH A SUGAR OUTCOME:

GOAL: is to see how sugar intake is related to diet and demographic variables, and to understand the meaning of a multiple linear regression model

PART A:1 Exploring Your Sugar Target and its Relationship

GOAL:Getting a sense of how sugar density behaves and how it relates to several continuing predictors.

Part A:1:A: Distribution of Sugar Density

#Histogram of sugar density
hist(
  dat_core$sugar_den,
  main = "Histogram of Sugar Density",
  xlab = "Sugar Density"
)

Part A:1:B: Correlations with continuous Predictors

#Selecting continuous variables and remove missing values

cont_vars=dat_core %>%
  select(
    sugar_den,      
    fiber_den,
    satfat_den,
    sodium_den,
    age,
    INDFMPIR,
    DR1TKCAL
  ) %>%
  na.omit()

#Correlation between sugar density and predictors
cor_sugar=cor(cont_vars$sugar_den, 
                 cont_vars %>% select(-sugar_den))

cor_sugar
##        fiber_den satfat_den sodium_den         age   INDFMPIR    DR1TKCAL
## [1,] -0.06124463 -0.2634305 -0.1794895 0.008018612 -0.1170348 -0.04280339
#Correlation matrix from predictors
cor_predictors=cor(cont_vars %>% select(-sugar_den))

round(cor_predictors, 3)
##            fiber_den satfat_den sodium_den    age INDFMPIR DR1TKCAL
## fiber_den      1.000     -0.265      0.006  0.114    0.070   -0.173
## satfat_den    -0.265      1.000     -0.001  0.002    0.034    0.152
## sodium_den     0.006     -0.001      1.000 -0.019    0.036   -0.084
## age            0.114      0.002     -0.019  1.000    0.056   -0.145
## INDFMPIR       0.070      0.034      0.036  0.056    1.000    0.018
## DR1TKCAL      -0.173      0.152     -0.084 -0.145    0.018    1.000

Part A:1:C: Coplot: Sugar Versus one Predictor within Groups

#Conditional plot of sugar density vs saturated fat by income level
coplot(
  sugar_den ~ satfat_den | INDFMPIR,
  data = dat_core,
  xlab = "Saturated Fat Density",
  ylab = "Sugar Density"
)
## 
##  Missing rows: 27, 30, 31, 52, 65, 66, 68, 75, 91, 92, 93, 99, 103, 106, 114, 115, 118, 121, 126, 167, 176, 185, 213, 216, 225, 228, 232, 234, 236, 255, 261, 270, 272, 280, 283, 302, 309, 322, 332, 345, 369, 375, 379, 385, 405, 409, 410, 414, 431, 447, 453, 456, 465, 480, 511, 526, 537, 540, 547, 579, 585, 590, 591, 600, 620, 627, 629, 656, 663, 673, 676, 687, 692, 697, 701, 702, 711, 715, 719, 728, 740, 743, 744, 752, 758, 759, 770, 781, 791, 810, 822, 829, 838, 844, 846, 854, 866, 868, 872, 874, 882, 891, 904, 941, 964, 970, 972, 973, 989, 1012, 1014, 1018, 1048, 1049, 1055, 1060, 1066, 1074, 1108, 1121, 1124, 1129, 1139, 1163, 1178, 1180, 1184, 1191, 1193, 1195, 1206, 1225, 1230, 1237, 1243, 1246, 1247, 1249, 1251, 1253, 1262, 1265, 1266, 1283, 1323, 1327, 1366, 1379, 1380, 1381, 1383, 1388, 1393, 1420, 1421, 1425, 1437, 1447, 1453, 1455, 1456, 1459, 1462, 1468, 1477, 1479, 1495, 1502, 1504, 1516, 1538, 1541, 1544, 1557, 1579, 1587, 1593, 1595, 1596, 1603, 1604, 1605, 1626, 1628, 1638, 1639, 1649, 1653, 1661, 1682, 1685, 1691, 1717, 1718, 1729, 1744, 1754, 1767, 1772, 1790, 1794, 1799, 1813, 1836, 1849, 1850, 1853, 1868, 1872, 1875, 1898, 1912, 1913, 1916, 1922, 1926, 1935, 1942, 1943, 1946, 1956, 1958, 1960, 1971, 1980, 1996, 2018, 2028, 2040, 2048, 2053, 2063, 2070, 2076, 2097, 2106, 2108, 2110, 2116, 2119, 2120, 2130, 2135, 2136, 2137, 2138, 2139, 2143, 2144, 2149, 2151, 2168, 2175, 2190, 2192, 2204, 2207, 2224, 2233, 2235, 2244, 2249, 2250, 2252, 2256, 2265, 2269, 2271, 2274, 2282, 2286, 2300, 2308, 2313, 2317, 2326, 2327, 2331, 2336, 2337, 2338, 2350, 2353, 2357, 2377, 2381, 2387, 2388, 2401, 2405, 2417, 2424, 2429, 2432, 2447, 2453, 2468, 2480, 2488, 2489, 2492, 2496, 2497, 2501, 2508, 2514, 2518, 2524, 2527, 2530, 2535, 2548, 2556, 2557, 2567, 2576, 2579, 2593, 2597, 2600, 2606, 2626, 2634, 2645, 2646, 2648, 2658, 2662, 2677, 2684, 2686, 2693, 2715, 2723, 2729, 2732, 2744, 2755, 2757, 2762, 2780, 2781, 2789, 2794, 2798, 2800, 2804, 2809, 2811, 2818, 2827, 2837, 2860, 2861, 2867, 2880, 2897, 2900, 2904, 2907, 2908, 2913, 2916, 2918, 2924, 2940, 2946, 2966, 2971, 2977, 2982, 2984, 2998, 3000, 3009, 3020, 3027, 3031, 3032, 3034, 3047, 3049, 3056, 3074, 3111, 3112, 3143, 3146, 3164, 3166, 3179, 3213, 3214, 3220, 3222, 3241, 3250, 3257, 3262, 3278, 3294, 3304, 3305, 3318, 3341, 3343, 3347, 3349, 3369, 3370, 3372, 3383, 3386, 3388, 3395, 3419, 3424, 3443, 3444, 3452, 3455, 3460, 3467, 3473, 3479, 3484, 3486, 3487, 3488, 3490, 3496, 3500, 3519, 3535, 3547, 3550, 3552, 3563, 3573, 3595, 3598, 3599, 3601, 3617, 3653, 3664, 3668, 3670, 3692, 3697, 3705, 3712, 3732, 3763, 3774, 3775, 3782, 3790, 3791, 3794, 3809, 3812, 3815, 3818, 3819, 3820, 3842, 3845, 3846, 3848, 3880, 3886, 3910, 3912, 3915, 3925, 3927, 3945, 3957, 3962, 3968, 3970, 3974, 3978, 3988, 3990, 4003, 4008, 4026, 4029, 4031, 4040, 4092, 4093, 4100, 4103, 4112, 4146, 4167, 4168, 4169, 4178, 4191, 4193, 4195, 4216, 4222, 4229, 4246, 4260, 4261, 4277, 4278, 4280, 4303, 4307, 4312, 4324, 4333, 4347, 4359, 4363, 4372, 4383, 4415, 4462, 4463, 4470, 4477, 4484, 4486, 4491, 4493, 4501, 4512, 4530, 4540, 4574, 4579, 4591, 4607, 4615, 4622, 4623, 4638, 4640, 4648, 4658, 4665, 4667, 4669, 4675, 4681, 4689
title("Sugar Density vs. Saturated Fat Intake by Income Level")

Part A:2: FITTING MULTIPLE LINEAR REGRESSION MODEL:

Part A:2:A:Coplot: Sugar Versus one Predictor within Groups:

GOAL: Use a multiple linear regression model to describe how your Sugar Density outcome changes with several predictors at once

#Multiple linear regression model
model_full=lm(
  sugar_den ~ fiber_den + satfat_den + sodium_den + 
    age + sex + race + INDFMPIR + DR1TKCAL,
  data = dat_core
)
model_full
## 
## Call:
## lm(formula = sugar_den ~ fiber_den + satfat_den + sodium_den + 
##     age + sex + race + INDFMPIR + DR1TKCAL, data = dat_core)
## 
## Coefficients:
##                             (Intercept)  
##                              84.0471173  
##                               fiber_den  
##                              -0.6429814  
##                              satfat_den  
##                              -1.7679756  
##                              sodium_den  
##                              -0.0040056  
##                                     age  
##                               0.0188055  
##                               sexFemale  
##                               3.8716178  
##                      raceOther Hispanic  
##                               1.7090347  
##                  raceNon-Hispanic White  
##                               4.0922383  
##                  raceNon-Hispanic Black  
##                               3.2310434  
##                  raceNon-Hispanic Asian  
##                              -4.5609113  
## raceOther Race - Including Multi-Racial  
##                               4.2503907  
##                                INDFMPIR  
##                              -1.2447684  
##                                DR1TKCAL  
##                              -0.0002689

Part A:2:B: The regression Output

#Display of regression summary
summary(model_full)
## 
## Call:
## lm(formula = sugar_den ~ fiber_den + satfat_den + sodium_den + 
##     age + sex + race + INDFMPIR + DR1TKCAL, data = dat_core)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90.21 -16.28  -1.45  12.79 243.85 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             84.0471173  2.4796427  33.895  < 2e-16
## fiber_den                               -0.6429814  0.0875465  -7.344 2.48e-13
## satfat_den                              -1.7679756  0.0875072 -20.204  < 2e-16
## sodium_den                              -0.0040056  0.0003418 -11.718  < 2e-16
## age                                      0.0188055  0.0225105   0.835  0.40354
## sexFemale                                3.8716178  0.7973600   4.856 1.25e-06
## raceOther Hispanic                       1.7090347  1.6700095   1.023  0.30619
## raceNon-Hispanic White                   4.0922383  1.2747591   3.210  0.00134
## raceNon-Hispanic Black                   3.2310434  1.3564942   2.382  0.01727
## raceNon-Hispanic Asian                  -4.5609113  1.5264328  -2.988  0.00283
## raceOther Race - Including Multi-Racial  4.2503907  1.9899648   2.136  0.03275
## INDFMPIR                                -1.2447684  0.2430188  -5.122 3.16e-07
## DR1TKCAL                                -0.0002689  0.0003962  -0.679  0.49734
##                                            
## (Intercept)                             ***
## fiber_den                               ***
## satfat_den                              ***
## sodium_den                              ***
## age                                        
## sexFemale                               ***
## raceOther Hispanic                         
## raceNon-Hispanic White                  ** 
## raceNon-Hispanic Black                  *  
## raceNon-Hispanic Asian                  ** 
## raceOther Race - Including Multi-Racial *  
## INDFMPIR                                ***
## DR1TKCAL                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.31 on 4124 degrees of freedom
##   (554 observations deleted due to missingness)
## Multiple R-squared:  0.1454, Adjusted R-squared:  0.1429 
## F-statistic: 58.49 on 12 and 4124 DF,  p-value: < 2.2e-16

Part A:2:C:Linking Back To Correlations and Coplots

NOTE: In write Up

Part A:3: HOW WELL DOES THE LINEAR MODEL FIT?:

Part A:3:A R² and adjusted R²

#Extract components used by the model
yhat=fitted(model_full)
res=residuals(model_full)
y=yhat + res  

#Compute SSE and SST
SSE=sum(res^2)
SST=sum((y - mean(y))^2)

#R-squared
R2=1 - SSE / SST
R2
## [1] 0.1454357
#Adjusted R-squared
n=nrow(dat_core)
p=length(coefficients(model_full))  

Adjusted_R2= 1 - ((n-1)/(n-p)) * (SSE/SST)
Adjusted_R2
## [1] 0.1432435

Part A:3:B Residual Standard Error (RSE)

# Compute residual standard error
RSE=sqrt(mean(residuals(model_full)^2)) 
RSE
## [1] 24.26785

Part A:3:C Residuals versus fitted values.

#Create dataframe for residual plot
library(ggplot2)

df_res=data.frame(
  fitted = fitted(model_full),
  residuals = residuals(model_full)
)
#Plot of residuals vs fitted values
ggplot(df_res, aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "pink") +
  labs(
    title = "Residuals vs Fitted",
    x = "Fitted Values",
    y = "Residuals"
  )

Part B: LOGISTIC REGRESSION WITH OBESITY:

Goal: we treat obesity as a yes/no outcome and use logistic regression to study how obesity is related to sugar measures and other predictors. We will look both at the size and direction of associations and at how well the model classifies individuals as obese or not.

Part B:1: Getting To Know The Obesity Outcome

GOAL: Understand how common obesity is in this sample and how it relates to your sugar outcome.

Part B:1:A. Overall prevalence of obesity

GOAL: Compute the proportion of participants who are obese (obese = 1) in your analysis dataset.
Express this as a proportion and as a percentage. This gives a baseline: if we always guessed “not
obese,” how often would we be wrong?

#Remove non obesity values
NUMBER_OF_OBESITY=dat_core %>% 
  filter(!is.na(obese))
## Percentage Obese
PORPORTION_OBESE=mean(NUMBER_OF_OBESITY$obese == 1)
PORPORTION_OBESE
## [1] 0.425922
#Percentage Obses
PERCENT_OBESE=PORPORTION_OBESE * 100
PERCENT_OBESE
## [1] 42.5922

B2: FITTING A LOGISTIC REGRESSION FOR OBESITY:

GOAL: Fit a logistic regression model and interpret the main coefficients in terms of odds and log-odds

Part B:2:A. Fitting a logistic regression model for obesity

Fit a logistic regression model with obese as the target and the following predictors: your chosen sugar outcome, fiber_den, satfat_den, sodium_den, age, sex, race, INDFMPIR, and DR1TKCAL.

#Fit logistic regression model for obesity
logit_model=glm(
  obese ~ sugar_den + fiber_den + satfat_den + sodium_den +
    age + sex + race + INDFMPIR + DR1TKCAL,
  data = dat_core,
  family = "binomial"
)

Part B:2:B Logistic Regression Output

#Logistic Regression Output

summary(logit_model)
## 
## Call:
## glm(formula = obese ~ sugar_den + fiber_den + satfat_den + sodium_den + 
##     age + sex + race + INDFMPIR + DR1TKCAL, family = "binomial", 
##     data = dat_core)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -3.191e-01  2.690e-01  -1.186 0.235465
## sugar_den                                3.799e-04  1.404e-03   0.271 0.786766
## fiber_den                               -4.123e-02  8.023e-03  -5.138 2.77e-07
## satfat_den                               1.320e-02  7.880e-03   1.675 0.093858
## sodium_den                               2.989e-04  6.315e-05   4.733 2.21e-06
## age                                     -1.108e-03  1.931e-03  -0.574 0.566017
## sexFemale                                2.537e-01  6.913e-02   3.669 0.000243
## raceOther Hispanic                      -3.643e-01  1.402e-01  -2.598 0.009372
## raceNon-Hispanic White                  -3.843e-01  1.068e-01  -3.597 0.000322
## raceNon-Hispanic Black                  -1.244e-01  1.133e-01  -1.098 0.272248
## raceNon-Hispanic Asian                  -1.889e+00  1.554e-01 -12.151  < 2e-16
## raceOther Race - Including Multi-Racial -2.162e-01  1.657e-01  -1.305 0.192042
## INDFMPIR                                 1.328e-02  2.109e-02   0.630 0.528974
## DR1TKCAL                                -9.031e-06  3.385e-05  -0.267 0.789610
##                                            
## (Intercept)                                
## sugar_den                                  
## fiber_den                               ***
## satfat_den                              .  
## sodium_den                              ***
## age                                        
## sexFemale                               ***
## raceOther Hispanic                      ** 
## raceNon-Hispanic White                  ***
## raceNon-Hispanic Black                     
## raceNon-Hispanic Asian                  ***
## raceOther Race - Including Multi-Racial    
## INDFMPIR                                   
## DR1TKCAL                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5643.9  on 4136  degrees of freedom
## Residual deviance: 5320.4  on 4123  degrees of freedom
##   (554 observations deleted due to missingness)
## AIC: 5348.4
## 
## Number of Fisher Scoring iterations: 4