Data 306 final

library(tibble)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(haven)
library(ggplot2)
body.demo <- read_csv("/Volumes/Y.NI/Class/Fall' 21/DATA 306/Final/NHANES_v1(1).csv")

## Rows: 9756 Columns: 73

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (72): seqn, sddsrvyr, ridstatr, riagendr, ridageyr, ridagemn, ridreth1, ...
## lgl  (1): bmihead

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dim (body.demo)

## [1] 9756   73

names(body.demo)

##  [1] "seqn"     "sddsrvyr" "ridstatr" "riagendr" "ridageyr" "ridagemn"
##  [7] "ridreth1" "ridreth3" "ridexmon" "ridexagy" "ridexagm" "dmqmiliz"
## [13] "dmqadfc"  "dmdborn4" "dmdcitzn" "dmdyrsus" "dmdeduc3" "dmdeduc2"
## [19] "dmdmartl" "ridexprg" "sialang"  "siaproxy" "siaintrp" "fialang" 
## [25] "fiaproxy" "fiaintrp" "mialang"  "miaproxy" "miaintrp" "aialanga"
## [31] "wtint2yr" "wtmec2yr" "sdmvpsu"  "sdmvstra" "indhhin2" "indfmin2"
## [37] "indfmpir" "dmdhhsiz" "dmdfmsiz" "dmdhhsza" "dmdhhszb" "dmdhhsze"
## [43] "dmdhrgnd" "dmdhrage" "dmdhrbr4" "dmdhredu" "dmdhrmar" "dmdhsedu"
## [49] "bmdstats" "bmxwt"    "bmiwt"    "bmxrecum" "bmirecum" "bmxhead" 
## [55] "bmihead"  "bmxht"    "bmiht"    "bmxbmi"   "bmdbmic"  "bmxleg"  
## [61] "bmileg"   "bmxarml"  "bmiarml"  "bmxarmc"  "bmiarmc"  "bmxwaist"
## [67] "bmiwaist" "bmxsad1"  "bmxsad2"  "bmxsad3"  "bmxsad4"  "bmdavsad"
## [73] "bmdsadcm"

#subset data 
body.sub <- data.frame(body.demo$bmxbmi, body.demo$indhhin2, body.demo$ridreth1, body.demo$riagendr, body.demo$dmdeduc2)
names(body.sub)

## [1] "body.demo.bmxbmi"   "body.demo.indhhin2" "body.demo.ridreth1"
## [4] "body.demo.riagendr" "body.demo.dmdeduc2"

summary(body.sub)

##  body.demo.bmxbmi body.demo.indhhin2 body.demo.ridreth1 body.demo.riagendr
##  Min.   :12.40    Min.   : 1.0       Min.   :1.000      Min.   :1.000     
##  1st Qu.:19.30    1st Qu.: 5.0       1st Qu.:3.000      1st Qu.:1.000     
##  Median :24.50    Median : 7.0       Median :3.000      Median :2.000     
##  Mean   :25.34    Mean   :11.5       Mean   :3.229      Mean   :1.502     
##  3rd Qu.:29.80    3rd Qu.:14.0       3rd Qu.:4.000      3rd Qu.:2.000     
##  Max.   :82.10    Max.   :99.0       Max.   :5.000      Max.   :2.000     
##  NA's   :1154     NA's   :81                                              
##  body.demo.dmdeduc2
##  Min.   :1.000     
##  1st Qu.:3.000     
##  Median :4.000     
##  Mean   :3.467     
##  3rd Qu.:5.000     
##  Max.   :9.000     
##  NA's   :4196

dim(body.sub)

## [1] 9756    5

colnames(body.sub) <- c("bmi", "income", "race", "gender", "edu")
names(body.sub)

## [1] "bmi"    "income" "race"   "gender" "edu"

#omit missing data# 
body.sub <- na.omit(body.sub)
summary(body.sub)

##       bmi            income           race           gender     
##  Min.   :13.40   Min.   : 1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:23.90   1st Qu.: 5.00   1st Qu.:3.000   1st Qu.:1.000  
##  Median :27.60   Median : 7.00   Median :3.000   Median :2.000  
##  Mean   :28.79   Mean   :11.48   Mean   :3.306   Mean   :1.507  
##  3rd Qu.:32.20   3rd Qu.:14.00   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :82.10   Max.   :99.00   Max.   :5.000   Max.   :2.000  
##       edu       
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.484  
##  3rd Qu.:5.000  
##  Max.   :9.000

dim(body.sub)

## [1] 5197    5

attach(body.sub)

# checking race data (categorical variable)
summary (race)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.000   3.306   4.000   5.000

race <- factor(race)
class (race)

## [1] "factor"

levels(race) <- c("Mexican American","Hispanic", "White", "Black", "Other")
summary(race)

## Mexican American         Hispanic            White            Black 
##              504              534             1911             1362 
##            Other 
##              886

# checking gender data (categorical variable)
summary (gender)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.507   2.000   2.000

table (gender)

## gender
##    1    2 
## 2562 2635

gender <- factor(gender)
class (gender)

## [1] "factor"

levels(gender) <- c("male","female")
summary(gender)

##   male female 
##   2562   2635

Dependent Variable = bmxbmi

Problem 1

pick one categorical independent variable (gender)
Estimate a simple linear regression model using the categorical IV as the predictor
the relationship between the IV and the DV
goodness-of-fit of Model 1

# select data bmxbmi and gender 
M1 <- lm(bmi ~ gender, data = body.sub)

# the relationship between the IV and the DV
# gender have 1.0858 impact on BMI. 
summary(M1)

## 
## Call:
## lm(formula = bmi ~ gender, data = body.sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.923  -4.737  -1.123   3.463  52.777 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  27.1516     0.3028  89.659  < 2e-16 ***
## gender        1.0858     0.1907   5.693 1.32e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.874 on 5195 degrees of freedom
## Multiple R-squared:  0.0062, Adjusted R-squared:  0.006009 
## F-statistic: 32.41 on 1 and 5195 DF,  p-value: 1.316e-08

# testing the goodness-of-fit of Model 1 
# the R-squared statistic is 0.006, not very close to 1, this is not a good model 
anova(M1)

## Analysis of Variance Table
## 
## Response: bmi
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## gender       1   1532 1531.55  32.412 1.316e-08 ***
## Residuals 5195 245479   47.25                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 2

Choose one IV and theorize how it may affect the DV.

# select data bmxbmi, Annual household income, and gender
M2 <- lm(bmi ~ income + gender, data = body.sub)

# the relationship between the IV and the DV 
# for every one unit changes in household income, there is -0.027 changes in BMI
# for every one unit changes in gender, there is a 1.060 changes in BMI 
summary(M2)

## 
## Call:
## lm(formula = bmi ~ income + gender, data = body.sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.941  -4.745  -1.059   3.459  52.532 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.501161   0.311727  88.222  < 2e-16 ***
## income      -0.027094   0.005912  -4.583 4.69e-06 ***
## gender       1.060331   0.190442   5.568 2.71e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.861 on 5194 degrees of freedom
## Multiple R-squared:  0.0102, Adjusted R-squared:  0.009822 
## F-statistic: 26.77 on 2 and 5194 DF,  p-value: 2.711e-12

# testing the goodness-of-fit of M2
# the Multiple R-squared is 0.0102, and the Adjusted R-squared is 0.009, this is a better fit compare to M1  
anova(M2)

## Analysis of Variance Table
## 
## Response: bmi
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## income       1   1061 1061.06  22.541 2.112e-06 ***
## gender       1   1459 1459.21  31.000 2.710e-08 ***
## Residuals 5194 244491   47.07                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Model 2 have adjusted R-squares of 0.009, much higher than model 2 
summary(M2)$adj.r.squared

## [1] 0.009821966

Problem 3

Add one more continuous IV to M2 to create M3
Compare the adjusted R-squares between M3 and M2
ANOVA F-test

# select data bmxbmi, Annual household income, race
M3 <- lm(bmi ~ income + edu + gender, data = body.sub)

# the relationship between the IV and the DV 
# for every one unit changes in household income, there is -0.0255 changes in BMI
# for every one unit  changes in education, there is -0.468 changes in BMI
# for every one unit  changes in gender, there is a 1.106 changes in BMI 
summary(M3)

## 
## Call:
## lm(formula = bmi ~ income + edu + gender, data = body.sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.337  -4.757  -1.021   3.355  52.297 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.046907   0.395849  73.379  < 2e-16 ***
## income      -0.025571   0.005895  -4.338 1.47e-05 ***
## edu         -0.468591   0.074405  -6.298 3.27e-10 ***
## gender       1.106258   0.189877   5.826 6.01e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.835 on 5193 degrees of freedom
## Multiple R-squared:  0.01771,    Adjusted R-squared:  0.01714 
## F-statistic:  31.2 on 3 and 5193 DF,  p-value: < 2.2e-16

# testing the goodness-of-fit of M3
anova(M2, M3)

## Analysis of Variance Table
## 
## Model 1: bmi ~ income + gender
## Model 2: bmi ~ income + edu + gender
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   5194 244491                                  
## 2   5193 242637  1    1853.2 39.663 3.266e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Compare the adjusted R-squares
# Model 3 have adjusted R-squares of 0.017, much higher than model 2, but this is still far away from 1 
summary(M2)$adj.r.squared

## [1] 0.009821966

summary(M3)$adj.r.squared

## [1] 0.01713822

Problem 4

interaction between the two continuous variables - visualizes the interaction effect in M4 - compare the adjusted R-squares between M4 and M3

# select continuous variables, it's going to be household income, and education 
M4 <- lm(bmi ~ income + edu + gender + income * edu, data = body.sub)

# the relationship between the IV and the DV 
# for every one unit changes in household income, there is -0.012 changes in BMI
# for every one unit  changes in education, there is -0.413 changes in BMI
# for every one unit  changes in gender, there is a 1.099 changes in BMI 
# for every one unit  changes in income and education, there is a -0.004 changes in BMI 
summary(M4)

## 
## Call:
## lm(formula = bmi ~ income + edu + gender + income * edu, data = body.sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.275  -4.777  -0.997   3.353  52.302 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.889958   0.424106  68.120  < 2e-16 ***
## income      -0.012372   0.014093  -0.878    0.380    
## edu         -0.413350   0.091688  -4.508 6.68e-06 ***
## gender       1.099369   0.189993   5.786 7.61e-09 ***
## income:edu  -0.004328   0.004198  -1.031    0.303    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.835 on 5192 degrees of freedom
## Multiple R-squared:  0.01791,    Adjusted R-squared:  0.01715 
## F-statistic: 23.67 on 4 and 5192 DF,  p-value: < 2.2e-16

# visualizes interaction 
M4.dta <- data.frame(gender = c(rep("male", 1), rep("female", 2)))
print(M4.dta)

##   gender
## 1   male
## 2 female
## 3 female

# compare the adjusted R-squared between Model 4 and Model 3.
# both Model 3 and model 4 have adjusted R-squares of 0.017, this goodness of the fit is similar between model 3 and model 4
summary(M3)$adj.r.squared

## [1] 0.01713822

summary(M4)$adj.r.squared

## [1] 0.01715016

# compare anova between M3 and M4 
anova(M3,M4)

## Analysis of Variance Table
## 
## Model 1: bmi ~ income + edu + gender
## Model 2: bmi ~ income + edu + gender + income * edu
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   5193 242637                           
## 2   5192 242588  1    49.669 1.0631 0.3026

Problem 5

interaction between the categorical variable and one of the two continuous variables - visualizes the interaction effect in M5 - compare the adjusted R-squares between M5 and M3

# select continuous variables (income), and categorical variable (gender)
M5 <- lm(bmi ~ income + edu + gender + income * gender, data = body.sub)

# the relationship between the IV and the DV 
# for every one unit changes in household income, there is 0.0263 changes in BMI
# for every one unit  changes in education, there is -0.473 changes in BMI
# for every one unit  changes in gender, there is a 1.511 changes in BMI 
# for every one unit  changes in income and gender, there is a -0.035 changes in BMI 
summary(M5)

## 
## Call:
## lm(formula = bmi ~ income + edu + gender + income * gender, data = body.sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.444  -4.762  -1.025   3.377  52.126 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   28.46094    0.44141  64.478  < 2e-16 ***
## income         0.02631    0.01832   1.436  0.15097    
## edu           -0.47369    0.07437  -6.370 2.06e-10 ***
## gender         1.51110    0.23307   6.484 9.79e-11 ***
## income:gender -0.03528    0.01179  -2.991  0.00279 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.83 on 5192 degrees of freedom
## Multiple R-squared:  0.0194, Adjusted R-squared:  0.01864 
## F-statistic: 25.67 on 4 and 5192 DF,  p-value: < 2.2e-16

# visualizes interaction 
M5.dta <- data.frame(gender = c(rep("male", 1), rep("female", 2)))
print(M5.dta)

##   gender
## 1   male
## 2 female
## 3 female

# compare the adjusted R-squared between Model 5 and Model 3.
# Model 3 have the Adjusted R-squared of 0.017, and model 5 have the Adjusted R-squared of 0.018
summary(M3)$adj.r.squared

## [1] 0.01713822

summary(M5)$adj.r.squared

## [1] 0.01863972

# compare anova between M3 and M5
anova(M3,M5)

## Analysis of Variance Table
## 
## Model 1: bmi ~ income + edu + gender
## Model 2: bmi ~ income + edu + gender + income * gender
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1   5193 242637                                
## 2   5192 242220  1    417.32 8.9454 0.002795 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 6

which one is the better fit model?

# which one is the better fit model? 
# The best fit model will be M5, follow by M3 and M4. 
# Gender have the highest impact on a person's BMI 
# both Education and household income have a negative relation with BMI 
# the interaction of household income and education, and income and gender have very small impact on BMI