library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
body.demo <- read_csv("/Volumes/Y.NI/Class/Fall' 21/DATA 306/Final/NHANES_v1(1).csv")
## Rows: 9756 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (72): seqn, sddsrvyr, ridstatr, riagendr, ridageyr, ridagemn, ridreth1, ...
## lgl (1): bmihead
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim (body.demo)
## [1] 9756 73
names(body.demo)
## [1] "seqn" "sddsrvyr" "ridstatr" "riagendr" "ridageyr" "ridagemn"
## [7] "ridreth1" "ridreth3" "ridexmon" "ridexagy" "ridexagm" "dmqmiliz"
## [13] "dmqadfc" "dmdborn4" "dmdcitzn" "dmdyrsus" "dmdeduc3" "dmdeduc2"
## [19] "dmdmartl" "ridexprg" "sialang" "siaproxy" "siaintrp" "fialang"
## [25] "fiaproxy" "fiaintrp" "mialang" "miaproxy" "miaintrp" "aialanga"
## [31] "wtint2yr" "wtmec2yr" "sdmvpsu" "sdmvstra" "indhhin2" "indfmin2"
## [37] "indfmpir" "dmdhhsiz" "dmdfmsiz" "dmdhhsza" "dmdhhszb" "dmdhhsze"
## [43] "dmdhrgnd" "dmdhrage" "dmdhrbr4" "dmdhredu" "dmdhrmar" "dmdhsedu"
## [49] "bmdstats" "bmxwt" "bmiwt" "bmxrecum" "bmirecum" "bmxhead"
## [55] "bmihead" "bmxht" "bmiht" "bmxbmi" "bmdbmic" "bmxleg"
## [61] "bmileg" "bmxarml" "bmiarml" "bmxarmc" "bmiarmc" "bmxwaist"
## [67] "bmiwaist" "bmxsad1" "bmxsad2" "bmxsad3" "bmxsad4" "bmdavsad"
## [73] "bmdsadcm"
#subset data
body.sub <- data.frame(body.demo$bmxbmi, body.demo$indhhin2, body.demo$ridreth1, body.demo$riagendr, body.demo$dmdeduc2)
names(body.sub)
## [1] "body.demo.bmxbmi" "body.demo.indhhin2" "body.demo.ridreth1"
## [4] "body.demo.riagendr" "body.demo.dmdeduc2"
summary(body.sub)
## body.demo.bmxbmi body.demo.indhhin2 body.demo.ridreth1 body.demo.riagendr
## Min. :12.40 Min. : 1.0 Min. :1.000 Min. :1.000
## 1st Qu.:19.30 1st Qu.: 5.0 1st Qu.:3.000 1st Qu.:1.000
## Median :24.50 Median : 7.0 Median :3.000 Median :2.000
## Mean :25.34 Mean :11.5 Mean :3.229 Mean :1.502
## 3rd Qu.:29.80 3rd Qu.:14.0 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :82.10 Max. :99.0 Max. :5.000 Max. :2.000
## NA's :1154 NA's :81
## body.demo.dmdeduc2
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.467
## 3rd Qu.:5.000
## Max. :9.000
## NA's :4196
dim(body.sub)
## [1] 9756 5
colnames(body.sub) <- c("bmi", "income", "race", "gender", "edu")
names(body.sub)
## [1] "bmi" "income" "race" "gender" "edu"
#omit missing data#
body.sub <- na.omit(body.sub)
summary(body.sub)
## bmi income race gender
## Min. :13.40 Min. : 1.00 Min. :1.000 Min. :1.000
## 1st Qu.:23.90 1st Qu.: 5.00 1st Qu.:3.000 1st Qu.:1.000
## Median :27.60 Median : 7.00 Median :3.000 Median :2.000
## Mean :28.79 Mean :11.48 Mean :3.306 Mean :1.507
## 3rd Qu.:32.20 3rd Qu.:14.00 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :82.10 Max. :99.00 Max. :5.000 Max. :2.000
## edu
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.484
## 3rd Qu.:5.000
## Max. :9.000
dim(body.sub)
## [1] 5197 5
attach(body.sub)
# checking race data (categorical variable)
summary (race)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.306 4.000 5.000
race <- factor(race)
class (race)
## [1] "factor"
levels(race) <- c("Mexican American","Hispanic", "White", "Black", "Other")
summary(race)
## Mexican American Hispanic White Black
## 504 534 1911 1362
## Other
## 886
# checking gender data (categorical variable)
summary (gender)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.507 2.000 2.000
table (gender)
## gender
## 1 2
## 2562 2635
gender <- factor(gender)
class (gender)
## [1] "factor"
levels(gender) <- c("male","female")
summary(gender)
## male female
## 2562 2635
# select data bmxbmi and gender
M1 <- lm(bmi ~ gender, data = body.sub)
# the relationship between the IV and the DV
# gender have 1.0858 impact on BMI.
summary(M1)
##
## Call:
## lm(formula = bmi ~ gender, data = body.sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.923 -4.737 -1.123 3.463 52.777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.1516 0.3028 89.659 < 2e-16 ***
## gender 1.0858 0.1907 5.693 1.32e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.874 on 5195 degrees of freedom
## Multiple R-squared: 0.0062, Adjusted R-squared: 0.006009
## F-statistic: 32.41 on 1 and 5195 DF, p-value: 1.316e-08
# testing the goodness-of-fit of Model 1
# the R-squared statistic is 0.006, not very close to 1, this is not a good model
anova(M1)
## Analysis of Variance Table
##
## Response: bmi
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 1532 1531.55 32.412 1.316e-08 ***
## Residuals 5195 245479 47.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Choose one IV and theorize how it may affect the DV.
# select data bmxbmi, Annual household income, and gender
M2 <- lm(bmi ~ income + gender, data = body.sub)
# the relationship between the IV and the DV
# for every one unit changes in household income, there is -0.027 changes in BMI
# for every one unit changes in gender, there is a 1.060 changes in BMI
summary(M2)
##
## Call:
## lm(formula = bmi ~ income + gender, data = body.sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.941 -4.745 -1.059 3.459 52.532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.501161 0.311727 88.222 < 2e-16 ***
## income -0.027094 0.005912 -4.583 4.69e-06 ***
## gender 1.060331 0.190442 5.568 2.71e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.861 on 5194 degrees of freedom
## Multiple R-squared: 0.0102, Adjusted R-squared: 0.009822
## F-statistic: 26.77 on 2 and 5194 DF, p-value: 2.711e-12
# testing the goodness-of-fit of M2
# the Multiple R-squared is 0.0102, and the Adjusted R-squared is 0.009, this is a better fit compare to M1
anova(M2)
## Analysis of Variance Table
##
## Response: bmi
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 1061 1061.06 22.541 2.112e-06 ***
## gender 1 1459 1459.21 31.000 2.710e-08 ***
## Residuals 5194 244491 47.07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 2 have adjusted R-squares of 0.009, much higher than model 2
summary(M2)$adj.r.squared
## [1] 0.009821966
# select data bmxbmi, Annual household income, race
M3 <- lm(bmi ~ income + edu + gender, data = body.sub)
# the relationship between the IV and the DV
# for every one unit changes in household income, there is -0.0255 changes in BMI
# for every one unit changes in education, there is -0.468 changes in BMI
# for every one unit changes in gender, there is a 1.106 changes in BMI
summary(M3)
##
## Call:
## lm(formula = bmi ~ income + edu + gender, data = body.sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.337 -4.757 -1.021 3.355 52.297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.046907 0.395849 73.379 < 2e-16 ***
## income -0.025571 0.005895 -4.338 1.47e-05 ***
## edu -0.468591 0.074405 -6.298 3.27e-10 ***
## gender 1.106258 0.189877 5.826 6.01e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.835 on 5193 degrees of freedom
## Multiple R-squared: 0.01771, Adjusted R-squared: 0.01714
## F-statistic: 31.2 on 3 and 5193 DF, p-value: < 2.2e-16
# testing the goodness-of-fit of M3
anova(M2, M3)
## Analysis of Variance Table
##
## Model 1: bmi ~ income + gender
## Model 2: bmi ~ income + edu + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5194 244491
## 2 5193 242637 1 1853.2 39.663 3.266e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the adjusted R-squares
# Model 3 have adjusted R-squares of 0.017, much higher than model 2, but this is still far away from 1
summary(M2)$adj.r.squared
## [1] 0.009821966
summary(M3)$adj.r.squared
## [1] 0.01713822
interaction between the two continuous variables - visualizes the interaction effect in M4 - compare the adjusted R-squares between M4 and M3
# select continuous variables, it's going to be household income, and education
M4 <- lm(bmi ~ income + edu + gender + income * edu, data = body.sub)
# the relationship between the IV and the DV
# for every one unit changes in household income, there is -0.012 changes in BMI
# for every one unit changes in education, there is -0.413 changes in BMI
# for every one unit changes in gender, there is a 1.099 changes in BMI
# for every one unit changes in income and education, there is a -0.004 changes in BMI
summary(M4)
##
## Call:
## lm(formula = bmi ~ income + edu + gender + income * edu, data = body.sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.275 -4.777 -0.997 3.353 52.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.889958 0.424106 68.120 < 2e-16 ***
## income -0.012372 0.014093 -0.878 0.380
## edu -0.413350 0.091688 -4.508 6.68e-06 ***
## gender 1.099369 0.189993 5.786 7.61e-09 ***
## income:edu -0.004328 0.004198 -1.031 0.303
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.835 on 5192 degrees of freedom
## Multiple R-squared: 0.01791, Adjusted R-squared: 0.01715
## F-statistic: 23.67 on 4 and 5192 DF, p-value: < 2.2e-16
# visualizes interaction
M4.dta <- data.frame(gender = c(rep("male", 1), rep("female", 2)))
print(M4.dta)
## gender
## 1 male
## 2 female
## 3 female
# compare the adjusted R-squared between Model 4 and Model 3.
# both Model 3 and model 4 have adjusted R-squares of 0.017, this goodness of the fit is similar between model 3 and model 4
summary(M3)$adj.r.squared
## [1] 0.01713822
summary(M4)$adj.r.squared
## [1] 0.01715016
# compare anova between M3 and M4
anova(M3,M4)
## Analysis of Variance Table
##
## Model 1: bmi ~ income + edu + gender
## Model 2: bmi ~ income + edu + gender + income * edu
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5193 242637
## 2 5192 242588 1 49.669 1.0631 0.3026
interaction between the categorical variable and one of the two continuous variables - visualizes the interaction effect in M5 - compare the adjusted R-squares between M5 and M3
# select continuous variables (income), and categorical variable (gender)
M5 <- lm(bmi ~ income + edu + gender + income * gender, data = body.sub)
# the relationship between the IV and the DV
# for every one unit changes in household income, there is 0.0263 changes in BMI
# for every one unit changes in education, there is -0.473 changes in BMI
# for every one unit changes in gender, there is a 1.511 changes in BMI
# for every one unit changes in income and gender, there is a -0.035 changes in BMI
summary(M5)
##
## Call:
## lm(formula = bmi ~ income + edu + gender + income * gender, data = body.sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.444 -4.762 -1.025 3.377 52.126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.46094 0.44141 64.478 < 2e-16 ***
## income 0.02631 0.01832 1.436 0.15097
## edu -0.47369 0.07437 -6.370 2.06e-10 ***
## gender 1.51110 0.23307 6.484 9.79e-11 ***
## income:gender -0.03528 0.01179 -2.991 0.00279 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.83 on 5192 degrees of freedom
## Multiple R-squared: 0.0194, Adjusted R-squared: 0.01864
## F-statistic: 25.67 on 4 and 5192 DF, p-value: < 2.2e-16
# visualizes interaction
M5.dta <- data.frame(gender = c(rep("male", 1), rep("female", 2)))
print(M5.dta)
## gender
## 1 male
## 2 female
## 3 female
# compare the adjusted R-squared between Model 5 and Model 3.
# Model 3 have the Adjusted R-squared of 0.017, and model 5 have the Adjusted R-squared of 0.018
summary(M3)$adj.r.squared
## [1] 0.01713822
summary(M5)$adj.r.squared
## [1] 0.01863972
# compare anova between M3 and M5
anova(M3,M5)
## Analysis of Variance Table
##
## Model 1: bmi ~ income + edu + gender
## Model 2: bmi ~ income + edu + gender + income * gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5193 242637
## 2 5192 242220 1 417.32 8.9454 0.002795 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# which one is the better fit model?
# The best fit model will be M5, follow by M3 and M4.
# Gender have the highest impact on a person's BMI
# both Education and household income have a negative relation with BMI
# the interaction of household income and education, and income and gender have very small impact on BMI