library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
body.demo <- read_csv("/Users/yunis/Desktop/NHANES_v1(1).csv")
## Rows: 9756 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (72): seqn, sddsrvyr, ridstatr, riagendr, ridageyr, ridagemn, ridreth1, ...
## lgl (1): bmihead
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names (body.demo)
## [1] "seqn" "sddsrvyr" "ridstatr" "riagendr" "ridageyr" "ridagemn"
## [7] "ridreth1" "ridreth3" "ridexmon" "ridexagy" "ridexagm" "dmqmiliz"
## [13] "dmqadfc" "dmdborn4" "dmdcitzn" "dmdyrsus" "dmdeduc3" "dmdeduc2"
## [19] "dmdmartl" "ridexprg" "sialang" "siaproxy" "siaintrp" "fialang"
## [25] "fiaproxy" "fiaintrp" "mialang" "miaproxy" "miaintrp" "aialanga"
## [31] "wtint2yr" "wtmec2yr" "sdmvpsu" "sdmvstra" "indhhin2" "indfmin2"
## [37] "indfmpir" "dmdhhsiz" "dmdfmsiz" "dmdhhsza" "dmdhhszb" "dmdhhsze"
## [43] "dmdhrgnd" "dmdhrage" "dmdhrbr4" "dmdhredu" "dmdhrmar" "dmdhsedu"
## [49] "bmdstats" "bmxwt" "bmiwt" "bmxrecum" "bmirecum" "bmxhead"
## [55] "bmihead" "bmxht" "bmiht" "bmxbmi" "bmdbmic" "bmxleg"
## [61] "bmileg" "bmxarml" "bmiarml" "bmxarmc" "bmiarmc" "bmxwaist"
## [67] "bmiwaist" "bmxsad1" "bmxsad2" "bmxsad3" "bmxsad4" "bmdavsad"
## [73] "bmdsadcm"
# Dependent Variable = bmxbmi
# independent variables#1 Education level - Adults 20+ (DMDEDUC2) #
# independent variables#2 Annual family income (INDFMIN2) #
body.demo.new <- data.frame(body.demo$bmxbmi, body.demo$dmdeduc2, body.demo$indfmin2)
names (body.demo.new)
## [1] "body.demo.bmxbmi" "body.demo.dmdeduc2" "body.demo.indfmin2"
dim (body.demo.new)
## [1] 9756 3
colnames(body.demo.new) <- c("bmi", "edu", "income")
body.demo.new <- na.omit(body.demo.new)
lm(bmi ~ edu, data=body.demo.new)
##
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
##
## Coefficients:
## (Intercept) edu
## 30.3870 -0.4584
lm(bmi ~ income, data=body.demo.new)
##
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
##
## Coefficients:
## (Intercept) income
## 29.03509 -0.02216
# remove missing value #
body.demo$dmdeduc2[body.demo$dmdeduc2==7] <- NA
body.demo$dmdeduc2[body.demo$dmdeduc2==9] <- NA
edu <- body.demo$dmdeduc2
edu <- na.omit(edu)
summary(edu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 3.462 5.000 5.000
# remove missing value #
body.demo$indfmin2[body.demo$indfmin2==77] <- NA
body.demo$indfmin2[body.demo$indfmin2==99] <- NA
income <- body.demo$indfmin2
income <- na.omit(income)
summary(income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 7.000 7.832 12.000 15.000
# Theorize why and how each of two the independent variables may affect the body mass index.
# in this exercise, I will use the education level for adults 20+ as independent variables and family income as independent variables; the dependent Variable will be BMI. In theory, education will affect the body mass index. This time, I want to see if a person's partner education and family income will have an effect on the body mass index #
# fitting the education level#
# Obtain and interpret the regression coefficient estimates #
# the multiple R-squared is less than 1 at 0.00722, and it is less 1%, which means education level can't explain the bmi value #
fit.edu <- lm(bmi ~ edu, data=body.demo.new)
summary(fit.edu)
##
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.629 -4.829 -1.195 3.347 53.088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.38697 0.27595 110.116 < 2e-16 ***
## edu -0.45845 0.07444 -6.158 7.91e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.866 on 5215 degrees of freedom
## Multiple R-squared: 0.00722, Adjusted R-squared: 0.007029
## F-statistic: 37.92 on 1 and 5215 DF, p-value: 7.906e-10
nobs(fit.edu) # 5217
## [1] 5217
anova(fit.edu)
## Analysis of Variance Table
##
## Response: bmi
## Df Sum Sq Mean Sq F value Pr(>F)
## edu 1 1788 1787.78 37.924 7.906e-10 ***
## Residuals 5215 245841 47.14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# finding the coefficient for the education level and the body mass index#
# for one level increase in education, a person's bmi will decrease by -0.45845 #
coef(fit.edu)
## (Intercept) edu
## 30.3869697 -0.4584479
summary(fit.edu)
##
## Call:
## lm(formula = bmi ~ edu, data = body.demo.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.629 -4.829 -1.195 3.347 53.088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.38697 0.27595 110.116 < 2e-16 ***
## edu -0.45845 0.07444 -6.158 7.91e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.866 on 5215 degrees of freedom
## Multiple R-squared: 0.00722, Adjusted R-squared: 0.007029
## F-statistic: 37.92 on 1 and 5215 DF, p-value: 7.906e-10
# the starting value is 1 (Less than 9th grade), and max is 5 (College graduate or above), both 7 (refuse) and 9 (don't know) are removed #
confint(fit.edu, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 29.8459843 30.9279551
## edu -0.6043904 -0.3125053
# predicted values, this is a vector of 5217 element #
nobs(fit.edu)
## [1] 5217
bmi.edu <- fitted(fit.edu)
# this is a skewed distribution #
hist(bmi.edu)

summary(bmi.edu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.26 28.09 28.55 28.79 29.01 29.93
# in this predicted model adult for those with
# adult with no education have a predicted bmi value of 30.38697,
# adult with with less than 9th grade of education have a predicted bmi value of 29.92852,
# adult with with 9-11th grade education have a predicted bmi value of 29.47007,
# adult with with high school graduate/GED or equivalent have a predicted bmi value of 29.01163,
# adult with with some college or AA degree have a predicted bmi value of 28.55318,
# adult with with college graduate or above have a predicted bmi value of 28.09473
bmi.edu.plot <- predict(fit.edu, data.frame(edu=seq(0,5)))
bmi.edu.plot
## 1 2 3 4 5 6
## 30.38697 29.92852 29.47007 29.01163 28.55318 28.09473
# predicted values and the 95% confidence bands and the 95% prediction bands#
# confidence bands for the predicted mean values of dependent variable (bmi), at 95% confidence #
predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")
## fit lwr upr
## 1 30.38697 29.84598 30.92796
## 2 29.92852 29.52143 30.33561
## 3 29.47007 29.18480 29.75535
## 4 29.01163 28.81254 29.21071
## 5 28.55318 28.35196 28.75439
## 6 28.09473 27.80501 28.38445
# prediction bands for the predicted mean values of dependent variable (bmi), at 95% confidence #
predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")
## fit lwr upr
## 1 30.38697 16.91600 43.85794
## 2 29.92852 16.46226 43.39478
## 3 29.47007 16.00694 42.93320
## 4 29.01163 15.55005 42.47321
## 5 28.55318 15.09157 42.01479
## 6 28.09473 14.63151 41.55796
# Plot the confidence and prediction bands #
ci.lwr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")[,2]
ci.upr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="c")[,3]
pi.lwr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")[,2]
pi.upr <- predict(fit.edu, data.frame(edu=seq(0,5)), level=.95, interval="p")[,3]
edu.plot <- seq(0,5)
plot(edu.plot, bmi.edu.plot, ylim=range(pi.lwr,pi.upr),type="l", col="blue")
lines(edu.plot, ci.lwr, lty=6, col="pink")
lines(edu.plot, ci.upr, lty=6, col="pink")
lines(edu.plot, pi.lwr, lty=9, col="green")
lines(edu.plot, pi.upr, lty=9, col="green")

# fitting the annual family income #
# Obtain and interpret the regression coefficient estimates #
# the multiple R-squared is less than 1 at 0.002629, and it is less 1%, which means annual family income can't explain the bmi value #
fit.income <- lm(bmi ~ income, data=body.demo.new)
summary(fit.income)
##
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.369 -4.803 -1.125 3.397 53.109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.035091 0.115722 250.904 < 2e-16 ***
## income -0.022155 0.005976 -3.708 0.000211 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.882 on 5215 degrees of freedom
## Multiple R-squared: 0.002629, Adjusted R-squared: 0.002438
## F-statistic: 13.75 on 1 and 5215 DF, p-value: 0.0002115
nobs(fit.income) #5217
## [1] 5217
anova(fit.income)
## Analysis of Variance Table
##
## Response: bmi
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 651 650.98 13.746 0.0002115 ***
## Residuals 5215 246977 47.36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# finding the coefficient for the education level and the body mass index#
# for one level increase in annual family income, bmi will decrease by 2% #
coef(fit.income)
## (Intercept) income
## 29.03509149 -0.02215547
summary(fit.income)
##
## Call:
## lm(formula = bmi ~ income, data = body.demo.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.369 -4.803 -1.125 3.397 53.109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.035091 0.115722 250.904 < 2e-16 ***
## income -0.022155 0.005976 -3.708 0.000211 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.882 on 5215 degrees of freedom
## Multiple R-squared: 0.002629, Adjusted R-squared: 0.002438
## F-statistic: 13.75 on 1 and 5215 DF, p-value: 0.0002115
# the starting value is 1 ($ 0 to $ 4,999), and max is 15 ($100,000 and Over), both 71 (refuse) and 99 (don't know) are removed #
confint(fit.income, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 28.80822778 29.26195520
## income -0.03387062 -0.01044032
# predicted values, this is a vector of 5217 element #
nobs(fit.income)
## [1] 5217
bmi.income <- fitted(fit.income)
# this is a skewed distribution #
hist(bmi.income)

# in this predicted model adult for those with
bmi.income.plot <- predict(fit.income, data.frame(income=seq(0,15)))
bmi.income.plot
## 1 2 3 4 5 6 7 8
## 29.03509 29.01294 28.99078 28.96863 28.94647 28.92431 28.90216 28.88000
## 9 10 11 12 13 14 15 16
## 28.85785 28.83569 28.81354 28.79138 28.76923 28.74707 28.72491 28.70276
predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")
## fit lwr upr
## 1 29.03509 28.80823 29.26196
## 2 29.01294 28.79251 29.23336
## 3 28.99078 28.77635 29.20522
## 4 28.96863 28.75970 29.17756
## 5 28.94647 28.74252 29.15042
## 6 28.92431 28.72478 29.12385
## 7 28.90216 28.70644 29.09788
## 8 28.88000 28.68746 29.07255
## 9 28.85785 28.66781 29.04789
## 10 28.83569 28.64746 29.02393
## 11 28.81354 28.62639 29.00068
## 12 28.79138 28.60460 28.97817
## 13 28.76923 28.58207 28.95638
## 14 28.74707 28.55881 28.93533
## 15 28.72491 28.53483 28.91500
## 16 28.70276 28.51016 28.89536
predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")
## fit lwr upr
## 1 29.03509 15.54199 42.52819
## 2 29.01294 15.51994 42.50593
## 3 28.99078 15.49788 42.48368
## 4 28.96863 15.47582 42.46143
## 5 28.94647 15.45374 42.43920
## 6 28.92431 15.43165 42.41698
## 7 28.90216 15.40955 42.39477
## 8 28.88000 15.38744 42.37257
## 9 28.85785 15.36532 42.35038
## 10 28.83569 15.34319 42.32820
## 11 28.81354 15.32105 42.30603
## 12 28.79138 15.29890 42.28387
## 13 28.76923 15.27674 42.26172
## 14 28.74707 15.25457 42.23958
## 15 28.72491 15.23238 42.21745
## 16 28.70276 15.21019 42.19533
ci.lwr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")[,2]
ci.upr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="c")[,3]
pi.lwr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")[,2]
pi.upr <- predict(fit.income, data.frame(income=seq(0,15)), level=.95, interval="p")[,3]
income.plot <- seq(0,15)
plot(income.plot, bmi.income.plot, ylim=range(pi.lwr,pi.upr),type="l", col="orange")
lines(income.plot, ci.lwr, lty=6, col="#009999")
lines(income.plot, ci.upr, lty=6, col="#009999")
lines(income.plot, pi.lwr, lty=9, col="grey")
lines(income.plot, pi.upr, lty=9, col="grey")

# 2 random sets #
n.sub <- round(nrow(body.demo.new)/2)
set.seed(2021)
M <- 100
MSE <- vector()
for(i in 1:M){
ix.train <- sample(1:nrow(body.demo.new), n.sub, replace=F) # indexing the random set #
hrs.train <- body.demo.new[ix.train,] # setting sequence #
ix.test <- which(!(1:nrow(body.demo.new)%in%ix.train))
hrs.test <- body.demo.new[ix.test,]
head(hrs.test)
fit <- lm(bmi ~ edu, data=hrs.train)
summary(fit)
yhat <- predict(fit, hrs.test)
MSE[i] <- mean((hrs.test$bmi-yhat)^2)}
hist(MSE)

summary(MSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 43.76 45.70 46.97 46.93 48.10 49.91
MSE.02 <- vector()
for(i in 1:M){
ix.train <- sample(1:nrow(body.demo.new), n.sub, replace=F) # indexing the random set #
hrs.train <- body.demo.new[ix.train,] # setting sequence #
ix.test <- which(!(1:nrow(body.demo.new)%in%ix.train))
hrs.test <- body.demo.new[ix.test,]
head(hrs.test)
fit.02 <- lm(bmi ~ income, data=hrs.train)
summary(fit.02)
yhat.02 <- predict(fit, hrs.test)
MSE.02[i] <- mean((hrs.test$bmi-yhat.02)^2)}
hist(MSE.02)

summary(MSE.02)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 43.42 45.98 47.14 47.00 47.89 50.04